Mercurial > projects > dil
annotate trunk/src/Lexer.d @ 11:dffcdaa7c47a
- Added Unicode line and paragraph separators.
- Checking for LS and PS in // comment scanner.
author | aziz |
---|---|
date | Sat, 23 Jun 2007 08:54:00 +0000 |
parents | 3ee65d6e39c9 |
children | 0989206cf73c |
rev | line source |
---|---|
0 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL2 | |
4 +/ | |
5 module Lexer; | |
3 | 6 import Token; |
2 | 7 import std.stdio; |
4 | 8 import std.utf; |
9 import std.uni; | |
0 | 10 |
/// ASCII character properties table.
///
/// Indexed by byte value; each entry is a bitwise OR of CProperty flags:
///   23 = Octal | Digit | Hex | Identifier   ('0'-'7')
///   22 = Digit | Hex | Identifier           ('8', '9')
///   28 = Hex | Alpha | Identifier           ('A'-'F', 'a'-'f')
///   24 = Alpha | Identifier                 ('G'-'Z', 'g'-'z')
///   16 = Identifier                         ('_')
///    0 = no property (everything else, including all bytes >= 0x80)
static const int[256] ptable = [
  // 0x00 - 0x1F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x3F: digits start at 0x30
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,23,23,23,23,23,23,23,23,22,22, 0, 0, 0, 0, 0, 0,
  // 0x40 - 0x5F: '@', 'A'-'Z', punctuation, '_'
  0,28,28,28,28,28,28,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24, 0, 0, 0, 0,16,
  // 0x60 - 0x7F: '`', 'a'-'z', punctuation
  0,28,28,28,28,28,28,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24, 0, 0, 0, 0, 0,
  // 0x80 - 0xFF: non-ASCII bytes carry no property
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
];
0 | 22 |
/// Bit flags for the lexical classes a character can belong to.
/// The entries of ptable are bitwise ORs of these values.
enum CProperty
{
  Octal      = 1<<0, /// Tested by isoctal().
  Digit      = 1<<1, /// Tested by isdigit() and isalnum().
  Hex        = 1<<2, /// Tested by ishexad().
  Alpha      = 1<<3, /// Tested by isalpha() and isalnum().
  Identifier = 1<<4  /// Tested by isident().
}
31 | |
/// Returns a non-zero value if ptable marks c with CProperty.Octal ('0'-'7'), else 0.
int isoctal(char c)
{
  int properties = ptable[c];
  return properties & CProperty.Octal;
}
/// Returns a non-zero value if ptable marks c with CProperty.Digit ('0'-'9'), else 0.
int isdigit(char c)
{
  int properties = ptable[c];
  return properties & CProperty.Digit;
}
/// Returns a non-zero value if ptable marks c with CProperty.Hex
/// ('0'-'9', 'a'-'f', 'A'-'F'), else 0.
int ishexad(char c)
{
  int properties = ptable[c];
  return properties & CProperty.Hex;
}
/// Returns a non-zero value if ptable marks c with CProperty.Alpha
/// ('a'-'z', 'A'-'Z'), else 0.
int isalpha(char c)
{
  int properties = ptable[c];
  return properties & CProperty.Alpha;
}
/// Returns a non-zero value if ptable marks c with CProperty.Alpha or
/// CProperty.Digit (an ASCII letter or decimal digit), else 0.
int isalnum(char c)
{
  int letterOrDigit = CProperty.Alpha | CProperty.Digit;
  return ptable[c] & letterOrDigit;
}
/// Returns a non-zero value if ptable marks c with CProperty.Identifier
/// (an ASCII letter, a decimal digit or '_'), else 0.
int isident(char c)
{
  int properties = ptable[c];
  return properties & CProperty.Identifier;
}
2 | 38 /+ |
0 | 39 static this() |
40 { | |
2 | 41 // Initialize character properties table. |
0 | 42 for (int i; i < ptable.length; ++i) |
43 { | |
44 if ('0' <= i && i <= '7') | |
45 ptable[i] |= CProperty.Octal; | |
46 if ('0' <= i && i <= '9') | |
47 ptable[i] |= CProperty.Digit; | |
48 if (isdigit(i) || 'a' <= i && i <= 'f' || 'A' <= i && i <= 'F') | |
49 ptable[i] |= CProperty.Hex; | |
50 if ('a' <= i && i <= 'z' || 'A' <= i && i <= 'Z') | |
51 ptable[i] |= CProperty.Alpha; | |
52 if (isalnum(i) || i == '_') | |
1 | 53 ptable[i] |= CProperty.Identifier; |
0 | 54 } |
2 | 55 // Print a formatted array literal. |
56 char[] array = "[\n"; | |
57 for (int i; i < ptable.length; ++i) | |
58 { | |
59 char c = ptable[i]; | |
60 array ~= std.string.format("%2d,", c, ((i+1) % 32) ? "":"\n"); | |
61 } | |
62 array.length = array.length - 2; // remove ",\n" | |
63 array ~= "\n]"; | |
64 writefln(array); | |
0 | 65 } |
2 | 66 +/ |
0 | 67 |
/// UTF-8 encoding (3 bytes: E2 80 A8) of U+2028, the Unicode line separator.
const char[3] LS = "\u2028";
/// UTF-8 encoding (3 bytes: E2 80 A9) of U+2029, the Unicode paragraph separator.
const char[3] PS = "\u2029";
70 | |
/// A rudimentary lexer over a zero-terminated copy of the source text.
///
/// Recognized tokens: identifiers, the three comment forms (nested, block
/// and line comments), string literals and character literals. Any other
/// character (whitespace, digits, operators, ...) is skipped silently.
class Lexer
{
  Token token; /// Token filled in by the most recent nextToken() call.
  char[] text; /// Private copy of the source text with a trailing 0 byte.
  char* p;     /// Current scan position inside text.
  char* end;   /// One past the trailing 0 byte of text.

  /// Prepares the lexer to scan text from its first character.
  this(char[] text)
  {
    // Concatenation always allocates a new array, so the terminator can
    // never be written into the caller's buffer (growing .length may
    // extend a slice in place).
    this.text = text ~ '\0';

    this.p = this.text.ptr;
    this.end = this.p + this.text.length;
  }

  /// Scans the next token starting at p and stores it in t.
  ///
  /// t.start and t.end delimit the token's characters (end is exclusive).
  /// At the end of the text a TOK.EOF token covering the trailing 0 byte is
  /// produced; p is not advanced past it, so further calls yield EOF again.
  ///
  /// Throws: Error on an unterminated comment, string or character literal.
  public void scan(out Token t)
  {
    assert(p < end);

    uint c = *p;

    while (1)
    {
      t.start = p;

      if (c == 0)
      {
        t.type = TOK.EOF;
        t.end = p+1;
        return;
      }

      // An identifier starts with a letter or '_' and continues with
      // letters, digits or '_'.
      if (isident(c) && !isdigit(c))
      {
        ++p;
        while (isident(*p))
          ++p;
        t.type = TOK.Identifier;
        t.end = p;
        return;
      }

      if (c == '/')
      {
        // Only peek at the next character. c is not 0 here, so p[1] is at
        // worst the trailing 0 byte. A '/' that does not open a comment
        // must be skipped as a single character further down; consuming
        // it here would corrupt the start of the following token.
        char next = p[1];
        if (next == '+' || next == '*' || next == '/')
        {
          if (next == '+')
            skipNestedComment();
          else if (next == '*')
            skipBlockComment();
          else
            skipLineComment();
          t.type = TOK.Comment;
          t.end = p;
          return;
        }
      }

      if (c == '"')
      {
        skipQuoted('"', "unterminated string literal.");
        t.type = TOK.String;
        t.end = p;
        return;
      }

      if (c == '\'')
      {
        skipQuoted('\'', "unterminated character literal.");
        t.type = TOK.Character;
        t.end = p;
        return;
      }

      // Not the start of any recognized token: skip this character.
      c = *++p;
    }
  }

  // Skips a nesting comment. On entry p is on the '/' of the opening
  // delimiter; on return p is one past the '/' of the matching closing
  // delimiter.
  private void skipNestedComment()
  {
    uint level = 1;
    ++p; // Now on the '+' of the opening delimiter.
    while (level)
    {
      ++p;
      if (*p == 0)
        throw new Error("unterminated /+ +/ comment.");
      if (*p == '/' && p[1] == '+')
      {
        ++p;
        ++level;
      }
      else if (*p == '+' && p[1] == '/')
      {
        ++p;
        --level;
      }
    }
    // p is on the final '/', so a single step leaves the comment.
    ++p;
  }

  // Skips a non-nesting block comment. On entry p is on the opening '/';
  // on return p is one past the closing '/'.
  private void skipBlockComment()
  {
    ++p; // Now on the '*' of the opening delimiter.
    while (1)
    {
      ++p;
      if (*p == 0)
        throw new Error("unterminated /* */ comment.");
      if (*p == '*' && p[1] == '/')
        break;
    }
    p += 2; // Step over the closing '*' and '/'.
  }

  // Skips a line comment. On entry p is on the first '/'; on return p is
  // on the terminator ('\n', the trailing 0 byte, or the first byte of an
  // LS/PS sequence), which is not part of the comment.
  private void skipLineComment()
  {
    ++p; // Now on the second '/'.
    while (1)
    {
      ++p;
      if (*p == '\n' || *p == 0)
        break;
      // LS and PS share their first two bytes and differ only in the third.
      if (*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
        break;
    }
  }

  // Skips a literal enclosed in quote characters. On entry p is on the
  // opening quote; on return p is one past the closing quote. A backslash
  // escapes the character that follows it. Throws Error(message) if the
  // text ends before the closing quote.
  private void skipQuoted(char quote, char[] message)
  {
    while (1)
    {
      ++p;
      if (*p == 0)
        throw new Error(message);
      if (*p == quote)
        break;
      if (*p == '\\')
      {
        ++p; // Skip the escaped character ...
        if (*p == 0) // ... but never the trailing 0 byte.
          throw new Error(message);
      }
    }
    ++p; // Step over the closing quote.
  }

  /// Scans the next token into this.token and returns its type.
  public TOK nextToken()
  {
    scan(this.token);
    return this.token.type;
  }

  /// Scans the remaining text and returns all tokens, ending with the
  /// TOK.EOF token.
  Token[] getTokens()
  {
    Token[] tokens;
    while (1)
    {
      TOK type = nextToken();
      tokens ~= this.token;
      if (type == TOK.EOF)
        break;
    }
    return tokens;
  }
}