Mercurial > projects > dil
annotate trunk/src/Lexer.d @ 17:9bd0bac79479
- Removed Whitespace from enum list.
- Added code to count current number of lines.
author | aziz |
---|---|
date | Sat, 23 Jun 2007 20:12:03 +0000 |
parents | 476e8e55c1d4 |
children | c48d2125f1e2 |
rev | line source |
---|---|
0 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL2 | |
4 +/ | |
5 module Lexer; | |
3 | 6 import Token; |
2 | 7 import std.stdio; |
4 | 8 import std.utf; |
9 import std.uni; | |
0 | 10 |
/++
  ASCII character properties table.

  The entry at index c is the set of CProperty bits that apply to the
  character with code c. One row below covers 16 consecutive character codes.
+/
static const int[256] ptable = [
   0, 0, 0, 0, 0, 0, 0, 0, 0,32, 0,32,32, 0, 0, 0, // 0x00: '\t', '\v' and '\f' are whitespace
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10
  32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20: ' ' is whitespace
   7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0, // 0x30: '0'-'7' octal|digit|hex, '8'-'9' digit|hex
   0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 0x40: 'A'-'F' hex|alpha, 'G'-'O' alpha
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0,16, // 0x50: 'P'-'Z' alpha, '_' underscore
   0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 0x60: 'a'-'f' hex|alpha, 'g'-'o' alpha
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, // 0x70: 'p'-'z' alpha
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 0xF0
];
0 | 30 |
/// Bit flags for the lexical classes an ASCII character can belong to.
/// The entries of ptable are combinations of these flags.
enum CProperty
{
  Octal      = 1<<0, /// '0' to '7'
  Digit      = 1<<1, /// '0' to '9'
  Hex        = 1<<2, /// '0' to '9', 'a' to 'f' and 'A' to 'F'
  Alpha      = 1<<3, /// 'a' to 'z' and 'A' to 'Z'
  Underscore = 1<<4, /// '_'
  Whitespace = 1<<5  /// ' ', '\t', '\v' and '\f'
}
40 | |
private alias CProperty CP;

// Character classification helpers. Each one looks the character up in ptable
// and returns the matching CProperty bits: non-zero if the character belongs
// to the class, 0 otherwise. The result is a bit mask, not necessarily 1.

/// Non-zero if c is an octal digit.
int isoctal(char c)
{
  return ptable[c] & CP.Octal;
}

/// Non-zero if c is a decimal digit.
int isdigit(char c)
{
  return ptable[c] & CP.Digit;
}

/// Non-zero if c is a hexadecimal digit.
int ishexad(char c)
{
  return ptable[c] & CP.Hex;
}

/// Non-zero if c is an ASCII letter.
int isalpha(char c)
{
  return ptable[c] & CP.Alpha;
}

/// Non-zero if c is an ASCII letter or a decimal digit.
int isalnum(char c)
{
  return ptable[c] & (CP.Alpha | CP.Digit);
}

/// Non-zero if c may begin an identifier: an ASCII letter or '_'.
int isidbeg(char c)
{
  return ptable[c] & (CP.Alpha | CP.Underscore);
}

/// Non-zero if c may continue an identifier: an ASCII letter, '_' or a decimal digit.
int isident(char c)
{
  return ptable[c] & (CP.Alpha | CP.Underscore | CP.Digit);
}

/// Non-zero if c is a space, tab, vertical tab or form feed.
int isspace(char c)
{
  return ptable[c] & CP.Whitespace;
}
13 | 50 |
/++
  Table generator, compiled only with version=gen_ptable.

  Recomputes every entry of ptable from the character ranges below and then
  prints the table as an array literal, 16 entries per row.
+/
version(gen_ptable)
static this()
{
  // Initialize character properties table.
  for (int ch; ch < ptable.length; ++ch)
  {
    int props = 0;
    if ('0' <= ch && ch <= '7')
      props |= CP.Octal;
    if ('0' <= ch && ch <= '9')
      props |= CP.Digit;
    // Every decimal digit is also a hexadecimal digit.
    if ((props & CP.Digit) || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F')
      props |= CP.Hex;
    if ('a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z')
      props |= CP.Alpha;
    if (ch == '_')
      props |= CP.Underscore;
    if (ch == ' ' || ch == '\t' || ch == '\v'|| ch == '\f')
      props |= CP.Whitespace;
    ptable[ch] = props;
  }

  // Print a formatted array literal.
  char[] literal = "[\n";
  for (int idx; idx < ptable.length; ++idx)
  {
    // A line break follows every 16th entry.
    char[] rowEnd = ((idx+1) % 16) ? "" : "\n";
    literal ~= std.string.format("%2d,", ptable[idx], rowEnd);
  }
  // The text ends with ",\n" here; overwrite that with the closing bracket.
  literal[$-2..$] = "\n]";
  writefln(literal);
}
81 | |
/// UTF-8 encoding of U+2028 (line separator); three code units.
const char[3] LS = "\u2028";
/// UTF-8 encoding of U+2029 (paragraph separator); three code units.
const char[3] PS = "\u2029";
84 | |
/++
  Splits source text into tokens.

  The lexer works on its own copy of the text, terminated by a 0 character.
  All scanning loops rely on that terminator to detect the end of the input,
  so they never have to compare p against end.
+/
class Lexer
{
  Token token; /// The token produced by the most recent call to nextToken().
  char[] text; /// Copy of the source text, followed by a terminating 0.
  char* p;     /// Current scanning position inside text.
  char* end;   /// One past the terminating 0 of text.

  uint loc = 1; /// line of code

  /// Creates a lexer for text. The caller's array is left untouched.
  this(char[] text)
  {
    // Concatenation yields a new array. Growing the caller's slice through
    // .length instead may extend it in place and write the terminator into
    // memory that still belongs to the caller.
    this.text = text ~ '\0';

    this.p = this.text.ptr;
    this.end = this.p + this.text.length;
  }

  /++
    Scans the next token into t.

    Sets t.start, t.end and t.type. Newlines update loc and are skipped, and so
    is every character no token is implemented for yet (including whitespace).
    At the terminating 0 an EOF token is produced and p moves to end; scan()
    must not be called again after that.

    Throws: Error for an unterminated comment, string or character literal.
            decodeUTF() may throw on malformed UTF-8.
  +/
  public void scan(out Token t)
  {
    assert(p < end);

    uint c = *p;

    while (1)
    {
      t.start = p;

      if (c == 0)
      {
        ++p;
        t.type = TOK.EOF;
        t.end = p;
        return;
      }

      if (c == '\n' || c == '\r')
      {
        countNewline();
        c = *++p;
        continue;
      }

      if (isidbeg(c))
        return scanIdentifier(t);

      if (isdigit(c))
        return scanNumber(t);

      if (c == '/')
      {
        c = *++p;
        switch (c)
        {
        case '=':
          ++p;
          t.type = TOK.DivisionAssign;
          t.end = p;
          return;
        case '+':
          return scanNestedComment(t);
        case '*':
          return scanBlockComment(t);
        case '/':
          return scanLineComment(t);
        default:
          // A lone '/' has no token here. Skip it like any other unrecognized
          // character and rescan from the character after it. Without this
          // case a non-matching switch is a runtime error.
          continue;
        }
      }

      if (c == '"')
        return scanQuoted(t, '"', TOK.String, "unterminated string literal.");

      if (c == '\'')
        return scanQuoted(t, '\'', TOK.Character, "unterminated character literal.");

      // A non-ASCII alphabetic character starts an identifier.
      // decodeUTF() leaves p on the last code unit of that character.
      if (c & 128 && isUniAlpha(decodeUTF()))
        return scanIdentifier(t);

      c = *++p;
    }
  }

  /// Increments loc if the character at p ends a line. In a "\r\n" pair only
  /// the '\n' is counted. p must not point at the terminating 0.
  private void countNewline()
  {
    if (*p == '\n' || (*p == '\r' && p[1] != '\n'))
      ++loc;
  }

  /// Scans the remainder of an identifier. On entry p points at the last code
  /// unit of the identifier's first character and t.start is already set.
  private void scanIdentifier(ref Token t)
  {
    while (1)
    {
      uint c = *++p;
      if (isident(c))
        continue;
      if (c & 128)
      {
        char* lead = p;
        if (isUniAlpha(decodeUTF()))
          continue;
        // Not part of the identifier. decodeUTF() has moved p into the
        // sequence; go back so the token ends in front of the whole character.
        p = lead;
      }
      break;
    }
    t.type = TOK.Identifier;
    t.end = p;
  }

  /// Scans a nesting /+ +/ comment. On entry p points at the '+' of the
  /// opening "/+".
  private void scanNestedComment(ref Token t)
  {
    uint level = 1;
    while (1)
    {
      uint c = *++p;
      if (c == 0)
        throw new Error("unterminated /+ +/ comment.");
      if (c == '/' && p[1] == '+')
      {
        ++p;
        ++level;
      }
      else if (c == '+' && p[1] == '/')
      {
        ++p; // p is now on the '/' of "+/".
        if (--level == 0)
          break;
      }
      else
        countNewline();
    }
    ++p; // Step past the closing '/' only; p is already on it.
    t.type = TOK.Comment;
    t.end = p;
  }

  /// Scans a /* */ comment. On entry p points at the '*' of the opening "/*".
  private void scanBlockComment(ref Token t)
  {
    while (1)
    {
      uint c = *++p;
      if (c == 0)
        throw new Error("unterminated /* */ comment.");
      if (c == '*' && p[1] == '/')
        break;
      countNewline();
    }
    p += 2; // p is on the '*' of "*/"; step past both characters.
    t.type = TOK.Comment;
    t.end = p;
  }

  /// Scans a // comment. On entry p points at the second '/'. The comment ends
  /// in front of '\n', the terminating 0, or an encoded U+2028 / U+2029.
  private void scanLineComment(ref Token t)
  {
    while (1)
    {
      uint c = *++p;
      if (c == '\n' || c == 0)
        break;
      // LS and PS differ only in their third code unit.
      if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
        break;
    }
    t.type = TOK.Comment;
    t.end = p;
  }

  /++
    Scans a literal enclosed in quote characters (rudimentary: the contents
    are not validated). On entry p points at the opening quote.

    Params:
      t     = receives the token.
      quote = the delimiter, '"' or '\''.
      type  = the token type to assign.
      msg   = message of the Error thrown if the input ends inside the literal.
  +/
  private void scanQuoted(ref Token t, char quote, TOK type, char[] msg)
  {
    while (1)
    {
      uint c = *++p;
      if (c == 0)
        throw new Error(msg);
      if (c == quote)
        break;
      if (c == '\\')
      {
        ++p; // Skip the escaped character so an escaped quote doesn't end the literal.
        // A backslash right before the terminator must not carry p past it.
        if (*p == 0)
          throw new Error(msg);
      }
      countNewline();
    }
    ++p; // Step past the closing quote.
    t.type = type;
    t.end = p;
  }

  /// Scans a run of decimal digits. On entry p points at the first digit.
  void scanNumber(ref Token t)
  {
    while (isdigit(*++p)) {}
    t.type = TOK.Number;
    t.end = p;
  }

  /++
    Decodes the UTF-8 sequence starting at p.

    On return p points at the last code unit of the sequence, so that the
    usual "c = *++p" moves on to the following character.

    Returns: the decoded code point.
  +/
  uint decodeUTF()
  {
    assert(*p & 128);
    size_t idx;
    uint d;
    d = std.utf.decode(p[0 .. end-p], idx);
    p += idx -1;
    return d;
  }

  /// Scans the next token into this.token and returns its type.
  public TOK nextToken()
  {
    scan(this.token);
    return this.token.type;
  }

  /// Scans the rest of the text and returns all tokens, the EOF token last.
  Token[] getTokens()
  {
    Token[] tokens;
    while (nextToken() != TOK.EOF)
      tokens ~= this.token;
    tokens ~= this.token; // Append the EOF token as well.
    return tokens;
  }
}