Mercurial > projects > dil
annotate trunk/src/Lexer.d @ 15:c70c028e47dd
- Started implementation of lexing numbers.
author | aziz |
---|---|
date | Sat, 23 Jun 2007 14:11:01 +0000 |
parents | cdf788d8bdaf |
children | 476e8e55c1d4 |
rev | line source |
---|---|
0 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL2 | |
4 +/ | |
5 module Lexer; | |
3 | 6 import Token; |
2 | 7 import std.stdio; |
4 | 8 import std.utf; |
9 import std.uni; | |
0 | 10 |
/// ASCII character properties table.
///
/// Indexed by character code; each entry is a bitwise OR of CProperty flags.
/// The literal is the output of the generator guarded by version(gen_ptable).
static const int[256] ptable = [
  // 0x00 - 0x2F: no properties.
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x30: '0'..'7' = Octal|Digit|Hex (7), '8'..'9' = Digit|Hex (6).
   7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0,
  // 0x40: '@', 'A'..'F' = Hex|Alpha (12), 'G'..'O' = Alpha (8).
   0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  // 0x50: 'P'..'Z' = Alpha (8), '_' = Underscore (16).
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0,16,
  // 0x60: '`', 'a'..'f' = Hex|Alpha (12), 'g'..'o' = Alpha (8).
   0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  // 0x70: 'p'..'z' = Alpha (8).
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0,
  // 0x80 - 0xFF: no properties.
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
];
0 | 30 |
/// Bit flags stored in ptable; one bit per character class.
enum CProperty
{
  Octal      = 1<<0, /// Set by the table generator for '0'..'7'.
  Digit      = 1<<1, /// Set by the table generator for '0'..'9'.
  Hex        = 1<<2, /// Set for decimal digits, 'a'..'f' and 'A'..'F'.
  Alpha      = 1<<3, /// Set for 'a'..'z' and 'A'..'Z'.
  Underscore = 1<<4, /// Set for '_' only.
}
39 | |
private alias CProperty CP;

/// Returns non-zero if ptable flags c as an octal digit.
int isoctal(char c)
{
  int props = ptable[c];
  return props & CP.Octal;
}

/// Returns non-zero if ptable flags c as a decimal digit.
int isdigit(char c)
{
  int props = ptable[c];
  return props & CP.Digit;
}

/// Returns non-zero if ptable flags c as a hexadecimal digit.
int ishexad(char c)
{
  int props = ptable[c];
  return props & CP.Hex;
}

/// Returns non-zero if ptable flags c as an ASCII letter.
int isalpha(char c)
{
  int props = ptable[c];
  return props & CP.Alpha;
}

/// Returns non-zero if ptable flags c as an ASCII letter or a decimal digit.
int isalnum(char c)
{
  int mask = CP.Alpha | CP.Digit;
  return ptable[c] & mask;
}

/// Returns non-zero if c may begin an identifier (ASCII letter or underscore).
int isidbeg(char c)
{
  int mask = CP.Alpha | CP.Underscore;
  return ptable[c] & mask;
}

/// Returns non-zero if c may continue an identifier
/// (ASCII letter, underscore or decimal digit).
int isident(char c)
{
  int mask = CP.Alpha | CP.Underscore | CP.Digit;
  return ptable[c] & mask;
}
48 | |
/// Table generator: compiled only with -version=gen_ptable.
/// Recomputes every ptable entry and prints the table as an array literal.
version(gen_ptable)
static this()
{
  // Pass 1: compute the property flags of each character code.
  for (int ch; ch < ptable.length; ++ch)
  {
    int props = 0;

    if ('0' <= ch && ch <= '7')
      props |= CP.Octal;
    if ('0' <= ch && ch <= '9')
      props |= CP.Digit;
    // Every decimal digit is also a hexadecimal digit.
    if ((props & CP.Digit) || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F')
      props |= CP.Hex;
    if ('a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z')
      props |= CP.Alpha;
    if (ch == '_')
      props |= CP.Underscore;

    ptable[ch] = props;
  }

  // Pass 2: render the table, 16 entries per line.
  char[] literal = "[\n";
  for (int ch; ch < ptable.length; ++ch)
  {
    int value = ptable[ch];
    char[] lineBreak = ((ch+1) % 16) ? "":"\n";
    array_append:
    literal ~= std.string.format("%2d,", value, lineBreak);
  }
  // The text ends with ",\n"; overwrite that with the closing bracket.
  literal[$-2..$] = "\n]";
  writefln(literal);
}
77 | |
/// UTF-8 encoding of U+2028 (LINE SEPARATOR).
const char[3] LS = "\u2028";
/// UTF-8 encoding of U+2029 (PARAGRAPH SEPARATOR).
const char[3] PS = "\u2029";
80 | |
/++
  Scanner that splits source text into tokens.

  The text is terminated with a zero byte which acts as a sentinel, so the
  scanning loops can test for end of input with a plain comparison against 0.
  Characters for which no token is implemented yet are skipped silently.
+/
class Lexer
{
  Token token; /// The token most recently produced by nextToken().
  char[] text; /// Zero-terminated copy of the source text.
  char* p;     /// Current scan position inside text.
  char* end;   /// Points one past the last element of text (past the sentinel).

  /++
    Prepares the lexer for scanning text.

    Params:
      text = the source text; it is copied, the caller's array is not modified.
  +/
  this(char[] text)
  {
    // Concatenation always allocates a new array. Growing .length on the
    // caller's slice could extend it in place and overwrite whatever follows
    // the slice in the caller's buffer with the sentinel.
    this.text = text ~ '\0';

    this.p = this.text.ptr;
    this.end = this.p + this.text.length;
  }

  /++
    Scans the next token starting at p and stores it in t.

    On return t.start and t.end delimit the token text and p equals t.end,
    except for TOK.EOF where p stays on the terminating zero and t.end is p+1.

    Throws: Error on an unterminated comment, string or character literal.
            decodeUTF() may additionally throw on malformed UTF-8.
  +/
  public void scan(out Token t)
  {
    assert(p < end);

    char c = *p;

    while (1)
    {
      t.start = p;

      if (c == 0)
      {
        t.type = TOK.EOF;
        t.end = p+1;
        return;
      }

      // An identifier starts with an ASCII letter, '_' or a Unicode letter.
      bool startsIdentifier = isidbeg(c) != 0;
      if (!startsIdentifier && (c & 128))
      {
        // decodeUTF() leaves p on the last byte of the decoded sequence.
        if (!isUniAlpha(decodeUTF()))
        {
          // Non-alphabetic multi-byte character: nothing to lex, skip it.
          c = *++p;
          continue;
        }
        startsIdentifier = true;
      }

      if (startsIdentifier)
      {
        scanIdentifierRest();
        t.type = TOK.Identifier;
        t.end = p;
        return;
      }

      if (isdigit(c))
        return scanNumber(t);

      if (c == '/')
      {
        c = *++p;
        switch (c)
        {
        case '=':
          ++p;
          t.type = TOK.DivisionAssign;
          t.end = p;
          return;
        case '+':
          // Nesting comment: track the depth of /+ ... +/ pairs.
          uint level = 1;
          do
          {
            c = *++p;
            if (c == 0)
              throw new Error("unterminated /+ +/ comment.");
            else if (c == '/' && p[1] == '+')
            {
              ++p;
              ++level;
            }
            else if (c == '+' && p[1] == '/')
            {
              ++p;
              --level;
            }
          } while (level != 0);
          // p is on the closing '/'; step exactly one character past it.
          // (Advancing by two would swallow the character following the
          // comment, or run past the sentinel at the end of the text.)
          ++p;
          t.type = TOK.Comment;
          t.end = p;
          return;
        case '*':
          do
          {
            c = *++p;
            if (c == 0)
              throw new Error("unterminated /* */ comment.");
          } while (c != '*' || p[1] != '/');
          // p is on the '*' of the closing "*/".
          p += 2;
          t.type = TOK.Comment;
          t.end = p;
          return;
        case '/':
          // Line comment: ends before '\n', LS, PS or the end of the text.
          do
          {
            c = *++p;
            // LS and PS share their first two UTF-8 bytes.
            if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
              break;
          } while (c != '\n' && c != 0);
          t.type = TOK.Comment;
          t.end = p;
          return;
        default:
          // A lone '/' has no token here yet. Without a default branch a D1
          // switch raises SwitchError, so skip the '/' like any other
          // unhandled character and rescan from the character after it.
          continue;
        }
      }

      if (c == '"')
        return scanQuoted(t, '"', TOK.String, "unterminated string literal.");

      if (c == '\'')
        return scanQuoted(t, '\'', TOK.Character, "unterminated character literal.");

      // Unhandled character: skip it.
      c = *++p;
    }
  }

  /// Advances p past the remaining characters of an identifier.
  /// On entry p is on the last byte of the identifier's first character;
  /// on return p is on the first byte that does not belong to the identifier.
  private void scanIdentifierRest()
  {
    while (1)
    {
      char c = *++p;
      if (isident(c))
        continue;
      if (c & 128)
      {
        // decodeUTF() consumes the sequence; undo that if the character
        // turns out not to be a letter, so the token ends before it.
        char* sequenceStart = p;
        if (isUniAlpha(decodeUTF()))
          continue;
        p = sequenceStart;
      }
      return;
    }
  }

  /// Scans a literal enclosed in quote characters; p is on the opening quote.
  /// A backslash makes the following character part of the literal.
  /// Throws: Error(errorMsg) if the text ends before the closing quote.
  private void scanQuoted(ref Token t, char quote, TOK type, char[] errorMsg)
  {
    char c;
    do
    {
      c = *++p;
      if (c == 0)
        throw new Error(errorMsg);
      if (c == '\\')
      {
        // Do not step over the sentinel when the backslash is the last
        // character of the text.
        if (p[1] == 0)
          throw new Error(errorMsg);
        ++p;
      }
    } while (c != quote);
    ++p;
    t.type = type;
    t.end = p;
  }

  /// Scans a run of decimal digits; p is on the first digit.
  void scanNumber(ref Token t)
  {
    while (isdigit(*++p)) {}
    t.type = TOK.Number;
    t.end = p;
  }

  /++
    Decodes the multi-byte UTF-8 sequence starting at p.

    Advances p to the last byte of the sequence (not past it).

    Returns: the decoded code point.
  +/
  uint decodeUTF()
  {
    assert(*p & 128);
    size_t idx;
    uint d = std.utf.decode(p[0 .. end-p], idx);
    // idx is the length of the sequence; stay on its last byte.
    p += idx - 1;
    return d;
  }

  /// Scans the next token into this.token and returns its type.
  public TOK nextToken()
  {
    scan(this.token);
    return this.token.type;
  }

  /// Scans the rest of the text and returns all tokens,
  /// including the final TOK.EOF token.
  Token[] getTokens()
  {
    Token[] tokens;
    while (nextToken() != TOK.EOF)
      tokens ~= this.token;
    tokens ~= this.token; // Append the EOF token as well.
    return tokens;
  }
}