Mercurial > projects > dil
annotate trunk/src/Lexer.d @ 14:cdf788d8bdaf
- Parsing /= now.
author | aziz |
---|---|
date | Sat, 23 Jun 2007 13:14:05 +0000 |
parents | e5211758b63c |
children | c70c028e47dd |
rev | line source |
---|---|
0 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL2 | |
4 +/ | |
5 module Lexer; | |
3 | 6 import Token; |
2 | 7 import std.stdio; |
4 | 8 import std.utf; |
9 import std.uni; | |
0 | 10 |
/// Character properties table, indexed by byte value.
/// Each entry is a bitwise OR of CProperty flags; entries 128..255 are 0.
static const int[256] ptable = [
  // 0x00 - 0x0F
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10 - 0x1F
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2F
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x30 - 0x3F: '0'..'7' = Octal|Digit|Hex, '8'..'9' = Digit|Hex
   7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0,
  // 0x40 - 0x4F: 'A'..'F' = Hex|Alpha, 'G'..'O' = Alpha
   0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  // 0x50 - 0x5F: 'P'..'Z' = Alpha, '_' = Underscore
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0,16,
  // 0x60 - 0x6F: 'a'..'f' = Hex|Alpha, 'g'..'o' = Alpha
   0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  // 0x70 - 0x7F: 'p'..'z' = Alpha
   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0,
  // 0x80 - 0x8F
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x90 - 0x9F
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xA0 - 0xAF
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xB0 - 0xBF
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xC0 - 0xCF
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xD0 - 0xDF
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xE0 - 0xEF
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0xF0 - 0xFF
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
];
0 | 30 |
/// Bit flags stored in ptable; one bit per character class.
enum CProperty
{
  Octal      = 0x01, /// Bit 0.
  Digit      = 0x02, /// Bit 1.
  Hex        = 0x04, /// Bit 2.
  Alpha      = 0x08, /// Bit 3.
  Underscore = 0x10, /// Bit 4.
}
39 | |
private alias CProperty CP;

// Each predicate looks c up in ptable and masks the entry with the relevant
// CProperty bits. The result is the masked value itself: non-zero when at
// least one of the tested bits is set for c, 0 otherwise.

/// Tests the Octal bit of c.
int isoctal(char c)
{
  return ptable[c] & CP.Octal;
}

/// Tests the Digit bit of c.
int isdigit(char c)
{
  return ptable[c] & CP.Digit;
}

/// Tests the Hex bit of c.
int ishexad(char c)
{
  return ptable[c] & CP.Hex;
}

/// Tests the Alpha bit of c.
int isalpha(char c)
{
  return ptable[c] & CP.Alpha;
}

/// Tests whether c has the Alpha or the Digit bit.
int isalnum(char c)
{
  return ptable[c] & (CP.Alpha | CP.Digit);
}

/// Tests whether c may begin an identifier (Alpha or Underscore bit).
int isidbeg(char c)
{
  return ptable[c] & (CP.Alpha | CP.Underscore);
}

/// Tests whether c may continue an identifier (Alpha, Underscore or Digit bit).
int isident(char c)
{
  return ptable[c] & (CP.Alpha | CP.Underscore | CP.Digit);
}
48 | |
// Table generator: only compiled with -version=gen_ptable. It recomputes
// every ptable entry and prints the table as an array literal, 16 values per
// line, so that the output can be pasted back into the source.
version(gen_ptable)
static this()
{
  // Initialize character properties table.
  for (int i; i < ptable.length; ++i)
  {
    bool digit = '0' <= i && i <= '9';
    int props = 0;
    if ('0' <= i && i <= '7')
      props |= CP.Octal;
    if (digit)
      props |= CP.Digit;
    if (digit || 'a' <= i && i <= 'f' || 'A' <= i && i <= 'F')
      props |= CP.Hex;
    if ('a' <= i && i <= 'z' || 'A' <= i && i <= 'Z')
      props |= CP.Alpha;
    if (i == '_')
      props |= CP.Underscore;
    ptable[i] = props;
  }
  // Print a formatted array literal.
  char[] array = "[\n";
  for (int i; i < ptable.length; ++i)
  {
    // One specifier per argument; the row break is appended explicitly
    // instead of being passed as a surplus format argument.
    array ~= std.string.format("%2d,", ptable[i]);
    if ((i+1) % 16 == 0)
      array ~= "\n";
  }
  // The text ends in ",\n"; overwrite that with the closing bracket.
  array[$-2..$] = "\n]";
  // Pass the text as an argument so it is never interpreted as a format string.
  writefln("%s", array);
}
77 | |
/// UTF-8 encoding (3 bytes) of U+2028.
const char[3] LS = "\u2028";
/// UTF-8 encoding (3 bytes) of U+2029.
const char[3] PS = "\u2029";
80 | |
/++
  Rudimentary lexer. It recognizes identifiers (ASCII and Unicode alphabetic
  characters), the "/=" operator, the three comment forms, and string and
  character literals. Every other character is skipped without producing a
  token.
+/
class Lexer
{
  Token token; /// Token filled in by the most recent nextToken() call.
  char[] text; /// Private copy of the source text, followed by one 0 byte.
  char* p;     /// Current scan position inside text.
  char* end;   /// One past the last byte of text (i.e. past the 0 byte).

  /++
    Prepares the lexer for scanning text.
    Params:
      text = the source text; it is copied, never modified.
  +/
  this(char[] text)
  {
    // Concatenation always allocates a new array. Growing .length instead
    // could resize in place and write the terminator into memory that
    // follows the caller's slice.
    this.text = text ~ '\0';

    this.p = this.text.ptr;
    this.end = this.p + this.text.length;
  }

  /++
    Scans the next token, starting at p.
    Params:
      t = receives the type and the start/end pointers of the token. For
          TOK.EOF, start points at the 0 terminator and end one past it.
    Throws:
      Error if a comment or a string/character literal is unterminated.
      std.utf.decode is called on non-ASCII bytes and reports malformed
      UTF-8 in its own way.
  +/
  public void scan(out Token t)
  {
    assert(p < end);

    uint c = *p;

    while (1)
    {
      t.start = p;

      if (c == 0)
      {
        // p is deliberately left on the terminator, so further calls keep
        // returning EOF.
        t.type = TOK.EOF;
        t.end = p+1;
        return;
      }

      if (isidbeg(c))
      {
      Lidentifier:
        while (1)
        {
          c = *++p;
          if (isident(c))
            continue;
          if (c & 128)
          {
            // decodeUTF() moves p to the last byte of the sequence. Undo
            // that when the character does not belong to the identifier,
            // otherwise the token would end inside that character.
            char* mark = p;
            if (isUniAlpha(decodeUTF()))
              continue;
            p = mark;
          }
          break;
        }
        t.type = TOK.Identifier;
        t.end = p;
        return;
      }

      if (c == '/')
      {
        c = *++p;
        switch (c)
        {
        case '=':
          ++p;
          t.type = TOK.DivisionAssign;
          t.end = p;
          return;
        case '+':
        {
          // Nesting comment: level counts the currently open "/+".
          uint level = 1;
          do
          {
            c = *++p;
            if (c == 0)
              throw new Error("unterminated /+ +/ comment.");
            else if (c == '/' && p[1] == '+')
            {
              ++p;
              ++level;
            }
            else if (c == '+' && p[1] == '/')
            {
              ++p;
              if (--level == 0)
                break;
            }
          } while (1);
          // p is on the closing '/'; a single step moves past the comment.
          ++p;
          t.type = TOK.Comment;
          t.end = p;
          return;
        }
        case '*':
          do
          {
            c = *++p;
            if (c == 0)
              throw new Error("unterminated /* */ comment.");
          } while (c != '*' || p[1] != '/');
          // p is on the '*' of the closing "*/".
          p += 2;
          t.type = TOK.Comment;
          t.end = p;
          return;
        case '/':
          // Line comment: ends before '\n', the 0 terminator, or a
          // U+2028 / U+2029 sequence (they differ only in the third byte).
          do
          {
            c = *++p;
            if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
              break;
          } while (c != '\n' && c != 0);
          t.type = TOK.Comment;
          t.end = p;
          return;
        default:
          // Without a default case, a '/' followed by any other character
          // raises a SwitchError at run time. Skip the '/' like every other
          // unrecognized character and rescan from the current one.
          // NOTE(review): emit a division token here once TOK has one.
          continue;
        }
      }

      if (c == '"')
      {
        skipQuoted('"', "unterminated string literal.");
        t.type = TOK.String;
        t.end = p;
        return;
      }

      if (c == '\'')
      {
        skipQuoted('\'', "unterminated character literal.");
        t.type = TOK.Character;
        t.end = p;
        return;
      }

      if (c & 128 && isUniAlpha(decodeUTF()))
        goto Lidentifier;
      // Unrecognized character: skip it.
      c = *++p;
    }
  }

  /++
    Advances p from an opening quote to just past the matching closing
    quote. A backslash makes the following byte part of the literal.
    Params:
      delim = the quote character that terminates the literal.
      msg   = message of the Error thrown when the 0 terminator is reached.
  +/
  private void skipQuoted(char delim, char[] msg)
  {
    uint c;
    do
    {
      c = *++p;
      if (c == 0)
        throw new Error(msg);
      if (c == '\\')
      {
        // A backslash right before the terminator must not skip over it;
        // scanning would continue past the end of text.
        if (p[1] == 0)
          throw new Error(msg);
        ++p;
      }
    } while (c != delim);
    ++p;
  }

  /++
    Decodes the UTF-8 sequence starting at p.
    On return p points at the last byte of that sequence (not past it).
    Returns: the decoded code point.
  +/
  uint decodeUTF()
  {
    assert(*p & 128);
    size_t idx;
    uint d;
    d = std.utf.decode(p[0 .. end-p], idx);
    p += idx -1;
    return d;
  }

  /// Scans the next token into this.token and returns its type.
  public TOK nextToken()
  {
    scan(this.token);
    return this.token.type;
  }

  /// Scans all remaining tokens. The returned array ends with the EOF token.
  Token[] getTokens()
  {
    Token[] tokens;
    while (nextToken() != TOK.EOF)
      tokens ~= this.token;
    tokens ~= this.token;
    return tokens;
  }
}