Mercurial > projects > dil
annotate trunk/src/Lexer.d @ 12:0989206cf73c
- Added code to decode Unicode characters in identifiers.
author | aziz |
---|---|
date | Sat, 23 Jun 2007 10:02:00 +0000 |
parents | dffcdaa7c47a |
children | e5211758b63c |
rev | line source |
---|---|
0 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL2 | |
4 +/ | |
5 module Lexer; | |
3 | 6 import Token; |
2 | 7 import std.stdio; |
4 | 8 import std.utf; |
9 import std.uni; | |
0 | 10 |
/// Character properties table, indexed by byte value.
/// Each entry is a combination of CProperty flags; every entry that does not
/// belong to an ASCII digit, an ASCII letter or '_' is 0.
static const int[256] ptable = [
  // 0x00 - 0x1F
   0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x3F: '0'..'7' = 23, '8'..'9' = 22
   0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 23,23,23,23,23,23,23,23, 22,22, 0, 0, 0, 0, 0, 0,
  // 0x40 - 0x5F: 'A'..'F' = 28, 'G'..'Z' = 24, '_' = 16
   0,28,28,28,28,28,28,24, 24,24,24,24,24,24,24,24, 24,24,24,24,24,24,24,24, 24,24,24, 0, 0, 0, 0,16,
  // 0x60 - 0x7F: 'a'..'f' = 28, 'g'..'z' = 24
   0,28,28,28,28,28,28,24, 24,24,24,24,24,24,24,24, 24,24,24,24,24,24,24,24, 24,24,24, 0, 0, 0, 0, 0,
  // 0x80 - 0x9F
   0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xA0 - 0xBF
   0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xC0 - 0xDF
   0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
  // 0xE0 - 0xFF
   0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
];
0 | 22 |
/// Bit flags stored in the entries of ptable.
enum CProperty
{
  Octal      = 1 << 0, /// Set for '0'..'7'.
  Digit      = 1 << 1, /// Set for '0'..'9'.
  Hex        = 1 << 2, /// Set for '0'..'9', 'a'..'f' and 'A'..'F'.
  Alpha      = 1 << 3, /// Set for 'a'..'z' and 'A'..'Z'.
  Identifier = 1 << 4  /// Set for letters, digits and '_'.
}
31 | |
/// Tests the Octal flag of c's entry in ptable.
/// Returns: CProperty.Octal if the flag is set, otherwise 0.
int isoctal(char c)
{
  return CProperty.Octal & ptable[c];
}
/// Tests the Digit flag of c's entry in ptable.
/// Returns: CProperty.Digit if the flag is set, otherwise 0.
int isdigit(char c)
{
  return CProperty.Digit & ptable[c];
}
/// Tests the Hex flag of c's entry in ptable.
/// Returns: CProperty.Hex if the flag is set, otherwise 0.
int ishexad(char c)
{
  return CProperty.Hex & ptable[c];
}
/// Tests the Alpha flag of c's entry in ptable.
/// Returns: CProperty.Alpha if the flag is set, otherwise 0.
int isalpha(char c)
{
  return CProperty.Alpha & ptable[c];
}
/// Tests the Alpha and Digit flags of c's entry in ptable.
/// Returns: the subset of (CProperty.Alpha | CProperty.Digit) that is set,
///          which is 0 when c is neither a letter nor a digit.
int isalnum(char c)
{
  const int mask = CProperty.Alpha | CProperty.Digit;
  return mask & ptable[c];
}
/// Tests the Identifier flag of c's entry in ptable.
/// Returns: CProperty.Identifier if the flag is set, otherwise 0.
int isident(char c)
{
  return CProperty.Identifier & ptable[c];
}
/+
// Disabled generator: computes the character properties for all 256 byte
// values and prints them as an array literal.
static this()
{
  // Initialize character properties table.
  for (int i; i < ptable.length; ++i)
  {
    if ('0' <= i && i <= '7')
      ptable[i] |= CProperty.Octal;
    if ('0' <= i && i <= '9')
      ptable[i] |= CProperty.Digit;
    if (isdigit(i) || 'a' <= i && i <= 'f' || 'A' <= i && i <= 'F')
      ptable[i] |= CProperty.Hex;
    if ('a' <= i && i <= 'z' || 'A' <= i && i <= 'Z')
      ptable[i] |= CProperty.Alpha;
    if (isalnum(i) || i == '_')
      ptable[i] |= CProperty.Identifier;
  }
  // Print a formatted array literal.
  char[] array = "[\n";
  for (int i; i < ptable.length; ++i)
  {
    char c = ptable[i];
    array ~= std.string.format("%2d,", c, ((i+1) % 32) ? "":"\n");
  }
  array.length = array.length - 2; // remove ",\n"
  array ~= "\n]";
  writefln(array);
}
+/
0 | 67 |
/// UTF-8 encoding of U+2028 (LINE SEPARATOR): E2 80 A8.
const char[3] LS = "\u2028";
/// UTF-8 encoding of U+2029 (PARAGRAPH SEPARATOR): E2 80 A9.
const char[3] PS = "\u2029";
70 | |
/++
  Splits source text into tokens.

  The lexer works on its own zero-terminated copy of the text; the
  terminating zero is the end-of-input sentinel that every scanning loop
  checks for.

  Recognized so far: identifiers (ASCII and Unicode letters), the comment
  forms "/+ +/" (nesting), "/* */" and "//", and string and character
  literals. Every other character is skipped.
+/
class Lexer
{
  Token token; /// Token filled in by the most recent call to nextToken().
  char[] text; /// Zero-terminated copy of the source text.
  char* p;     /// Current scan position inside text.
  char* end;   /// Points one past the terminating zero of text.

  /++
    Prepares the lexer for scanning text.

    Params:
      text = the source text; it is copied, the caller's array is not modified.
  +/
  this(char[] text)
  {
    // Concatenation always allocates a new array. Growing the caller's slice
    // through .length could extend it in place and overwrite data that the
    // caller still owns.
    this.text = text ~ '\0';

    this.p = this.text.ptr;
    this.end = this.p + this.text.length;
  }

  /++
    Scans the next token, starting at the current position.

    Characters that do not start a recognized token are skipped. At the end
    of the text a TOK.EOF token is produced; the position is not advanced
    past the sentinel, so further calls keep returning TOK.EOF.

    Params:
      t = receives the type, start and end (exclusive) of the token.

    Throws:
      Error if a comment or a literal is not terminated before the end of
      the text. NOTE(review): std.utf.decode presumably throws on malformed
      UTF-8 as well - confirm against the Phobos version in use.
  +/
  public void scan(out Token t)
  {
    assert(p < end);

    uint c = *p;

    while (1)
    {
      t.start = p;

      if (c == 0)
      {
        t.type = TOK.EOF;
        t.end = p+1;
        return;
      }

      // Identifier starting with an ASCII letter or '_'.
      if (isident(c) && !isdigit(c))
      {
        skipIdentifierTail();
        t.type = TOK.Identifier;
        t.end = p;
        return;
      }

      if (c == '/')
      {
        c = *++p;
        if (c == '+')
        {
          skipNestedComment();
          t.type = TOK.Comment;
          t.end = p;
          return;
        }
        else if (c == '*')
        {
          skipBlockComment();
          t.type = TOK.Comment;
          t.end = p;
          return;
        }
        else if (c == '/')
        {
          skipLineComment();
          t.type = TOK.Comment;
          t.end = p;
          return;
        }
        // A lone '/' is not a token yet: drop it like any other unrecognized
        // character and re-examine the character that follows it. Falling
        // through here would skip that character too, and would run past the
        // buffer when it is the terminating zero.
        continue;
      }

      if (c == '"')
      {
        skipQuoted('"', "unterminated string literal.");
        t.type = TOK.String;
        t.end = p;
        return;
      }

      if (c == '\'')
      {
        skipQuoted('\'', "unterminated character literal.");
        t.type = TOK.Character;
        t.end = p;
        return;
      }

      // Identifier starting with a Unicode letter. decodeUTF() leaves p on
      // the last byte of the decoded character in either case, so a
      // non-letter is skipped as a whole by the increment below.
      if (c & 128 && isUniAlpha(decodeUTF()))
      {
        skipIdentifierTail();
        t.type = TOK.Identifier;
        t.end = p;
        return;
      }

      c = *++p;
    }
  }

  /// Advances p over the rest of an identifier.
  /// On entry p is on the last byte of the identifier's first character;
  /// on exit p is on the first byte that does not belong to the identifier.
  private void skipIdentifierTail()
  {
    while (1)
    {
      uint c = *++p;
      if (isident(c))
        continue;
      if (c & 128)
      {
        char* mark = p;
        if (isUniAlpha(decodeUTF()))
          continue;
        // Not a letter: undo the advance made by decodeUTF() so that the
        // identifier does not end in the middle of the next character.
        p = mark;
      }
      break;
    }
  }

  /// Advances p over a nesting comment.
  /// On entry p is on the '+' of the opening "/+"; on exit p is one past the
  /// '/' of the matching "+/".
  private void skipNestedComment()
  {
    uint level = 1;
    while (1)
    {
      uint c = *++p;
      if (c == 0)
        throw new Error("unterminated /+ +/ comment.");
      else if (c == '/' && p[1] == '+')
      {
        ++p;
        ++level;
      }
      else if (c == '+' && p[1] == '/')
      {
        ++p;
        if (--level == 0)
          break;
      }
    }
    // p is already on the closing '/', so a single step leaves the comment.
    ++p;
  }

  /// Advances p over a block comment.
  /// On entry p is on the '*' of the opening "/*"; on exit p is one past the
  /// '/' of the closing "*/".
  private void skipBlockComment()
  {
    uint c;
    do
    {
      c = *++p;
      if (c == 0)
        throw new Error("unterminated /* */ comment.");
    } while (c != '*' || p[1] != '/');
    // p is on the '*' of "*/".
    p += 2;
  }

  /// Advances p to the end of a line comment.
  /// On entry p is on the second '/' of "//"; on exit p is on the '\n', on
  /// the first byte of a LS/PS sequence, or on the terminating zero.
  private void skipLineComment()
  {
    uint c;
    do
    {
      c = *++p;
      // LS and PS differ only in their third byte. The short-circuit
      // evaluation never reads beyond the terminating zero.
      if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
        break;
    } while (c != '\n' && c != 0);
  }

  /// Advances p over a literal delimited by quote; a backslash hides the
  /// character that follows it.
  /// On entry p is on the opening quote; on exit p is one past the closing one.
  /// Throws: Error(unterminatedMsg) if the text ends inside the literal.
  private void skipQuoted(char quote, char[] unterminatedMsg)
  {
    uint c;
    do
    {
      c = *++p;
      if (c == 0)
        throw new Error(unterminatedMsg);
      // Step over the escaped character, but never over the sentinel.
      if (c == '\\' && *++p == 0)
        throw new Error(unterminatedMsg);
    } while (c != quote);
    ++p;
  }

  /++
    Decodes the UTF-8 sequence that starts at p.

    On return p is on the last byte of the decoded sequence.

    Returns: the decoded code point.
  +/
  uint decodeUTF()
  {
    assert(*p & 128);
    size_t idx;
    uint d;
    d = std.utf.decode(p[0 .. end-p], idx);
    // idx is the number of bytes consumed; stay on the last one of them.
    p += idx -1;
    return d;
  }

  /// Scans the next token into this.token.
  /// Returns: the type of that token.
  public TOK nextToken()
  {
    scan(this.token);
    return this.token.type;
  }

  /// Scans the rest of the text.
  /// Returns: all tokens in order; the last element is the TOK.EOF token.
  Token[] getTokens()
  {
    Token[] tokens;
    do
    {
      nextToken();
      tokens ~= this.token;
    } while (this.token.type != TOK.EOF);
    return tokens;
  }
}