Mercurial > projects > dang
comparison src/lexer/Lexer.d @ 206:d3c148ca429b
Major moving of files. all src now goes into src, all docs in docs.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Tue, 12 Aug 2008 18:14:56 +0200 |
parents | |
children | e0551773a005 |
comparison
equal
deleted
inserted
replaced
205:8387cbaa85ab | 206:d3c148ca429b |
---|---|
1 module lexer.Lexer; | |
2 | |
3 import basic.Message, | |
4 basic.SourceManager; | |
5 | |
6 import lexer.Token, | |
7 lexer.Keyword; | |
8 | |
9 import tango.io.Stdout; | |
10 | |
/**
  The Lexer class will supply you with methods to tokenize a D file. Supply the
  Lexer with a start SourceLocation and the SourceManager that owns it, and you
  can 'peek' and 'next' Tokens from the file.

  For more info about Tokens, look up the lexer.Token module.
*/
class Lexer
{
public:

    /**
      Create a new Lexer.

      params:
        start    = location of the first character to lex. The raw text is
                   fetched with src_mgr.getRawData(start) and every token
                   location is computed as start + offset into that text.
        src_mgr  = source manager that supplies the raw text.
        messages = handler that receives the diagnostics reported while lexing.
    */
    this(SourceLocation start, SourceManager src_mgr, MessageHandler messages)
    {
        this.messages = messages;
        sm = src_mgr;
        start_loc = start;
        position = 0;
        source = sm.getRawData(start_loc);

        // Growing the table default-initialises every entry to the first
        // member of CharType (INVALID); only the bytes listed below are
        // accepted between tokens.
        charTable.length = 256;
        foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
            charTable[c] = CharType.Letter;

        foreach (c; "0123456789")
            charTable[c] = CharType.Number;

        foreach (c; "(){}[];:.,=!<>+-*/%\"`")
            charTable[c] = CharType.Symbol;

        // Tab and carriage return are whitespace in D as well; without them
        // a tab-indented or CRLF file is rejected as containing invalid symbols.
        foreach (c; " \t\r\n")
            charTable[c] = CharType.Whitespace;

        foreach (c; "'\\")
            charTable[c] = CharType.Other;

        // One handler per Symbol byte. lexSymbol dispatches through this
        // table after it has stepped past the symbol itself.
        symbolFunctions.length = 256;

        symbolFunctions['('] = &openParentheses;
        symbolFunctions[')'] = &closeParentheses;
        symbolFunctions['{'] = &openBrace;
        symbolFunctions['}'] = &closeBrace;
        symbolFunctions['['] = &openBracket;
        symbolFunctions[']'] = &closeBracket;
        symbolFunctions[';'] = &seperator;
        symbolFunctions[':'] = &colon;
        symbolFunctions['.'] = &dot;
        symbolFunctions[','] = &comma;
        symbolFunctions['='] = &eq;
        symbolFunctions['!'] = &ne;
        symbolFunctions['<'] = &le;
        symbolFunctions['>'] = &ge;
        symbolFunctions['+'] = &plus;
        symbolFunctions['-'] = &minus;
        symbolFunctions['*'] = &star;
        symbolFunctions['/'] = &slash;
        symbolFunctions['%'] = &percent;
        symbolFunctions['"'] = &string;
        symbolFunctions['`'] = &string;
    }

    /**
      Get the next token from the source. This method will move the
      internal position forward to the next Token.

      return: A Token - Token.type is Tok.EOF if there is
              no more tokens in the file.
    */
    Token next()
    {
        // Whitespace is skipped in a loop rather than by recursion, so a long
        // run of blanks cannot exhaust the stack.
        while (true)
        {
            switch (getNextChar())
            {
                case CharType.EOF:
                    SLoc loc;
                    return Token(Tok.EOF, loc, 0);

                case CharType.Whitespace:
                    position += 1;
                    continue;

                case CharType.Symbol:
                    return lexSymbol();

                case CharType.Letter:
                    return lexLetter();

                case CharType.Number:
                    return lexNumber();

                case CharType.Other:
                    messages.report(UnexpectedTok, Loc(position)).fatal(ExitLevel.Lexer);
                    // fatal() is expected not to return - TODO confirm.
                    assert(0);
            }
        }
    }

    /**
      Get a token from the source without consuming it. The internal position
      is restored before returning; diagnostics raised while lexing ahead are
      still reported.

      params:
        skip = number of tokens to skip before the one that is returned
               (0 returns the token that next() would return).

      return: A Token - Token.type is Tok.EOF if there is
              no more tokens in the file.
    */
    Token peek(int skip = 0)
    {
        int saved = position;
        for (; skip > 0; skip--)
            next();
        Token t = next();
        position = saved;
        return t;
    }

private:
    // Every symbol handler below is entered with `position` pointing at the
    // character right after the symbol, hence the `position - 1` locations.

    /// True when there is a character at `position` and it equals `expected`.
    bool nextIs(char expected)
    {
        return position < source.length && source[position] == expected;
    }

    /// '=' or "==".
    Token eq()
    {
        if (nextIs('='))
            return Token(Tok.Eq, Loc(position++ - 1), 2);
        return Token(Tok.Assign, Loc(position - 1), 1);
    }

    Token openBrace()
    {
        return Token(Tok.OpenBrace, Loc(position - 1), 1);
    }

    Token closeBrace()
    {
        return Token(Tok.CloseBrace, Loc(position - 1), 1);
    }

    Token openParentheses()
    {
        return Token(Tok.OpenParentheses, Loc(position - 1), 1);
    }

    Token closeParentheses()
    {
        return Token(Tok.CloseParentheses, Loc(position - 1), 1);
    }

    Token openBracket()
    {
        return Token(Tok.OpenBracket, Loc(position - 1), 1);
    }

    Token closeBracket()
    {
        return Token(Tok.CloseBracket, Loc(position - 1), 1);
    }

    Token seperator()
    {
        return Token(Tok.Seperator, Loc(position - 1), 1);
    }

    Token colon()
    {
        return Token(Tok.Colon, Loc(position - 1), 1);
    }

    /// '.', or the start of a number such as ".5" when a digit follows.
    Token dot()
    {
        if (getNextChar(0) == CharType.Number)
        {
            // Step back onto the '.' so lexNumber sees the whole literal.
            position--;
            return lexNumber();
        }
        return Token(Tok.Dot, Loc(position - 1), 1);
    }

    Token comma()
    {
        return Token(Tok.Comma, Loc(position - 1), 1);
    }

    /// '!' or "!=".
    Token ne()
    {
        if (nextIs('='))
            return Token(Tok.Ne, Loc(position++ - 1), 2);
        return Token(Tok.Not, Loc(position - 1), 1);
    }

    /// '<' or "<=".
    Token le()
    {
        if (nextIs('='))
            return Token(Tok.Le, Loc(position++ - 1), 2);
        return Token(Tok.Lt, Loc(position - 1), 1);
    }

    /// '>' or ">=".
    Token ge()
    {
        if (nextIs('='))
            return Token(Tok.Ge, Loc(position++ - 1), 2);
        return Token(Tok.Gt, Loc(position - 1), 1);
    }

    Token plus()
    {
        return Token(Tok.Plus, Loc(position - 1), 1);
    }

    Token minus()
    {
        return Token(Tok.Minus, Loc(position - 1), 1);
    }

    Token star()
    {
        return Token(Tok.Star, Loc(position - 1), 1);
    }

    /**
      '/' on its own, or one of the three comment forms. Comments are skipped
      and the token that follows them is returned instead.
    */
    Token slash()
    {
        // A '/' that is the last character of the input is a plain slash.
        if (position >= source.length)
            return Token(Tok.Slash, Loc(position - 1), 1);

        switch (source[position])
        {
            case '/':
                // Line comment: runs up to and including the next newline.
                while (position < source.length)
                {
                    if (source[position++] == '\n')
                        return next();
                }
                return Token(Tok.EOF, Loc(position), 0);

            case '*':
                return skipBlockComment();

            case '+':
                return skipNestingComment();

            default:
                return Token(Tok.Slash, Loc(position - 1), 1);
        }
    }

    /**
      Skip a block comment. Entered with `position` on the '*' of the opener.
      The comment body is not classified through charTable, so it may contain
      any byte.
    */
    Token skipBlockComment()
    {
        // Start one past the opening '*', so that "/*/" is not mistaken for
        // a complete comment.
        for (int i = position + 1; i + 1 < source.length; i++)
        {
            if (source[i] == '*' && source[i + 1] == '/')
            {
                position = i + 2;
                return next();
            }
        }
        return unterminatedComment();
    }

    /**
      Skip a nesting comment. Entered with `position` on the '+' of the opener.
      Nested openers and closers are counted, and the comment ends when the
      outermost one is closed.
    */
    Token skipNestingComment()
    {
        int depth = 1;
        int i = position + 1;   // first character after the opener

        while (i + 1 < source.length)
        {
            if (source[i] == '/' && source[i + 1] == '+')
            {
                depth++;
                i += 2;     // consume both characters so "/+/" is not also a closer
            }
            else if (source[i] == '+' && source[i + 1] == '/')
            {
                depth--;
                i += 2;     // consume both characters so "+/+" is not also an opener
                if (depth == 0)
                {
                    position = i;
                    return next();
                }
            }
            else
            {
                i++;
            }
        }
        return unterminatedComment();
    }

    /// Report a comment that is still open at the end of input and yield EOF.
    Token unterminatedComment()
    {
        position = cast(int) source.length;
        messages.report(UnexpectedEOFBlock, Loc(position));
        return Token(Tok.EOF, Loc(position), 0);
    }

    Token percent()
    {
        return Token(Tok.Percent, Loc(position - 1), 1);
    }

    /**
      Lex a string literal. Reached either from lexSymbol (position is one
      past the opening quote) or from lexLetter (position is on the quote and
      a single prefix letter precedes it). The returned token spans the prefix
      letter, if any, and both quotes.

      Back-quoted strings and double-quoted strings with the prefix letter 'r'
      end at the next matching quote; in other double-quoted strings a
      backslash hides the character after it. An unterminated string is
      reported as fatal.
    */
    Token string()
    {
        --position;
        int start = position;
        if (getNextChar() == CharType.Letter)
            position++;     // step over the prefix letter onto the quote

        char quote = source[position];

        // Only a prefix letter that belongs to this literal makes it a
        // wysiwyg string. An 'r' that merely ends the previous token does not.
        bool rawPrefix = position > start && source[start] == 'r';
        bool escapes = quote == '"' && !rawPrefix;

        // The contents are not classified through charTable, so a string may
        // contain any byte.
        int i = position + 1;
        while (i < source.length)
        {
            char c = source[i++];
            if (c == quote)
            {
                position = i;
                return Token(Tok.String, Loc(start), position - start);
            }
            if (escapes && c == '\\')
                i++;        // skip the escaped character, whatever it is
        }

        position = i;
        messages.report(UnexpectedEOFBlock, Loc(position)).fatal(ExitLevel.Lexer);
        // fatal() is expected not to return - TODO confirm.
        assert(0);
    }

    /**
      Lex a number starting at `position`: digits, '_' separators, '.' and
      'e'/'E'. A second '.' or a second exponent marker is reported but still
      consumed. NOTE: the token type is Tok.Integer even when a '.' or an
      exponent was seen.
    */
    Token lexNumber()
    {
        bool seenDot = false;
        bool seenExp = false;
        bool done = false;
        int i = 0;

        while (!done)
        {
            switch (getNextChar(i))
            {
                case CharType.Number:
                    break;

                case CharType.Symbol:
                    if (source[position + i] == '.')
                    {
                        if (seenDot)
                            messages.report(OnlyOneDotFloating, Loc(position + i));
                        seenDot = true;
                        break;
                    }
                    done = true;
                    continue;

                case CharType.Letter:
                    if (source[position + i] == '_')
                        break;
                    if (source[position + i] == 'e' ||
                        source[position + i] == 'E')
                    {
                        if (seenExp)
                            messages.report(OnlyOneEFloating, Loc(position + i));
                        seenExp = true;
                        break;
                    }
                    done = true;
                    continue;

                default:
                    done = true;
                    continue;
            }
            i++;
        }

        int start = position;
        position += i;

        return Token(Tok.Integer, Loc(start), i);
    }

    /// Consume one symbol character and hand over to its handler.
    Token lexSymbol()
    {
        return symbolFunctions[source[position++]]();
    }

    /**
      Lex an identifier or keyword. A letter directly followed by a quote is
      handed to string() as a prefixed string literal instead.
    */
    Token lexLetter()
    {
        if (position + 1 < source.length &&
            (source[position + 1] == '"' || source[position + 1] == '`'))
        {
            ++position;
            return string();
        }

        // The first character is already known to be a letter.
        int i = 1;
        bool hasNumber = false;
        while (true)
        {
            CharType type = getNextChar(i);
            if (type == CharType.Number)
                hasNumber = true;
            else if (type != CharType.Letter)
                break;
            i++;
        }

        Token t = Token(Tok.Identifier, Loc(), i);

        // The keyword lookup is skipped for names that contain a digit.
        if (!hasNumber)
        {
            char[] str = source[position .. position + i];
            if (str in keywords)
                t.type = keywords[str];
        }

        position += i;

        return t;
    }

    /**
      Classify the character `offset` places after the current position.
      Returns CharType.EOF past the end of the source; a character with no
      entry in charTable is reported as a fatal InvalidSymbol.
    */
    CharType getNextChar(int offset = 0)
    {
        if (position + offset >= this.source.length)
            return CharType.EOF;

        char current = source[position + offset];

        CharType c = charTable[current];

        if (c == CharType.INVALID)
            messages.report(InvalidSymbol, Loc())
                .arg(Integer.toString(cast(int)current))
                .fatal(ExitLevel.Lexer);

        return c;
    }

    /**
      Source location of offset `pos` into the text being lexed; a negative
      `pos` (the default) means the current position.
    */
    private final SourceLocation Loc(int pos = -1)
    {
        if (pos < 0)
            return start_loc + position;
        return start_loc + pos;
    }

    SourceManager sm;
    SourceLocation start_loc;           // location of source[0]
    int position;                       // index into source of the next unread character
    char[] source;                      // raw text, as returned by sm.getRawData
    MessageHandler messages;
    CharType[] charTable;               // classification of each of the 256 byte values
    Token delegate()[] symbolFunctions; // handler for each Symbol byte
}
435 | |
/**
  Classification of a single source character, as used by the Lexer's
  character table.
*/
enum CharType : ubyte
{
    /// Must stay the first member: a freshly sized table is filled with the
    /// enum's initial value, so every character that is never assigned a
    /// class ends up INVALID.
    INVALID = 0,

    /// Letters and '_'.
    Letter,

    /// Decimal digits.
    Number,

    /// Punctuation that has a handler in the Lexer's symbol table.
    Symbol,

    /// Characters skipped between tokens.
    Whitespace,

    /// Recognised characters the Lexer cannot turn into a token; next()
    /// reports them as unexpected.
    Other,

    /// Not stored in the table: returned by getNextChar past the end of input.
    EOF
}
447 |