comparison src/lexer/Lexer.d @ 206:d3c148ca429b

Major moving of files. all src now goes into src, all docs in docs.
author Anders Johnsen <skabet@gmail.com>
date Tue, 12 Aug 2008 18:14:56 +0200
parents
children e0551773a005
comparison
equal deleted inserted replaced
205:8387cbaa85ab 206:d3c148ca429b
1 module lexer.Lexer;
2
3 import basic.Message,
4 basic.SourceManager;
5
6 import lexer.Token,
7 lexer.Keyword;
8
9 import tango.io.Stdout;
10
11 /**
12 The Lexer class supplies methods to tokenize a D file. Give the Lexer a
13 SourceLocation and a SourceManager and you can 'peek' and 'next' Tokens from the file.
14
15 For more info about Tokens, look up the lexer.Token module.
16 */
17 class Lexer
18 {
19 public:
20
/**
  Create a new Lexer.

  Params:
    start    = location of the first character to lex; the raw text is
               fetched from src_mgr for this location.
    src_mgr  = source manager that supplies the raw character data.
    messages = handler that receives the diagnostics reported by the lexer.
 */
this(SourceLocation start, SourceManager src_mgr, MessageHandler messages)
{
    this.messages = messages;
    sm = src_mgr;
    start_loc = start;
    position = 0;
    source = sm.getRawData(start_loc);

    // One entry per possible char value. Entries that are not assigned
    // below keep the enum's default value, CharType.INVALID.
    charTable.length = 256;
    foreach (c; "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_")
        charTable[c] = CharType.Letter;

    foreach (c; "0123456789")
        charTable[c] = CharType.Number;

    foreach (c; "(){}[];:.,=!<>+-*/%\"`")
        charTable[c] = CharType.Symbol;

    // All D white space and end-of-line characters, so that tabs and
    // CRLF line endings are skipped instead of being reported as invalid.
    foreach (c; " \t\v\f\r\n")
        charTable[c] = CharType.Whitespace;

    foreach (c; "'\\")
        charTable[c] = CharType.Other;

    // Dispatch table indexed by the symbol character; lexSymbol calls the
    // entry for the character it has just consumed.
    symbolFunctions.length = 256;

    symbolFunctions['('] = &openParentheses;
    symbolFunctions[')'] = &closeParentheses;
    symbolFunctions['{'] = &openBrace;
    symbolFunctions['}'] = &closeBrace;
    symbolFunctions['['] = &openBracket;
    symbolFunctions[']'] = &closeBracket;
    symbolFunctions[';'] = &seperator;
    symbolFunctions[':'] = &colon;
    symbolFunctions['.'] = &dot;
    symbolFunctions[','] = &comma;
    symbolFunctions['='] = &eq;
    symbolFunctions['!'] = &ne;
    symbolFunctions['<'] = &le;
    symbolFunctions['>'] = &ge;
    symbolFunctions['+'] = &plus;
    symbolFunctions['-'] = &minus;
    symbolFunctions['*'] = &star;
    symbolFunctions['/'] = &slash;
    symbolFunctions['%'] = &percent;
    symbolFunctions['"'] = &string;
    symbolFunctions['`'] = &string;
}
73
/**
  Get the next token from the source and advance the internal position
  past it.

  Leading whitespace is skipped. A character classified as CharType.Other
  is reported as a fatal UnexpectedTok.

  return: A Token - its type is Tok.EOF when there are no more tokens in
          the file.
 */
Token next()
{
    // Step over whitespace one character at a time.
    CharType kind = getNextChar;
    while (kind == CharType.Whitespace)
    {
        position += 1;
        kind = getNextChar;
    }

    switch (kind)
    {
        case CharType.EOF:
            SLoc loc;
            return Token(Tok.EOF, loc, 0);

        case CharType.Symbol:
            return lexSymbol;

        case CharType.Letter:
            return lexLetter;

        case CharType.Number:
            return lexNumber;

        case CharType.Other:
            messages.report(UnexpectedTok, Loc(position)).fatal(ExitLevel.Lexer);
    }
}
105
/**
  Look ahead in the source without consuming anything: the internal
  position is restored before returning.

  Params:
    skip = number of tokens to step over before the one that is returned;
           0 returns the token that next() would return.

  return: A Token - its type is Tok.EOF when there are no more tokens in
          the file.
 */
Token peek(int skip = 0)
{
    int saved = this.position;

    // Lex and discard the tokens in front of the requested one.
    for (; skip > 0; skip--)
        this.next;

    Token result = this.next;
    this.position = saved;
    return result;
}
122
123 private:
/**
  Lex a token starting with '=': Tok.Eq for "==", otherwise Tok.Assign.
  lexSymbol has already moved position past the first '='.
 */
Token eq()
{
    // Bounds check first: the '=' may be the last character of the input.
    if (position < source.length && source[position] == '=')
        return Token(Tok.Eq, Loc(position++ - 1), 2);
    return Token(Tok.Assign, Loc(position - 1), 1);
}
/// Build the one-character Tok.OpenBrace token for the '{' just consumed.
Token openBrace()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.OpenBrace, where, 1);
}
/// Build the one-character Tok.CloseBrace token for the '}' just consumed.
Token closeBrace()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.CloseBrace, where, 1);
}
/// Build the one-character Tok.OpenParentheses token for the '(' just consumed.
Token openParentheses()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.OpenParentheses, where, 1);
}
/// Build the one-character Tok.CloseParentheses token for the ')' just consumed.
Token closeParentheses()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.CloseParentheses, where, 1);
}
/// Build the one-character Tok.OpenBracket token for the '[' just consumed.
Token openBracket()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.OpenBracket, where, 1);
}
/// Build the one-character Tok.CloseBracket token for the ']' just consumed.
Token closeBracket()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.CloseBracket, where, 1);
}
/// Build the one-character Tok.Seperator token for the ';' just consumed.
Token seperator()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.Seperator, where, 1);
}
/// Build the one-character Tok.Colon token for the ':' just consumed.
Token colon()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.Colon, where, 1);
}
/**
  Lex a token starting with '.'. lexSymbol has already moved position
  past the dot.

  When a digit follows directly, the dot starts a number: position is
  moved back onto the dot and lexNumber lexes it. Anything else, including
  end of input, yields a one-character Tok.Dot.
 */
Token dot()
{
    // getNextChar is bounds-checked, so a '.' at the very end of the
    // input simply falls through to the Tok.Dot case.
    if (getNextChar(0) == CharType.Number)
    {
        position--;
        return lexNumber();
    }
    return Token(Tok.Dot, Loc(position - 1), 1);
}
/// Build the one-character Tok.Comma token for the ',' just consumed.
Token comma()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.Comma, where, 1);
}
/**
  Lex a token starting with '!': Tok.Ne for "!=", otherwise Tok.Not.
  lexSymbol has already moved position past the '!'.
 */
Token ne()
{
    // Bounds check first: the '!' may be the last character of the input.
    if (position < source.length && source[position] == '=')
        return Token(Tok.Ne, Loc(position++ - 1), 2);
    return Token(Tok.Not, Loc(position - 1), 1);
}
/**
  Lex a token starting with '<': Tok.Le for "<=", otherwise Tok.Lt.
  lexSymbol has already moved position past the '<'.
 */
Token le()
{
    // Bounds check first: the '<' may be the last character of the input.
    if (position < source.length && source[position] == '=')
        return Token(Tok.Le, Loc(position++ - 1), 2);
    return Token(Tok.Lt, Loc(position - 1), 1);
}
/**
  Lex a token starting with '>': Tok.Ge for ">=", otherwise Tok.Gt.
  lexSymbol has already moved position past the '>'.
 */
Token ge()
{
    // Bounds check first: the '>' may be the last character of the input.
    if (position < source.length && source[position] == '=')
        return Token(Tok.Ge, Loc(position++ - 1), 2);
    return Token(Tok.Gt, Loc(position - 1), 1);
}
/// Build the one-character Tok.Plus token for the '+' just consumed.
Token plus()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.Plus, where, 1);
}
/// Build the one-character Tok.Minus token for the '-' just consumed.
Token minus()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.Minus, where, 1);
}
/// Build the one-character Tok.Star token for the '*' just consumed.
Token star()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.Star, where, 1);
}
/**
  Lex a token starting with '/'. lexSymbol has already moved position
  past the slash.

  - "//" starts a line comment that runs to the next '\n'.
  - "/*" starts a block comment that runs to the next "*" "/" pair.
  - "/+" starts a nesting comment; inner "/+" ... "+/" pairs are counted.
  - anything else (including end of input) is a one-character Tok.Slash.

  Comments are skipped and the token following them is returned. Comment
  text is scanned by raw index, so it may contain any character. When the
  input ends inside a line comment a Tok.EOF token is returned; when it
  ends inside a block or nesting comment, UnexpectedEOFBlock is reported
  first and then a Tok.EOF token is returned.
 */
Token slash()
{
    // A '/' that is the last character of the input is a plain slash.
    if (position >= source.length)
        return Token(Tok.Slash, Loc(position - 1), 1);

    switch (source[position])
    {
        case '/':
            while (position < source.length)
            {
                if (source[position++] == '\n')
                    return this.next;
            }
            return Token(Tok.EOF, Loc(position), 0);

        case '*':
            // Step over the '*' so that "/*/" is not taken as a complete
            // comment: the closing pair must start after the opener.
            position++;
            while (position + 1 < source.length)
            {
                if (source[position] == '*' && source[position + 1] == '/')
                {
                    position += 2;
                    return this.next;
                }
                position++;
            }
            position = cast(int) source.length;
            messages.report(UnexpectedEOFBlock, Loc(position));
            // Return here so control never runs on into the next case.
            return Token(Tok.EOF, Loc(position), 0);

        case '+':
        {
            position++;
            int nesting = 1;
            while (position + 1 < source.length)
            {
                if (source[position] == '+' && source[position + 1] == '/')
                {
                    // Consume both characters of the closer so neither is
                    // reused as part of a following opener ("+/+").
                    position += 2;
                    if (--nesting == 0)
                        return this.next;
                }
                else if (source[position] == '/' && source[position + 1] == '+')
                {
                    position += 2;
                    nesting++;
                }
                else
                    position++;
            }
            position = cast(int) source.length;
            messages.report(UnexpectedEOFBlock, Loc(position));
            return Token(Tok.EOF, Loc(position), 0);
        }

        default:
            return Token(Tok.Slash, Loc(position - 1), 1);
    }
}
265
/// Build the one-character Tok.Percent token for the '%' just consumed.
Token percent()
{
    SourceLocation where = Loc(position - 1);
    return Token(Tok.Percent, where, 1);
}
270
/**
  Lex a string literal and return it as a Tok.String token that spans the
  optional prefix letter, both quotes and everything in between.

  Reached in two ways, both of which leave position one past the
  character to start from: lexSymbol (position just past the opening
  quote) and lexLetter (position on the quote that follows a one-letter
  prefix).

  - `...` and r"..." are scanned to the next matching quote with no escape
    processing.
  - "..." (with any other prefix or none) treats a backslash as escaping
    the character after it.

  String contents are scanned by raw index, so they may contain any
  character. Reaching the end of the input before the closing quote is
  reported as a fatal UnexpectedEOFBlock.
 */
Token string()
{
    --position;
    int start = position;

    // Coming from lexLetter we are on the prefix letter; step to the quote.
    if (getNextChar() == CharType.Letter)
        position++;

    // Only a prefix letter consumed by this call can make the string raw.
    // Looking at source[position-1] alone would also match the last letter
    // of a preceding identifier such as bar"...".
    bool hasPrefix = position > start;

    char end = '`';
    bool escapes = false;
    switch (source[position])
    {
        case '"':
            end = '"';
            escapes = !(hasPrefix && source[start] == 'r');
            break;
        case '`':
            break;
    }

    ++position;     // past the opening quote
    while (position < source.length)
    {
        char c = source[position++];
        if (c == end)
            return Token(Tok.String, Loc(start), position - start);
        if (escapes && c == '\\')
            position++;     // skip the escaped character
    }
    messages.report(UnexpectedEOFBlock, Loc(position)).fatal(ExitLevel.Lexer);
}
310
/**
  Lex a numeric literal starting at position and advance past it.

  The literal is the longest run of digits, '_', '.', 'e' and 'E'. A
  second '.' is reported as OnlyOneDotFloating and a second exponent
  letter as OnlyOneEFloating; scanning continues after either report.

  return: A Tok.Integer token covering the scanned characters.
 */
Token lexNumber ()
{
    bool seenDot = false;
    bool seenExp = false;
    int len = 0;

    for (;; len++)
    {
        CharType kind = getNextChar(len);

        if (kind == CharType.Number)
            continue;

        if (kind == CharType.Symbol)
        {
            // '.' is the only symbol that may appear inside a number.
            if (this.source[position + len] != '.')
                break;
            if (seenDot)
                messages.report(OnlyOneDotFloating, Loc(position + len));
            seenDot = true;
            continue;
        }

        if (kind == CharType.Letter)
        {
            char c = this.source[position + len];
            if (c == '_')
                continue;
            if (c != 'e' && c != 'E')
                break;
            if (seenExp)
                messages.report(OnlyOneEFloating, Loc(position + len));
            seenExp = true;
            continue;
        }

        // Whitespace, Other or EOF ends the number.
        break;
    }

    int begin = position;
    position += len;

    return Token(Tok.Integer, Loc(begin), len);
}
361
/**
  Consume the symbol character at position and hand over to the handler
  registered for it in symbolFunctions. The handler runs with position
  already one past the symbol.
 */
Token lexSymbol ()
{
    char sym = source[position];
    position++;

    Token delegate() handler = symbolFunctions[sym];
    return handler();
}
368
/**
  Lex a token that starts with a letter or '_'.

  A letter directly followed by '"' or '`' is handed to string() as a
  prefixed string literal. Otherwise the longest run of letters and digits
  is taken as an identifier; a run without digits is looked up in
  keywords and, when found, the token gets the keyword's type.

  return: The string, identifier or keyword token; position is advanced
          past it.
 */
Token lexLetter ()
{
    int i = 0;
    bool hasNumber = false;

    // Bounds check first: the letter may be the last character of the input.
    if (position + 1 < source.length &&
        (source[position+1] == '"' ||
         source[position+1] == '`'))
    {
        // string() expects position to be one past its starting character.
        ++position;
        return string;
    }

    // The character at offset 0 is already known to be a letter.
    for (i = 1; ; i++)
    {
        CharType kind = getNextChar(i);
        if (kind == CharType.Number)
            hasNumber = true;
        else if (kind != CharType.Letter)
            break;
    }

    Token t = Token(Tok.Identifier, Loc(), i);

    if (!hasNumber)
    {
        char[] str = source[position .. position + i];
        if(str in keywords)
            t.type = keywords[str];
    }

    position += i;

    return t;
}
401
/**
  Classify the character at position + offset without consuming it.

  Params:
    offset = distance from the current position to the character to look at.

  return: CharType.EOF when position + offset is at or past the end of the
          source, otherwise the character's entry in charTable. A
          character whose entry is CharType.INVALID is reported as a fatal
          InvalidSymbol, with its numeric value as the message argument.
 */
CharType getNextChar(int offset = 0)
{
    if (position + offset >= this.source.length)
        return CharType.EOF;

    char current = source[position + offset];

    CharType c = charTable[current];

    // Report at the offending character, not at the current position.
    if(c == CharType.INVALID)
        messages.report(InvalidSymbol, Loc(position + offset))
            .arg(Integer.toString(cast(int)current))
            .fatal(ExitLevel.Lexer);

    return c;
}
419
/**
  Turn an offset into the source into a SourceLocation relative to
  start_loc.

  Params:
    pos = offset to convert; a negative value (the default) means the
          current position.
 */
private final SourceLocation Loc(int pos = -1)
{
    int offset = (pos < 0) ? position : pos;
    return start_loc + offset;
}
426
427 SourceManager sm;
428 SourceLocation start_loc;
429 int position;
430 char[] source;
431 MessageHandler messages;
432 CharType[] charTable;
433 Token delegate()[] symbolFunctions;
434 }
435
/**
  Character classes used by the Lexer's lookup table.

  INVALID is the first member and therefore the default value of every
  table entry that is not explicitly assigned.
 */
enum CharType : ubyte
{
    INVALID    = 0, /// no class assigned
    Letter     = 1, /// a-z, A-Z and '_'
    Number     = 2, /// 0-9
    Symbol     = 3, /// punctuation and quotes handled through symbolFunctions
    Whitespace = 4, /// skipped between tokens
    Other      = 5, /// known characters that cannot start a token

    EOF        = 6  /// returned by getNextChar past the end of the source
}
447