Mercurial > projects > dil
comparison src/dil/lexer/Lexer.d @ 806:bcb74c9b895c
Moved out files in the trunk folder to the root.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sun, 09 Mar 2008 00:12:19 +0100 |
parents | trunk/src/dil/lexer/Lexer.d@cb8040538772 |
children | 49e32b5bc161 |
comparison
equal
deleted
inserted
replaced
805:a3fab8b74a7d | 806:bcb74c9b895c |
---|---|
1 /++ | |
2 Author: Aziz Köksal | |
3 License: GPL3 | |
4 +/ | |
5 module dil.lexer.Lexer; | |
6 | |
7 import dil.lexer.Token; | |
8 import dil.lexer.Keywords; | |
9 import dil.lexer.Identifier; | |
10 import dil.lexer.IdTable; | |
11 import dil.Information; | |
12 import dil.Messages; | |
13 import dil.HtmlEntities; | |
14 import dil.CompilerInfo; | |
15 import dil.Unicode; | |
16 import dil.SourceText; | |
17 import dil.Time; | |
18 import common; | |
19 | |
20 import tango.stdc.stdlib : strtof, strtod, strtold; | |
21 import tango.stdc.errno : errno, ERANGE; | |
22 | |
23 public import dil.lexer.Funcs; | |
24 | |
25 /// The Lexer analyzes the characters of a source text and | |
26 /// produces a doubly-linked list of tokens. | |
27 class Lexer | |
28 { | |
SourceText srcText; /// The source text.
char* p; /// Points to the current character in the source text.
char* end; /// Points one character past the end of the source text.

Token* head; /// The head of the doubly linked token list.
Token* tail; /// The tail of the linked list. Set in scan() when an EOF token is produced.
Token* token; /// Points to the current token in the token list.

// Members used for error messages:
InfoManager infoMan; /// Used for collecting error messages. May be null (constructor default).
LexerError[] errors; /// NOTE(review): presumably the errors reported by this lexer - confirm in error().
/// Always points to the first character of the current line.
char* lineBegin;
// Token* newline; /// Current newline token.
uint lineNum = 1; /// Current, actual source text line number.
uint lineNum_hline; /// Line number set by #line.
uint inTokenString; /// > 0 if inside q{ }
/// Holds the original file path and the modified one (by #line.)
NewlineData.FilePaths* filePaths;
48 | |
/// Constructs a Lexer for the given source text.
///
/// Initializes the scan pointers, creates the head token followed by a
/// first newline token (original line number 1) and finally scans an
/// optional shebang line.
/// Params:
///   srcText = the UTF-8 source code.
///   infoMan = used for collecting error messages.
this(SourceText srcText, InfoManager infoMan = null)
{
  this.srcText = srcText;
  this.infoMan = infoMan;

  assert(text.length && text[$-1] == 0, "source text has no sentinel character");
  auto begin = text.ptr;
  this.p = begin;
  this.end = begin + text.length;
  this.lineBegin = begin;

  // The list always starts with an empty head token.
  auto headToken = new Token;
  headToken.kind = TOK.HEAD;
  headToken.start = headToken.end = begin;
  this.head = headToken;

  // Initialize this.filePaths.
  newFilePath(this.srcText.filePath);

  // The first token after the head is always an (empty) newline.
  auto firstNewline = new Token;
  firstNewline.kind = TOK.Newline;
  firstNewline.setWhitespaceFlag();
  firstNewline.start = firstNewline.end = begin;
  firstNewline.newline.filePaths = this.filePaths;
  firstNewline.newline.oriLineNum = 1;
  firstNewline.newline.setLineNum = 0;

  // Link the newline in behind the head and make it the current token.
  headToken.next = firstNewline;
  firstNewline.prev = headToken;
  this.token = firstNewline;

  scanShebang();
}
84 | |
/// The destructor deletes the doubly-linked token list.
///
/// Every token from head onwards is deleted exactly once. This also frees
/// the last token of the list when the lexer is destroyed before an EOF
/// token has been scanned (i.e. while tail is still null).
~this()
{
  auto token = head;
  while (token !is null)
  {
    // Remember the successor first; the node is gone after the delete.
    auto next = token.next;
    // An EOF token must be the tail and the last node of the list.
    assert(token.kind == TOK.EOF ? token == tail && next is null : 1);
    delete token;
    token = next;
  }
  // Don't keep pointers to deleted tokens around.
  head = null;
  tail = null;
}
97 | |
/// Returns the character data of the source text (srcText.data).
char[] text()
{
  return srcText.data;
}
102 | |
/// Scans the optional "shebang" line at the current position.
///
/// If the text at p starts with "#!", a whitespace-flagged Shebang token
/// spanning everything up to the end of the line is linked in behind the
/// current token. Otherwise nothing happens.
/// Regexp: #![^\EndOfLine]*
void scanShebang()
{
  if (p[0] != '#' || p[1] != '!')
    return;

  auto shebang = new Token;
  shebang.kind = TOK.Shebang;
  shebang.setWhitespaceFlag();
  shebang.start = p;

  // Skip "#!" and then everything up to the end of the line.
  p += 2;
  while (!isEndOfLine(p))
  {
    if (!isascii(*p))
      decodeUTF8();
    ++p;
  }
  shebang.end = p;

  // Append to the current token. this.token itself is not advanced.
  this.token.next = shebang;
  shebang.prev = this.token;
}
121 | |
/// Stores the value of a special token (one whose text starts with "__") in t.
///
/// FILE, DATE, TIME, TIMESTAMP and VENDOR set t.str;
/// LINE and VERSION set t.uint_.
void finalizeSpecialToken(ref Token t)
{
  assert(t.srcText[0..2] == "__");
  switch (t.kind)
  {
  case TOK.FILE:
    t.str = this.filePaths.setPath;
    break;
  case TOK.LINE:
    t.uint_ = this.errorLineNumber(this.lineNum);
    break;
  case TOK.DATE,
       TOK.TIME,
       TOK.TIMESTAMP:
    auto stamp = Time.toString();
    if (t.kind == TOK.DATE)
      stamp = Time.month_day(stamp) ~ ' ' ~ Time.year(stamp);
    else if (t.kind == TOK.TIME)
      stamp = Time.time(stamp);
    // For TOK.TIMESTAMP the complete string is the value.
    stamp ~= '\0'; // Terminate with a zero.
    t.str = stamp;
    break;
  case TOK.VENDOR:
    t.str = VENDOR;
    break;
  case TOK.VERSION:
    t.uint_ = VERSION_MAJOR*1000 + VERSION_MINOR;
    break;
  default:
    assert(0);
  }
}
161 | |
/// Installs a new FilePaths record in this.filePaths.
///
/// The record keeps the original path of the source text and newPath as
/// the path that was set.
void newFilePath(char[] newPath)
{
  auto record = new NewlineData.FilePaths;
  record.setPath = newPath;
  record.oriPath = this.srcText.filePath;
  this.filePaths = record;
}
170 | |
/// Records p as the beginning of the current line.
///
/// p must lie inside the source text and directly follow the last
/// character of a newline (both are only checked by assertions).
private void setLineBegin(char* p)
{
  auto previous = p - 1;
  // We must be able to look behind one character.
  assert(previous >= text.ptr && p < end);
  // The character before p has to terminate a newline.
  assert(isNewlineEnd(previous));
  this.lineBegin = p;
}
179 | |
/// Advances t to the next token.
///
/// If t already has a successor, that one is used. Otherwise, unless t is
/// the tail of the list, a new token is scanned and appended to the list.
/// At the tail t is left unchanged.
private void scanNext(ref Token* t)
{
  assert(t !is null);
  if (t.next)
  {
    // Already scanned: just step forward.
    t = t.next;
    return;
  }
  if (t == this.tail)
    return; // Nothing follows the tail.

  Token* scanned = new Token;
  scan(*scanned);
  scanned.prev = t;
  t.next = scanned;
  t = scanned;
}
201 | |
/// Advance t one token forward.
///
/// Delegates to scanNext(), which scans a new token if t has no successor
/// yet. The lexer's own current token (this.token) is only affected if t
/// refers to it.
void peek(ref Token* t)
{
  scanNext(t);
}
207 | |
/// Advance to the next token in the source text.
///
/// Moves this.token forward via scanNext().
/// Returns: the kind of the token that is current afterwards.
TOK nextToken()
{
  scanNext(this.token);
  return this.token.kind;
}
214 | |
/// Returns true if p points to the last character of a Newline.
///
/// That is either '\n' or '\r', or the third byte of LS or PS when the
/// two bytes before it match the first two bytes of LS.
bool isNewlineEnd(char* p)
{
  if (*p == '\n' || *p == '\r')
    return true;
  // Only look behind if the last byte matches and two bytes precede p.
  bool lastByteMatches = *p == LS[2] || *p == PS[2];
  return lastByteMatches &&
         (p-2) >= text.ptr &&
         p[-1] == LS[1] && p[-2] == LS[0];
}
226 | |
/// The main method which recognizes the characters that make up a token.
///
/// Complicated tokens are scanned in separate methods.
///
/// Leading whitespace is recorded in t.ws. On return t.start and t.end
/// delimit the token's text, t.kind holds its type and p points behind
/// the token. When an EOF token is produced, tail is set to it.
/// Params:
///   t = the token to be filled in.
public void scan(ref Token t)
in
{
  assert(text.ptr <= p && p < end);
}
out
{
  assert(text.ptr <= t.start && t.start < end, Token.toString(t.kind));
  assert(text.ptr <= t.end && t.end <= end, Token.toString(t.kind));
}
body
{
  // Scan whitespace.
  if (isspace(*p))
  {
    t.ws = p;
    while (isspace(*++p))
    {}
  }

  // Scan a token.
  uint c = *p;
  {
    t.start = p;
    // Newline.
    switch (*p)
    {
    case '\r':
      if (p[1] == '\n')
        ++p;
    // Intentional fall-through: p now points to the last newline character.
    case '\n':
      assert(isNewlineEnd(p));
      ++p;
      ++lineNum;
      setLineBegin(p);
//       this.newline = &t;
      t.kind = TOK.Newline;
      t.setWhitespaceFlag();
      t.newline.filePaths = this.filePaths;
      t.newline.oriLineNum = lineNum;
      t.newline.setLineNum = lineNum_hline;
      t.end = p;
      return;
    default:
      if (isUnicodeNewline(p))
      {
        // Skip to the last byte of the newline sequence.
        ++p; ++p;
        goto case '\n';
      }
    }
    // Identifier or string literal.
    if (isidbeg(c))
    {
      // r"..." - the ++p moves p onto the quote expected by scanRawStringLiteral().
      if (c == 'r' && p[1] == '"' && ++p)
        return scanRawStringLiteral(t);
      if (c == 'x' && p[1] == '"')
        return scanHexStringLiteral(t);
    version(D2)
    {
      if (c == 'q' && p[1] == '"')
        return scanDelimitedStringLiteral(t);
      if (c == 'q' && p[1] == '{')
        return scanTokenStringLiteral(t);
    }
      // Scan identifier.
      // Also entered by a goto from the non-ASCII check further below.
    Lidentifier:
      do
      { c = *++p; }
      while (isident(c) || !isascii(c) && isUnicodeAlpha())

      t.end = p;

      auto id = IdTable.lookup(t.srcText);
      t.kind = id.kind;
      t.ident = id;

      if (t.kind == TOK.Identifier || t.isKeyword)
        return;
      else if (t.isSpecialToken)
        finalizeSpecialToken(t);
      else if (t.kind == TOK.EOF)
      {
        // The identifier __EOF__ ends the token list.
        tail = &t;
        assert(t.srcText == "__EOF__");
      }
      else
        assert(0, "unexpected token type: " ~ Token.toString(t.kind));
      return;
    }

    if (isdigit(c))
      return scanNumber(t);

    // Division operators and comments.
    if (c == '/')
    {
      c = *++p;
      switch(c)
      {
      case '=':
        ++p;
        t.kind = TOK.DivAssign;
        t.end = p;
        return;
      case '+':
        return scanNestedComment(t);
      case '*':
        return scanBlockComment(t);
      case '/':
        // Line comment: runs up to, but not including, the end of the line.
        while (!isEndOfLine(++p))
          isascii(*p) || decodeUTF8();
        t.kind = TOK.Comment;
        t.setWhitespaceFlag();
        t.end = p;
        return;
      default:
        t.kind = TOK.Div;
        t.end = p;
        return;
      }
    }

    // Operators and punctuation.
    // Cases jump to Lcommon when p still points to the last character of
    // the token, and to Lcommon2 when p already points behind it.
    switch (c)
    {
    case '\'':
      return scanCharacterLiteral(t);
    case '`':
      return scanRawStringLiteral(t);
    case '"':
      return scanNormalStringLiteral(t);
    case '\\':
      // Consecutive escape sequences form one zero-terminated string token.
      char[] buffer;
      do
      {
        bool isBinary;
        c = scanEscapeSequence(isBinary);
        if (isascii(c) || isBinary)
          buffer ~= c;
        else
          encodeUTF8(buffer, c);
      } while (*p == '\\')
      buffer ~= 0;
      t.kind = TOK.String;
      t.str = buffer;
      t.end = p;
      return;
    case '>': /* >  >=  >>  >>=  >>>  >>>= */
      c = *++p;
      switch (c)
      {
      case '=':
        t.kind = TOK.GreaterEqual;
        goto Lcommon;
      case '>':
        if (p[1] == '>')
        {
          ++p;
          if (p[1] == '=')
          { ++p;
            t.kind = TOK.URShiftAssign;
          }
          else
            t.kind = TOK.URShift;
        }
        else if (p[1] == '=')
        {
          ++p;
          t.kind = TOK.RShiftAssign;
        }
        else
          t.kind = TOK.RShift;
        goto Lcommon;
      default:
        t.kind = TOK.Greater;
        goto Lcommon2;
      }
      assert(0);
    case '<': /* <  <=  <>  <>=  <<  <<= */
      c = *++p;
      switch (c)
      {
      case '=':
        t.kind = TOK.LessEqual;
        goto Lcommon;
      case '<':
        if (p[1] == '=') {
          ++p;
          t.kind = TOK.LShiftAssign;
        }
        else
          t.kind = TOK.LShift;
        goto Lcommon;
      case '>':
        if (p[1] == '=') {
          ++p;
          t.kind = TOK.LorEorG;
        }
        else
          t.kind = TOK.LorG;
        goto Lcommon;
      default:
        t.kind = TOK.Less;
        goto Lcommon2;
      }
      assert(0);
    case '!': /* !  !<  !>  !<=  !>=  !<>  !<>= */
      c = *++p;
      switch (c)
      {
      case '<':
        c = *++p;
        if (c == '>')
        {
          if (p[1] == '=') {
            ++p;
            t.kind = TOK.Unordered;
          }
          else
            t.kind = TOK.UorE;
        }
        else if (c == '=')
        {
          t.kind = TOK.UorG;
        }
        else {
          t.kind = TOK.UorGorE;
          goto Lcommon2;
        }
        goto Lcommon;
      case '>':
        if (p[1] == '=')
        {
          ++p;
          t.kind = TOK.UorL;
        }
        else
          t.kind = TOK.UorLorE;
        goto Lcommon;
      case '=':
        t.kind = TOK.NotEqual;
        goto Lcommon;
      default:
        t.kind = TOK.Not;
        goto Lcommon2;
      }
      assert(0);
    case '.': /* .  .[0-9]  ..  ... */
      if (p[1] == '.')
      {
        ++p;
        if (p[1] == '.') {
          ++p;
          t.kind = TOK.Ellipses;
        }
        else
          t.kind = TOK.Slice;
      }
      else if (isdigit(p[1]))
      {
        return scanReal(t);
      }
      else
        t.kind = TOK.Dot;
      goto Lcommon;
    case '|': /* |  ||  |= */
      c = *++p;
      if (c == '=')
        t.kind = TOK.OrAssign;
      else if (c == '|')
        t.kind = TOK.OrLogical;
      else {
        t.kind = TOK.OrBinary;
        goto Lcommon2;
      }
      goto Lcommon;
    case '&': /* &  &&  &= */
      c = *++p;
      if (c == '=')
        t.kind = TOK.AndAssign;
      else if (c == '&')
        t.kind = TOK.AndLogical;
      else {
        t.kind = TOK.AndBinary;
        goto Lcommon2;
      }
      goto Lcommon;
    case '+': /* +  ++  += */
      c = *++p;
      if (c == '=')
        t.kind = TOK.PlusAssign;
      else if (c == '+')
        t.kind = TOK.PlusPlus;
      else {
        t.kind = TOK.Plus;
        goto Lcommon2;
      }
      goto Lcommon;
    case '-': /* -  --  -= */
      c = *++p;
      if (c == '=')
        t.kind = TOK.MinusAssign;
      else if (c == '-')
        t.kind = TOK.MinusMinus;
      else {
        t.kind = TOK.Minus;
        goto Lcommon2;
      }
      goto Lcommon;
    case '=': /* =  == */
      if (p[1] == '=') {
        ++p;
        t.kind = TOK.Equal;
      }
      else
        t.kind = TOK.Assign;
      goto Lcommon;
    case '~': /* ~  ~= */
       if (p[1] == '=') {
         ++p;
         t.kind = TOK.CatAssign;
       }
       else
         t.kind = TOK.Tilde;
       goto Lcommon;
    case '*': /* *  *= */
       if (p[1] == '=') {
         ++p;
         t.kind = TOK.MulAssign;
       }
       else
         t.kind = TOK.Mul;
       goto Lcommon;
    case '^': /* ^  ^= */
       if (p[1] == '=') {
         ++p;
         t.kind = TOK.XorAssign;
       }
       else
         t.kind = TOK.Xor;
       goto Lcommon;
    case '%': /* %  %= */
       if (p[1] == '=') {
         ++p;
         t.kind = TOK.ModAssign;
       }
       else
         t.kind = TOK.Mod;
       goto Lcommon;
    // Single character tokens:
    case '(':
      t.kind = TOK.LParen;
      goto Lcommon;
    case ')':
      t.kind = TOK.RParen;
      goto Lcommon;
    case '[':
      t.kind = TOK.LBracket;
      goto Lcommon;
    case ']':
      t.kind = TOK.RBracket;
      goto Lcommon;
    case '{':
      t.kind = TOK.LBrace;
      goto Lcommon;
    case '}':
      t.kind = TOK.RBrace;
      goto Lcommon;
    case ':':
      t.kind = TOK.Colon;
      goto Lcommon;
    case ';':
      t.kind = TOK.Semicolon;
      goto Lcommon;
    case '?':
      t.kind = TOK.Question;
      goto Lcommon;
    case ',':
      t.kind = TOK.Comma;
      goto Lcommon;
    case '$':
      t.kind = TOK.Dollar;
    // Lcommon: step over the last character of the token first.
    Lcommon:
      ++p;
    // Lcommon2: p already points behind the token.
    Lcommon2:
      t.end = p;
      return;
    case '#':
      return scanSpecialTokenSequence(t);
    default:
    }

    // Check for EOF
    if (isEOF(c))
    {
      assert(isEOF(*p), ""~*p);
      t.kind = TOK.EOF;
      t.end = p;
      tail = &t;
      assert(t.start == t.end);
      return;
    }

    // A non-ASCII Unicode alpha character starts an identifier.
    if (!isascii(c))
    {
      c = decodeUTF8();
      if (isUniAlpha(c))
        goto Lidentifier;
    }

    // Anything else is reported and returned as an illegal token.
    error(t.start, MID.IllegalCharacter, cast(dchar)c);

    ++p;
    t.kind = TOK.Illegal;
    t.setWhitespaceFlag();
    t.dchar_ = c;
    t.end = p;
    return;
  }
}
648 | |
/// Packs the characters of a string literal (1 to 4 chars) into a uint.
///
/// The first character ends up in the most significant used byte,
/// e.g. ">=" becomes 0x3E3D.
template toUint(char[] T)
{
  static assert(0 < T.length && T.length <= 4);
  static if (T.length > 1)
    // Shift the value of the leading characters up and append the last one.
    const uint toUint = (toUint!(T[0..$-1]) << 8) | T[$-1];
  else
    const uint toUint = T[0];
}
static assert(toUint!("\xAA\xBB\xCC\xDD") == 0xAABBCCDD);
659 | |
/// Constructs case statements. E.g.:
/// ---
/// // case_!("<", "Less", "Lcommon") ->
/// case 60u:
///   t.kind = TOK.Less;
///   goto Lcommon;
/// ---
/// Params:
///   str = the characters of the token; converted to the case value with toUint.
///   kind = the name of the TOK member that is assigned to t.kind.
///   label = the label to jump to after the kind has been set.
/// Note: Can't use this yet due to a $(DMDBUG 1534, bug) in DMD.
template case_(char[] str, char[] kind, char[] label)
{
  const char[] case_ =
    `case `~toUint!(str).stringof~`:`
      `t.kind = TOK.`~kind~`;`
      `goto `~label~`;`;
}
//pragma(msg, case_!("<", "Less", "Lcommon"));
676 | |
/// Constructs a case statement for a 4 character token which jumps to Lcommon_4.
/// Params:
///   str = the characters of the token.
///   kind = the name of the TOK member (a string, as required by case_.)
template case_L4(char[] str, char[] kind)
{
  const char[] case_L4 = case_!(str, kind, "Lcommon_4");
}
681 | |
/// Constructs a case statement for a 3 character token which jumps to Lcommon_3.
/// Params:
///   str = the characters of the token.
///   kind = the name of the TOK member (a string, as required by case_.)
template case_L3(char[] str, char[] kind)
{
  const char[] case_L3 = case_!(str, kind, "Lcommon_3");
}
686 | |
/// Constructs a case statement for a 2 character token which jumps to Lcommon_2.
/// Params:
///   str = the characters of the token.
///   kind = the name of the TOK member (a string, as required by case_.)
template case_L2(char[] str, char[] kind)
{
  const char[] case_L2 = case_!(str, kind, "Lcommon_2");
}
691 | |
/// Constructs a case statement for a 1 character token which jumps to Lcommon.
///
/// The member must carry the template's own name, otherwise
/// case_L1!(...) doesn't evaluate to the generated string.
/// Params:
///   str = the characters of the token.
///   kind = the name of the TOK member (a string, as required by case_.)
template case_L1(char[] str, char[] kind)
{
  const char[] case_L1 = case_!(str, kind, "Lcommon");
}
696 | |
/// An alternative scan method.
/// Profiling shows it's a bit slower.
///
/// Up to four characters at p are packed into a uint (first character in
/// the most significant used byte) and matched against the 4, 3, 2 and 1
/// character tokens in turn, dropping the last character after each round.
/// Identifiers, numbers, EOF and illegal characters are handled afterwards.
///
/// Leading whitespace is recorded in t.ws. On return t.start and t.end
/// delimit the token's text, t.kind holds its type and p points behind
/// the token. When an EOF token is produced, tail is set to it.
/// Params:
///   t = the token to be filled in.
public void scan_(ref Token t)
in
{
  assert(text.ptr <= p && p < end);
}
out
{
  assert(text.ptr <= t.start && t.start < end, Token.toString(t.kind));
  assert(text.ptr <= t.end && t.end <= end, Token.toString(t.kind));
}
body
{
  // Scan whitespace.
  if (isspace(*p))
  {
    t.ws = p;
    while (isspace(*++p))
    {}
  }

  // Scan a token.
  t.start = p;
  // Newline.
  switch (*p)
  {
  case '\r':
    if (p[1] == '\n')
      ++p;
  // Intentional fall-through: p now points to the last newline character.
  case '\n':
    assert(isNewlineEnd(p));
    ++p;
    ++lineNum;
    setLineBegin(p);
//     this.newline = &t;
    t.kind = TOK.Newline;
    t.setWhitespaceFlag();
    t.newline.filePaths = this.filePaths;
    t.newline.oriLineNum = lineNum;
    t.newline.setLineNum = lineNum_hline;
    t.end = p;
    return;
  default:
    if (isUnicodeNewline(p))
    {
      // Skip to the last byte of the newline sequence.
      ++p; ++p;
      goto case '\n';
    }
  }

  // Pack as many characters as are available (at most 4) into c.
  uint c = *p;
  assert(end - p != 0);
  switch (end - p)
  {
  case 1:
    goto L1character;
  case 2:
    c <<= 8; c |= p[1];
    goto L2characters;
  case 3:
    c <<= 8; c |= p[1]; c <<= 8; c |= p[2];
    goto L3characters;
  default:
    version(BigEndian)
      c = *cast(uint*)p;
    else
    {
      c <<= 8; c |= p[1]; c <<= 8; c |= p[2]; c <<= 8; c |= p[3];
      /+
      c = *cast(uint*)p;
      asm
      {
        mov EDX, c;
        bswap EDX;
        mov c, EDX;
      }
      +/
    }
  }

  // 4 character tokens.
  switch (c)
  {
  case toUint!(">>>="):
    // ">>>=" is the unsigned right shift assignment (see scan().)
    t.kind = TOK.URShiftAssign;
    goto Lcommon_4;
  case toUint!("!<>="):
    t.kind = TOK.Unordered;
  Lcommon_4:
    p += 4;
    t.end = p;
    return;
  default:
  }

  c >>>= 8; // Drop the fourth character.
L3characters:
  assert(p == t.start);
  // 3 character tokens.
  switch (c)
  {
  case toUint!(">>="):
    t.kind = TOK.RShiftAssign;
    goto Lcommon_3;
  case toUint!(">>>"):
    t.kind = TOK.URShift;
    goto Lcommon_3;
  case toUint!("<>="):
    t.kind = TOK.LorEorG;
    goto Lcommon_3;
  case toUint!("<<="):
    t.kind = TOK.LShiftAssign;
    goto Lcommon_3;
  case toUint!("!<="):
    t.kind = TOK.UorG;
    goto Lcommon_3;
  case toUint!("!>="):
    t.kind = TOK.UorL;
    goto Lcommon_3;
  case toUint!("!<>"):
    t.kind = TOK.UorE;
    goto Lcommon_3;
  case toUint!("..."):
    t.kind = TOK.Ellipses;
  Lcommon_3:
    p += 3;
    t.end = p;
    return;
  default:
  }

  c >>>= 8; // Drop the third character.
L2characters:
  assert(p == t.start);
  // 2 character tokens.
  switch (c)
  {
  case toUint!("/+"):
    ++p; // Skip /
    return scanNestedComment(t);
  case toUint!("/*"):
    ++p; // Skip /
    return scanBlockComment(t);
  case toUint!("//"):
    ++p; // Skip /
    assert(*p == '/');
    // Line comment: runs up to, but not including, the end of the line.
    while (!isEndOfLine(++p))
      isascii(*p) || decodeUTF8();
    t.kind = TOK.Comment;
    t.setWhitespaceFlag();
    t.end = p;
    return;
  case toUint!(">="):
    t.kind = TOK.GreaterEqual;
    goto Lcommon_2;
  case toUint!(">>"):
    t.kind = TOK.RShift;
    goto Lcommon_2;
  case toUint!("<<"):
    t.kind = TOK.LShift;
    goto Lcommon_2;
  case toUint!("<="):
    t.kind = TOK.LessEqual;
    goto Lcommon_2;
  case toUint!("<>"):
    t.kind = TOK.LorG;
    goto Lcommon_2;
  case toUint!("!<"):
    t.kind = TOK.UorGorE;
    goto Lcommon_2;
  case toUint!("!>"):
    t.kind = TOK.UorLorE;
    goto Lcommon_2;
  case toUint!("!="):
    t.kind = TOK.NotEqual;
    goto Lcommon_2;
  case toUint!(".."):
    t.kind = TOK.Slice;
    goto Lcommon_2;
  case toUint!("&&"):
    t.kind = TOK.AndLogical;
    goto Lcommon_2;
  case toUint!("&="):
    t.kind = TOK.AndAssign;
    goto Lcommon_2;
  case toUint!("||"):
    t.kind = TOK.OrLogical;
    goto Lcommon_2;
  case toUint!("|="):
    t.kind = TOK.OrAssign;
    goto Lcommon_2;
  case toUint!("++"):
    t.kind = TOK.PlusPlus;
    goto Lcommon_2;
  case toUint!("+="):
    t.kind = TOK.PlusAssign;
    goto Lcommon_2;
  case toUint!("--"):
    t.kind = TOK.MinusMinus;
    goto Lcommon_2;
  case toUint!("-="):
    t.kind = TOK.MinusAssign;
    goto Lcommon_2;
  case toUint!("=="):
    t.kind = TOK.Equal;
    goto Lcommon_2;
  case toUint!("~="):
    t.kind = TOK.CatAssign;
    goto Lcommon_2;
  case toUint!("*="):
    t.kind = TOK.MulAssign;
    goto Lcommon_2;
  case toUint!("/="):
    t.kind = TOK.DivAssign;
    goto Lcommon_2;
  case toUint!("^="):
    t.kind = TOK.XorAssign;
    goto Lcommon_2;
  case toUint!("%="):
    t.kind = TOK.ModAssign;
  Lcommon_2:
    p += 2;
    t.end = p;
    return;
  default:
  }

  c >>>= 8; // Drop the second character; c is now the character at p.
L1character:
  assert(p == t.start);
  assert(*p == c, Format("p={0},c={1}", *p, cast(dchar)c));
  // 1 character tokens.
  // TODO: consider storing the token type in ptable.
  switch (c)
  {
  case '\'':
    return scanCharacterLiteral(t);
  case '`':
    return scanRawStringLiteral(t);
  case '"':
    return scanNormalStringLiteral(t);
  case '\\':
    // Consecutive escape sequences form one zero-terminated string token.
    char[] buffer;
    do
    {
      bool isBinary;
      c = scanEscapeSequence(isBinary);
      if (isascii(c) || isBinary)
        buffer ~= c;
      else
        encodeUTF8(buffer, c);
    } while (*p == '\\')
    buffer ~= 0;
    t.kind = TOK.String;
    t.str = buffer;
    t.end = p;
    return;
  // '<' is Less and '>' is Greater, exactly as in scan().
  case '<':
    t.kind = TOK.Less;
    goto Lcommon;
  case '>':
    t.kind = TOK.Greater;
    goto Lcommon;
  case '^':
    t.kind = TOK.Xor;
    goto Lcommon;
  case '!':
    t.kind = TOK.Not;
    goto Lcommon;
  case '.':
    if (isdigit(p[1]))
      return scanReal(t);
    t.kind = TOK.Dot;
    goto Lcommon;
  case '&':
    t.kind = TOK.AndBinary;
    goto Lcommon;
  case '|':
    t.kind = TOK.OrBinary;
    goto Lcommon;
  case '+':
    t.kind = TOK.Plus;
    goto Lcommon;
  case '-':
    t.kind = TOK.Minus;
    goto Lcommon;
  case '=':
    t.kind = TOK.Assign;
    goto Lcommon;
  case '~':
    t.kind = TOK.Tilde;
    goto Lcommon;
  case '*':
    t.kind = TOK.Mul;
    goto Lcommon;
  case '/':
    t.kind = TOK.Div;
    goto Lcommon;
  case '%':
    t.kind = TOK.Mod;
    goto Lcommon;
  case '(':
    t.kind = TOK.LParen;
    goto Lcommon;
  case ')':
    t.kind = TOK.RParen;
    goto Lcommon;
  case '[':
    t.kind = TOK.LBracket;
    goto Lcommon;
  case ']':
    t.kind = TOK.RBracket;
    goto Lcommon;
  case '{':
    t.kind = TOK.LBrace;
    goto Lcommon;
  case '}':
    t.kind = TOK.RBrace;
    goto Lcommon;
  case ':':
    t.kind = TOK.Colon;
    goto Lcommon;
  case ';':
    t.kind = TOK.Semicolon;
    goto Lcommon;
  case '?':
    t.kind = TOK.Question;
    goto Lcommon;
  case ',':
    t.kind = TOK.Comma;
    goto Lcommon;
  case '$':
    t.kind = TOK.Dollar;
  Lcommon:
    ++p;
    t.end = p;
    return;
  case '#':
    return scanSpecialTokenSequence(t);
  default:
  }

  assert(p == t.start);
  assert(*p == c);

  // TODO: consider moving isidbeg() and isdigit() up.
  if (isidbeg(c))
  {
    // r"..." - the ++p moves p onto the quote expected by scanRawStringLiteral().
    if (c == 'r' && p[1] == '"' && ++p)
      return scanRawStringLiteral(t);
    if (c == 'x' && p[1] == '"')
      return scanHexStringLiteral(t);
  version(D2)
  {
    if (c == 'q' && p[1] == '"')
      return scanDelimitedStringLiteral(t);
    if (c == 'q' && p[1] == '{')
      return scanTokenStringLiteral(t);
  }
    // Scan identifier.
    // Also entered by a goto from the non-ASCII check further below.
  Lidentifier:
    do
    { c = *++p; }
    while (isident(c) || !isascii(c) && isUnicodeAlpha())

    t.end = p;

    auto id = IdTable.lookup(t.srcText);
    t.kind = id.kind;
    t.ident = id;

    if (t.kind == TOK.Identifier || t.isKeyword)
      return;
    else if (t.isSpecialToken)
      finalizeSpecialToken(t);
    else if (t.kind == TOK.EOF)
    {
      // The identifier __EOF__ ends the token list.
      tail = &t;
      assert(t.srcText == "__EOF__");
    }
    else
      assert(0, "unexpected token type: " ~ Token.toString(t.kind));
    return;
  }

  if (isdigit(c))
    return scanNumber(t);

  // Check for EOF
  if (isEOF(c))
  {
    assert(isEOF(*p), *p~"");
    t.kind = TOK.EOF;
    t.end = p;
    tail = &t;
    assert(t.start == t.end);
    return;
  }

  // A non-ASCII Unicode alpha character starts an identifier.
  if (!isascii(c))
  {
    c = decodeUTF8();
    if (isUniAlpha(c))
      goto Lidentifier;
  }

  // Anything else is reported and returned as an illegal token.
  error(t.start, MID.IllegalCharacter, cast(dchar)c);

  ++p;
  t.kind = TOK.Illegal;
  t.setWhitespaceFlag();
  t.dchar_ = c;
  t.end = p;
  return;
}
1113 | |
/// Scans a block comment.
///
/// Expects p to point to the '*' of the opening "/*". Line bookkeeping is
/// updated for every newline inside the comment. If the end of the file is
/// reached first, an UnterminatedBlockComment error is reported.
///
/// BlockComment := "/*" AnyChar* "*/"
void scanBlockComment(ref Token t)
{
  assert(p[-1] == '/' && *p == '*');
  // Remember where the comment started for the error message.
  auto tokenLineNum = lineNum;
  auto tokenLineBegin = lineBegin;
  for (;;)
  {
    ++p;
    char ch = *p;
    if (ch == '*')
    {
      if (p[1] == '/')
      {
        p += 2; // Skip the closing "*/".
        break;
      }
      continue;
    }

    bool atNewline = false;
    if (ch == '\r')
    {
      if (p[1] == '\n')
        ++p;
      atNewline = true;
    }
    else if (ch == '\n')
      atNewline = true;
    else if (!isascii(ch))
      atNewline = isUnicodeNewlineChar(decodeUTF8());
    else if (isEOF(ch))
    {
      error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedBlockComment);
      break;
    }

    if (atNewline)
    {
      assert(isNewlineEnd(p));
      ++lineNum;
      setLineBegin(p+1);
    }
  }
  t.kind = TOK.Comment;
  t.setWhitespaceFlag();
  t.end = p;
  return;
}
1158 | |
/// Scans a nested comment.
///
/// Expects p to point to the '+' of the opening "/+". Every further "/+"
/// increases the nesting level and every "+/" decreases it; the comment
/// ends when the level drops to zero. If the end of the file is reached
/// first, an UnterminatedNestedComment error is reported.
///
/// NestedComment := "/+" (AnyChar* | NestedComment) "+/"
void scanNestedComment(ref Token t)
{
  assert(p[-1] == '/' && *p == '+');
  // Remember where the comment started for the error message.
  auto tokenLineNum = lineNum;
  auto tokenLineBegin = lineBegin;
  uint level = 1;
  for (;;)
  {
    ++p;
    char ch = *p;
    if (ch == '/')
    {
      if (p[1] == '+')
      {
        ++p;
        ++level; // A nested comment opens.
      }
      continue;
    }
    if (ch == '+')
    {
      if (p[1] != '/')
        continue;
      ++p;
      if (--level != 0)
        continue; // Only an inner comment was closed.
      ++p;
      break;
    }

    bool atNewline = false;
    if (ch == '\r')
    {
      if (p[1] == '\n')
        ++p;
      atNewline = true;
    }
    else if (ch == '\n')
      atNewline = true;
    else if (!isascii(ch))
      atNewline = isUnicodeNewlineChar(decodeUTF8());
    else if (isEOF(ch))
    {
      error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedNestedComment);
      break;
    }

    if (atNewline)
    {
      assert(isNewlineEnd(p));
      ++lineNum;
      setLineBegin(p+1);
    }
  }
  t.kind = TOK.Comment;
  t.setWhitespaceFlag();
  t.end = p;
  return;
}
1211 | |
/// Scans the postfix character of a string literal.
///
/// Returns: the postfix character, which is also skipped, or 0 if the
/// character at p is not one of 'c', 'w' and 'd'.
///
/// PostfixChar := "c" | "w" | "d"
char scanPostfix()
{
  // Must be called directly behind the closing delimiter of a string.
  assert(p[-1] == '"' || p[-1] == '`' ||
    { version(D2) return p[-1] == '}';
               else return 0; }()
  );
  char postfix = *p;
  if (postfix == 'c' || postfix == 'w' || postfix == 'd')
  {
    ++p;
    return postfix;
  }
  return 0;
}
1232 | |
/// Scans a normal string literal.
///
/// Expects p to point to the opening quote. Escape sequences are
/// evaluated, every newline is stored as a single '\n' and the resulting
/// zero-terminated string is stored in t.str. If the end of the file is
/// reached before the closing quote, an UnterminatedString error is
/// reported.
///
/// NormalStringLiteral := "\"" Char* "\""
void scanNormalStringLiteral(ref Token t)
{
  assert(*p == '"');
  // Remember where the literal started for the error message.
  auto tokenLineNum = lineNum;
  auto tokenLineBegin = lineBegin;
  t.kind = TOK.String;
  char[] buffer;
  uint c;
  for (;;)
  {
    c = *++p;
    if (c == '"')
    {
      // Closing quote, optionally followed by a postfix character.
      ++p;
      t.pf = scanPostfix();
      break;
    }
    if (c == 0 || c == _Z_)
    {
      error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedString);
      break;
    }
    if (c == '\\')
    {
      bool isBinary;
      c = scanEscapeSequence(isBinary);
      --p; // The loop increments p again.
      if (isascii(c) || isBinary)
        buffer ~= c;
      else
        encodeUTF8(buffer, c);
      continue;
    }

    bool atNewline = false;
    if (c == '\r')
    {
      if (p[1] == '\n')
        ++p;
      atNewline = true;
    }
    else if (c == '\n')
      atNewline = true;
    else if (!isascii(c))
    {
      c = decodeUTF8();
      if (!isUnicodeNewlineChar(c))
      {
        encodeUTF8(buffer, c);
        continue;
      }
      atNewline = true;
    }

    if (atNewline)
    {
      assert(isNewlineEnd(p));
      c = '\n'; // Convert Newline to \n.
      ++lineNum;
      setLineBegin(p+1);
    }
    assert(isascii(c));
    buffer ~= c;
  }
  t.str = buffer ~ '\0';
  t.end = p;
  return;
}
1292 | |
/// Scans a character literal.
///
/// CharLiteral := "'" Char "'"
///
/// Reports an error for an empty literal ('') and for a missing
/// closing quote.
void scanCharacterLiteral(ref Token t)
{
  assert(*p == '\'');
  ++p; // Skip the opening quote.
  t.kind = TOK.CharLiteral;
  if (*p == '\\')
  {
    bool ignored; // The isBinary flag is of no interest here.
    t.dchar_ = scanEscapeSequence(ignored);
  }
  else if (*p == '\'')
    error(t.start, MID.EmptyCharacterLiteral);
  else if (!isEndOfLine(p))
  {
    uint value = *p;
    if (!isascii(value))
      value = decodeUTF8();
    t.dchar_ = value;
    ++p;
  }

  // Expect the closing quote.
  if (*p != '\'')
    error(t.start, MID.UnterminatedCharacterLiteral);
  else
    ++p;
  t.end = p;
}
1326 | |
/// Scans a raw string literal.
///
/// RawStringLiteral := "r\"" AnyChar* "\"" | "`" AnyChar* "`"
///
/// No escape sequences are evaluated. Every kind of newline is stored
/// as a single '\n'. The resulting value is zero-terminated.
/// Params:
///   t = receives the kind, string value, postfix and end pointer.
void scanRawStringLiteral(ref Token t)
{
  assert(*p == '`' || *p == '"' && p[-1] == 'r');
  // Remember where the literal started for the "unterminated" report.
  auto tokenLineNum = lineNum;
  auto tokenLineBegin = lineBegin;
  t.kind = TOK.String;
  uint delim = *p; // Either '`' or '"' (see the assert above).
  char[] buffer;
  uint c;
  while (1)
  {
    c = *++p;
    switch (c)
    {
    case '\r':
      if (p[1] == '\n')
        ++p;
      // Fall through.
    case '\n':
      assert(isNewlineEnd(p));
      c = '\n'; // Convert Newline to '\n'.
      ++lineNum;
      setLineBegin(p+1);
      break;
    case '`':
    case '"':
      if (c == delim)
      {
        ++p; // Skip the closing delimiter.
        t.pf = scanPostfix();
      Lreturn:
        t.str = buffer ~ '\0';
        t.end = p;
        return;
      }
      break; // The other quote character is ordinary content.
    case 0, _Z_:
      // delim holds the quote character, so r"..." literals are recognized
      // by '"'. Comparing against 'r' could never be true and always
      // produced the back-quote message.
      error(tokenLineNum, tokenLineBegin, t.start,
        delim == '"' ? MID.UnterminatedRawString : MID.UnterminatedBackQuoteString);
      goto Lreturn;
    default:
      if (!isascii(c))
      {
        c = decodeUTF8();
        if (isUnicodeNewlineChar(c))
          goto case '\n';
        encodeUTF8(buffer, c);
        continue;
      }
    }
    assert(isascii(c));
    buffer ~= c;
  }
  assert(0);
}
1384 | |
/// Scans a hexadecimal string literal.
///
/// HexStringLiteral := "x\"" (HexChar HexChar)* "\""
///
/// Whitespace and newlines between the digits are skipped. Every pair of
/// hex digits yields one byte of the zero-terminated result.
void scanHexStringLiteral(ref Token t)
{
  assert(p[0] == 'x' && p[1] == '"');
  t.kind = TOK.String;

  // Remember where the literal started for error reports.
  auto startLineNum = lineNum;
  auto startLineBegin = lineBegin;

  uint c;
  ubyte[] bytes;    // The decoded bytes.
  ubyte pending;    // Holds the high nibble until the low one arrives.
  uint digitCount;  // Number of hex digits seen so far.

  ++p; // Skip x
  assert(*p == '"');
  while (1)
  {
    c = *++p;
    switch (c)
    {
    case '"':
      if (digitCount & 1)
        error(startLineNum, startLineBegin, t.start, MID.OddNumberOfDigitsInHexString);
      ++p; // Skip the closing quote.
      t.pf = scanPostfix();
      goto Lfinish;
    case '\r':
      if (p[1] == '\n')
        ++p;
      // Fall through.
    case '\n':
      assert(isNewlineEnd(p));
      ++lineNum;
      setLineBegin(p+1);
      continue;
    default:
      if (ishexad(c))
      {
        // Convert the digit character to its numeric value.
        if (c <= '9')
          c -= '0';
        else if (c <= 'F')
          c -= 'A' - 10;
        else
          c -= 'a' - 10;

        if (digitCount & 1)
        { // Second digit of a pair: complete the byte.
          pending <<= 4;
          pending |= c;
          bytes ~= pending;
        }
        else
          pending = cast(ubyte)c;
        ++digitCount;
        continue;
      }
      if (isspace(c))
        continue; // Skip spaces.
      if (isEOF(c))
      {
        error(startLineNum, startLineBegin, t.start, MID.UnterminatedHexString);
        t.pf = 0;
        goto Lfinish;
      }
      // Anything else is an illegal character, unless it is a Unicode newline.
      auto errorAt = p;
      if (!isascii(c))
      {
        c = decodeUTF8();
        if (isUnicodeNewlineChar(c))
          goto case '\n';
      }
      error(errorAt, MID.NonHexCharInHexString, cast(dchar)c);
    }
  }
Lfinish:
  t.str = cast(string) (bytes ~= 0);
  t.end = p;
}
1469 | |
1470 version(DDoc) | |
1471 { | |
1472 /// Scans a delimited string literal. | |
1473 void scanDelimitedStringLiteral(ref Token t); | |
1474 /// Scans a token string literal. | |
1475 /// | |
1476 /// TokenStringLiteral := "q{" Token* "}" | |
1477 void scanTokenStringLiteral(ref Token t); | |
1478 } | |
1479 else | |
1480 version(D2) | |
1481 { | |
/// Scans a delimited string literal (q"...").
///
/// The delimiter is either a nesting bracket pair ("(", "[", "<", "{"),
/// an identifier followed by a newline (heredoc style), or any other
/// single character. Every kind of newline inside the literal is stored
/// as '\n'. The resulting value is zero-terminated.
/// Params:
///   t = receives the kind, string value, postfix and end pointer.
void scanDelimitedStringLiteral(ref Token t)
{
  assert(p[0] == 'q' && p[1] == '"');
  t.kind = TOK.String;

  auto tokenLineNum = lineNum;
  auto tokenLineBegin = lineBegin;

  char[] buffer;
  dchar opening_delim = 0, // 0 if no nested delimiter or '[', '(', '<', '{'
        closing_delim; // Will be ']', ')', '>', '}',
                       // the first character of an identifier or
                       // any other Unicode/ASCII character.
  char[] str_delim; // Identifier delimiter.
  uint level = 1; // Counter for nestable delimiters.

  ++p; ++p; // Skip q"
  uint c = *p;
  switch (c)
  {
  case '(':
    opening_delim = c;
    closing_delim = ')'; // c + 1
    break;
  case '[', '<', '{':
    opening_delim = c;
    closing_delim = c + 2; // Get to closing counterpart. Feature of ASCII table.
    break;
  default:
    // Consumes one newline sequence at p, if there is one.
    // Returns '\n' if a newline was consumed, otherwise 0.
    dchar scanNewline()
    {
      switch (*p)
      {
      case '\r':
        if (p[1] == '\n')
          ++p;
        // Fall through.
      case '\n':
        assert(isNewlineEnd(p));
        ++p;
        ++lineNum;
        setLineBegin(p);
        return '\n';
      default:
        if (isUnicodeNewline(p))
        {
          ++p; ++p; // Together with ++p in case '\n' this skips three bytes.
          goto case '\n';
        }
      }
      return 0;
    }
    // Skip leading newlines:
    while (scanNewline() != 0)
    {}
    assert(!isNewline(p));

    char* begin = p;
    c = *p;
    closing_delim = c;
    // TODO: Check for non-printable characters?
    if (!isascii(c))
    {
      closing_delim = decodeUTF8();
      if (!isUniAlpha(closing_delim))
        break; // Not an identifier.
    }
    else if (!isidbeg(c))
      break; // Not an identifier.

    // Parse Identifier + EndOfLine
    do
    { c = *++p; }
    while (isident(c) || !isascii(c) && isUnicodeAlpha())
    // Store identifier
    str_delim = begin[0..p-begin];
    // Scan newline
    if (scanNewline() == '\n')
      --p; // Go back one because of "c = *++p;" in main loop.
    else
    {
      // TODO: error(p, MID.ExpectedNewlineAfterIdentDelim);
    }
  }

  // Returns true if the identifier delimiter starts at p and
  // p is located at the beginning of a line.
  bool checkStringDelim(char* p)
  {
    assert(str_delim.length != 0);
    // Nothing has been copied yet if the delimiter directly follows the
    // newline that terminated the opening identifier; that newline is not
    // stored in buffer. Indexing buffer[$-1] would be out of bounds then,
    // so an empty buffer is treated as the start of a line.
    if ((buffer.length == 0 || buffer[$-1] == '\n') && // Must be at line start.
        end-p >= str_delim.length && // Check remaining length.
        p[0..str_delim.length] == str_delim) // Compare.
      return true;
    return false;
  }

  while (1)
  {
    c = *++p;
    switch (c)
    {
    case '\r':
      if (p[1] == '\n')
        ++p;
      // Fall through.
    case '\n':
      assert(isNewlineEnd(p));
      c = '\n'; // Convert Newline to '\n'.
      ++lineNum;
      setLineBegin(p+1);
      break;
    case 0, _Z_:
      // TODO: error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedDelimitedString);
      goto Lreturn3;
    default:
      if (!isascii(c))
      {
        auto begin = p;
        c = decodeUTF8();
        if (isUnicodeNewlineChar(c))
          goto case '\n';
        if (c == closing_delim)
        {
          if (str_delim.length)
          {
            if (checkStringDelim(begin))
            {
              p = begin + str_delim.length;
              goto Lreturn2;
            }
          }
          else
          {
            // A non-ASCII delimiter never nests.
            assert(level == 1);
            --level;
            goto Lreturn;
          }
        }
        encodeUTF8(buffer, c);
        continue;
      }
      else
      {
        if (c == opening_delim)
          ++level;
        else if (c == closing_delim)
        {
          if (str_delim.length)
          {
            if (checkStringDelim(p))
            {
              p += str_delim.length;
              goto Lreturn2;
            }
          }
          else if (--level == 0)
            goto Lreturn;
        }
      }
    }
    assert(isascii(c));
    buffer ~= c;
  }
Lreturn: // Character delimiter.
  assert(c == closing_delim);
  assert(level == 0);
  ++p; // Skip closing delimiter.
Lreturn2: // String delimiter.
  if (*p == '"')
    ++p;
  else
  {
    // TODO: error(p, MID.ExpectedDblQuoteAfterDelim, str_delim.length ? str_delim : closing_delim~"");
  }

  t.pf = scanPostfix();
Lreturn3: // Error.
  t.str = buffer ~ '\0';
  t.end = p;
}
1659 | |
/// Scans a token string literal.
///
/// TokenStringLiteral := "q{" Token* "}"
///
/// The tokens between the braces are scanned with scan() and linked
/// into a list that is stored in t.tok_str. The string value is the source
/// text between the braces with every newline converted to '\n'.
void scanTokenStringLiteral(ref Token t)
{
  assert(p[0] == 'q' && p[1] == '{');
  t.kind = TOK.String;

  // Kept for the pending "unterminated token string" error report.
  auto tokenLineNum = lineNum;
  auto tokenLineBegin = lineBegin;

  // A guard against changes to particular members:
  // this.lineNum_hline and this.errorPath
  ++inTokenString;

  uint startLineNum = this.lineNum; // To detect if any newline was scanned.
  uint depth = 1; // Nesting depth of the braces.

  ++p; ++p; // Skip q{

  auto last = &t;
  Token* token;
  while (1)
  {
    token = new Token;
    scan(*token);
    // Save the tokens in a doubly linked list.
    // Could be useful for various tools.
    token.prev = last;
    last.next = token;
    last = token;

    auto kind = token.kind;
    if (kind == TOK.LBrace)
      ++depth;
    else if (kind == TOK.RBrace)
    {
      if (--depth == 0)
      { // Found the closing brace: detach the inner list from t.
        t.tok_str = t.next;
        t.next = null;
        break;
      }
    }
    else if (kind == TOK.EOF)
    {
      // TODO: error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedTokenString);
      t.tok_str = t.next;
      t.next = token;
      break;
    }
  }

  assert(token.kind == TOK.RBrace || token.kind == TOK.EOF);
  assert(token.kind == TOK.RBrace && t.next is null ||
         token.kind == TOK.EOF && t.next !is null);

  char[] value;
  // token points to } or EOF
  if (token.kind != TOK.EOF)
  {
    // Take the text before scanPostfix() moves p.
    t.end = p;
    value = t.srcText[2..$-1].dup ~ '\0'; // Without q{ and }
    t.pf = scanPostfix();
    t.end = p; // Assign again because of postfix.
  }
  else
  {
    t.end = token.start;
    value = t.srcText[2..$].dup ~ '\0'; // Without q{
  }

  // Convert newlines to '\n', but only if at least one newline was scanned.
  if (startLineNum != this.lineNum)
  {
    assert(value[$-1] == '\0');
    uint src, dst; // Read and write positions; compaction is done in place.
    for (; src < value.length; ++src)
      switch (value[src])
      {
      case '\r':
        if (value[src+1] == '\n')
          ++src;
        // Fall through.
      case '\n':
        assert(isNewlineEnd(value.ptr + src));
        value[dst++] = '\n'; // Convert Newline to '\n'.
        break;
      default:
        if (isUnicodeNewline(value.ptr + src))
        {
          ++src; ++src;
          goto case '\n';
        }
        value[dst++] = value[src]; // Copy.
      }
    value.length = dst; // Adjust length.
  }
  assert(value[$-1] == '\0');
  t.str = value;

  --inTokenString;
}
1761 } // version(D2) | |
1762 | |
/// Scans an escape sequence.
///
/// EscapeSequence := "\" (Octal{1,3} | ("x" Hex{2}) |
///                        ("u" Hex{4}) | ("U" Hex{8}) |
///                        "'" | "\"" | "\\" | "?" | "a" |
///                        "b" | "f" | "n" | "r" | "t" | "v")
///
/// Named HTML entities of the form "\&name;" are handled as well.
/// Params:
///   isBinary = set to true for octal and hexadecimal escapes.
/// Returns: the escape value, or REPLACEMENT_CHAR after an error
///          has been reported.
dchar scanEscapeSequence(ref bool isBinary)
out(result)
{ assert(isValidChar(result)); }
body
{
  assert(*p == '\\');

  auto sequenceStart = p; // Used for error reporting.

  ++p;
  // Simple one-character escapes are looked up directly.
  uint c = char2ev(*p);
  if (c)
  {
    ++p;
    return c;
  }

  uint digits = 2; // Number of hex digits expected after \x, \u or \U.

  switch (*p)
  {
  case 'x':
    isBinary = true;
  case_Unicode:
    assert(c == 0);
    assert(digits == 2 || digits == 4 || digits == 8);
    while (1)
    {
      ++p;
      if (ishexad(*p))
      {
        c *= 16;
        if (*p <= '9')
          c += *p - '0';
        else if (*p <= 'F')
          c += *p - 'A' + 10;
        else
          c += *p - 'a' + 10;

        if (--digits == 0)
        {
          ++p;
          if (isValidChar(c))
            return c; // Return valid escape value.

          error(sequenceStart, MID.InvalidUnicodeEscapeSequence,
                sequenceStart[0..p-sequenceStart]);
          break;
        }
        continue;
      }

      error(sequenceStart, MID.InsufficientHexDigits,
            sequenceStart[0..p-sequenceStart]);
      break;
    }
    break;
  case 'u':
    digits = 4;
    goto case_Unicode;
  case 'U':
    digits = 8;
    goto case_Unicode;
  default:
    if (isoctal(*p))
    {
      isBinary = true;
      assert(c == 0);
      c += *p - '0';
      ++p;
      if (!isoctal(*p))
        return c;
      c *= 8;
      c += *p - '0';
      ++p;
      if (!isoctal(*p))
        return c;
      c *= 8;
      c += *p - '0';
      ++p;
      if (c > 0xFF)
        error(sequenceStart, MSG.InvalidOctalEscapeSequence,
              sequenceStart[0..p-sequenceStart]);
      return c; // Return valid escape value.
    }
    else if(*p == '&')
    {
      if (isalpha(*++p))
      {
        auto begin = p;
        while (isalnum(*++p))
        {}

        if (*p == ';')
        {
          // Pass entity excluding '&' and ';'.
          c = entity2Unicode(begin[0..p - begin]);
          ++p; // Skip ;
          if (c != 0xFFFF)
            return c; // Return valid escape value.
          else
            error(sequenceStart, MID.UndefinedHTMLEntity, sequenceStart[0 .. p - sequenceStart]);
        }
        else
          error(sequenceStart, MID.UnterminatedHTMLEntity, sequenceStart[0 .. p - sequenceStart]);
      }
      else
        error(sequenceStart, MID.InvalidBeginHTMLEntity);
    }
    else if (isEndOfLine(p))
      error(sequenceStart, MID.UndefinedEscapeSequence,
        isEOF(*p) ? `\EOF` : `\NewLine`);
    else
    {
      char[] str = `\`;
      // The current character has to be tested here. c is always 0 in this
      // branch, so testing c would take the ASCII path for every input,
      // copy only the lead byte of a multi-byte character and leave p in
      // the middle of the sequence.
      if (isascii(*p))
        str ~= *p;
      else
        encodeUTF8(str, decodeUTF8());
      ++p;
      // TODO: check for unprintable character?
      error(sequenceStart, MID.UndefinedEscapeSequence, str);
    }
  }
  return REPLACEMENT_CHAR; // Error: return replacement character.
}
1898 | |
/// Scans a number literal.
///
/// $(PRE
/// IntegerLiteral := (Dec|Hex|Bin|Oct)Suffix?
/// Dec := (0|[1-9][0-9_]*)
/// Hex := 0[xX][_]*[0-9a-zA-Z][0-9a-zA-Z_]*
/// Bin := 0[bB][_]*[01][01_]*
/// Oct := 0[0-7_]*
/// Suffix := (L[uU]?|[uU]L?)
/// )
/// Invalid: "0b_", "0x_", "._" etc.
///
/// Scanning is handed over to scanReal() or scanHexReal() as soon as
/// the literal turns out to be a floating point number.
/// Params:
///   t = receives the kind, the value and the end pointer.
void scanNumber(ref Token t)
{
  ulong ulong_;   // The accumulated integer value.
  bool overflow;  // Set if a decimal or octal number exceeds ulong.max.
  bool isDecimal; // Decimal literals get different types than hex/bin/oct.
  size_t digits;  // Number of digits in a hex or binary literal.

  if (*p != '0')
    goto LscanInteger;
  ++p; // skip zero
  // check for xX bB ...
  switch (*p)
  {
  case 'x','X':
    goto LscanHex;
  case 'b','B':
    goto LscanBinary;
  case 'L':
    if (p[1] == 'i')
      goto LscanReal; // 0Li
    break; // 0L
  case '.':
    if (p[1] == '.')
      break; // 0..
    // 0.
  case 'i','f','F', // Imaginary and float literal suffixes.
       'e', 'E':    // Float exponent.
    goto LscanReal;
  default:
    if (*p == '_')
      goto LscanOctal; // 0_
    else if (isdigit(*p))
    {
      if (*p == '8' || *p == '9')
        goto Loctal_hasDecimalDigits; // 08 or 09
      else
        goto Loctal_enter_loop; // 0[0-7]
    }
  }

  // Number 0
  assert(p[-1] == '0');
  assert(*p != '_' && !isdigit(*p));
  assert(ulong_ == 0);
  isDecimal = true;
  goto Lfinalize;

LscanInteger:
  assert(*p != 0 && isdigit(*p));
  isDecimal = true;
  goto Lenter_loop_int;
  while (1)
  {
    if (*++p == '_')
      continue;
    if (!isdigit(*p))
      break;
  Lenter_loop_int:
    // ulong.max ends with the digit 5, hence the check against '5'.
    if (ulong_ < ulong.max/10 || (ulong_ == ulong.max/10 && *p <= '5'))
    {
      ulong_ *= 10;
      ulong_ += *p - '0';
      continue;
    }
    // Overflow: skip following digits.
    overflow = true;
    while (isdigit(*++p)) {}
    break;
  }

  // The number could be a float, so check overflow below.
  switch (*p)
  {
  case '.':
    if (p[1] != '.')
      goto LscanReal;
    break;
  case 'L':
    if (p[1] != 'i')
      break;
  case 'i', 'f', 'F', 'e', 'E':
    goto LscanReal;
  default:
  }

  if (overflow)
    error(t.start, MID.OverflowDecimalNumber);

  assert((isdigit(p[-1]) || p[-1] == '_') && !isdigit(*p) && *p != '_');
  goto Lfinalize;

LscanHex:
  assert(digits == 0);
  assert(*p == 'x' || *p == 'X');
  while (1)
  {
    if (*++p == '_')
      continue;
    if (!ishexad(*p))
      break;
    ++digits;
    ulong_ *= 16;
    if (*p <= '9')
      ulong_ += *p - '0';
    else if (*p <= 'F')
      ulong_ += *p - 'A' + 10;
    else
      ulong_ += *p - 'a' + 10;
  }

  assert(ishexad(p[-1]) || p[-1] == '_' || p[-1] == 'x' || p[-1] == 'X');
  assert(!ishexad(*p) && *p != '_');

  switch (*p)
  {
  case '.':
    if (p[1] == '.')
      break;
  case 'p', 'P':
    return scanHexReal(t);
  default:
  }

  // More than 16 hex digits do not fit into 64 bits.
  if (digits == 0 || digits > 16)
    error(t.start, digits == 0 ? MID.NoDigitsInHexNumber : MID.OverflowHexNumber);

  goto Lfinalize;

LscanBinary:
  assert(digits == 0);
  assert(*p == 'b' || *p == 'B');
  while (1)
  {
    if (*++p == '0')
    {
      ++digits;
      ulong_ *= 2;
    }
    else if (*p == '1')
    {
      ++digits;
      ulong_ *= 2;
      ulong_ += *p - '0';
    }
    else if (*p == '_')
      continue;
    else
      break;
  }

  if (digits == 0 || digits > 64)
    error(t.start, digits == 0 ? MID.NoDigitsInBinNumber : MID.OverflowBinaryNumber);

  assert(p[-1] == '0' || p[-1] == '1' || p[-1] == '_' || p[-1] == 'b' || p[-1] == 'B', p[-1] ~ "");
  assert( !(*p == '0' || *p == '1' || *p == '_') );
  goto Lfinalize;

LscanOctal:
  assert(*p == '_');
  while (1)
  {
    if (*++p == '_')
      continue;
    if (!isoctal(*p))
      break;
  Loctal_enter_loop:
    // The value is multiplied by 8 below, so it must not exceed ulong.max/8.
    // At exactly ulong.max/8 every octal digit still fits, because
    // (ulong.max/8)*8 + 7 == ulong.max.
    if (ulong_ < ulong.max/8 || (ulong_ == ulong.max/8 && *p <= '7'))
    {
      ulong_ *= 8;
      ulong_ += *p - '0';
      continue;
    }
    // Overflow: skip following digits.
    overflow = true;
    while (isoctal(*++p)) {}
    break;
  }

  bool hasDecimalDigits;
  if (isdigit(*p))
  {
  Loctal_hasDecimalDigits:
    hasDecimalDigits = true;
    while (isdigit(*++p)) {}
  }

  // The number could be a float, so check errors below.
  switch (*p)
  {
  case '.':
    if (p[1] != '.')
      goto LscanReal;
    break;
  case 'L':
    if (p[1] != 'i')
      break;
  case 'i', 'f', 'F', 'e', 'E':
    goto LscanReal;
  default:
  }

  if (hasDecimalDigits)
    error(t.start, MID.OctalNumberHasDecimals);

  if (overflow)
    error(t.start, MID.OverflowOctalNumber);
//   goto Lfinalize;

Lfinalize:
  enum Suffix
  {
    None     = 0,
    Unsigned = 1,
    Long     = 2
  }

  // Scan optional suffix: L, Lu, LU, u, uL, U or UL.
  Suffix suffix;
  while (1)
  {
    switch (*p)
    {
    case 'L':
      if (suffix & Suffix.Long)
        break; // A second 'L' is not part of this literal.
      suffix |= Suffix.Long;
      ++p;
      continue;
    case 'u', 'U':
      if (suffix & Suffix.Unsigned)
        break; // A second 'u' is not part of this literal.
      suffix |= Suffix.Unsigned;
      ++p;
      continue;
    default:
      break;
    }
    break;
  }

  // Determine type of Integer.
  switch (suffix)
  {
  case Suffix.None:
    if (ulong_ & 0x8000_0000_0000_0000)
    {
      if (isDecimal)
        error(t.start, MID.OverflowDecimalSign);
      t.kind = TOK.Uint64;
    }
    else if (ulong_ & 0xFFFF_FFFF_0000_0000)
      t.kind = TOK.Int64;
    else if (ulong_ & 0x8000_0000)
      t.kind = isDecimal ? TOK.Int64 : TOK.Uint32;
    else
      t.kind = TOK.Int32;
    break;
  case Suffix.Unsigned:
    if (ulong_ & 0xFFFF_FFFF_0000_0000)
      t.kind = TOK.Uint64;
    else
      t.kind = TOK.Uint32;
    break;
  case Suffix.Long:
    if (ulong_ & 0x8000_0000_0000_0000)
    {
      if (isDecimal)
        error(t.start, MID.OverflowDecimalSign);
      t.kind = TOK.Uint64;
    }
    else
      t.kind = TOK.Int64;
    break;
  case Suffix.Unsigned | Suffix.Long:
    t.kind = TOK.Uint64;
    break;
  default:
    assert(0);
  }
  t.ulong_ = ulong_;
  t.end = p;
  return;
LscanReal:
  scanReal(t);
  return;
}
2196 | |
/// Scans a floating point number literal.
///
/// $(PRE
/// FloatLiteral := Float[fFL]?i?
/// Float        := DecFloat | HexFloat
/// DecFloat     := ([0-9][0-9_]*[.][0-9_]*DecExponent?) |
///                 [.][0-9][0-9_]*DecExponent? | [0-9][0-9_]*DecExponent
/// DecExponent  := [eE][+-]?[0-9][0-9_]*
/// HexFloat     := 0[xX](HexDigits[.]HexDigits |
///                       [.][0-9a-zA-Z]HexDigits? |
///                       HexDigits)HexExponent
/// HexExponent  := [pP][+-]?[0-9][0-9_]*
/// )
///
/// The literal starts at t.start; p points to the '.', the exponent
/// or a float suffix. The suffix and the conversion are handled by
/// finalizeFloat().
void scanReal(ref Token t)
{
  if (*p == '.')
  {
    assert(p[1] != '.');
    // This function was called by scan() or scanNumber().
    // Skip the fraction digits.
    for (++p; isdigit(*p) || *p == '_'; ++p)
    {}
  }
  else
    // This function was called by scanNumber().
    assert(delegate ()
      {
        switch (*p)
        {
        case 'L':
          if (p[1] != 'i')
            return false;
        case 'i', 'f', 'F', 'e', 'E':
          return true;
        default:
        }
        return false;
      }()
    );

  // Scan exponent.
  if (*p == 'e' || *p == 'E')
  {
    ++p;
    if (*p == '-' || *p == '+')
      ++p; // Skip the sign.
    if (!isdigit(*p))
      error(t.start, MID.FloatExpMustStartWithDigit);
    else
      for (++p; isdigit(*p) || *p == '_'; ++p)
      {}
  }

  // Copy whole number and remove underscores from buffer.
  char[] number = t.start[0..p-t.start].dup;
  uint length;
  for (uint i; i < number.length; ++i)
    if (number[i] != '_')
      number[length++] = number[i];
  number.length = length; // Adjust length.
  number ~= 0; // Terminate for C functions.

  finalizeFloat(t, number);
}
2258 | |
/// Scans a hexadecimal floating point number literal.
///
/// p points to the '.' or to the exponent character ('p' or 'P').
/// A binary exponent is mandatory. If it is missing or malformed, an error
/// is reported and the token becomes a TOK.Float32 that ends at p.
void scanHexReal(ref Token t)
{
  assert(*p == '.' || *p == 'p' || *p == 'P');
  MID mid; // The error to report, if any.
  if (*p == '.')
    // Skip the hexadecimal fraction.
    for (++p; ishexad(*p) || *p == '_'; ++p)
    {}
  // Decimal exponent is required.
  if (*p == 'p' || *p == 'P')
  {
    ++p;
    if (*p == '+' || *p == '-')
      ++p; // Skip the sign.
    if (isdigit(*p))
    {
      for (++p; isdigit(*p) || *p == '_'; ++p)
      {}
      // Copy whole number and remove underscores from buffer.
      char[] number = t.start[0..p-t.start].dup;
      uint length;
      for (uint i; i < number.length; ++i)
        if (number[i] != '_')
          number[length++] = number[i];
      number.length = length; // Adjust length.
      number ~= 0; // Terminate for C functions.
      finalizeFloat(t, number);
      return;
    }
    mid = MID.HexFloatExpMustStartWithDigit;
  }
  else
    mid = MID.HexFloatExponentRequired;
  // Error.
  t.kind = TOK.Float32;
  t.end = p;
  error(t.start, mid);
}
2300 | |
/// Sets the value of the token.
///
/// Scans the optional suffixes ('f', 'F' or 'L', followed by an optional
/// 'i'), converts the number with the matching C function and reports
/// an error if the conversion signals ERANGE.
/// Params:
///   t = receives the value.
///   buffer = the well-formed float number.
void finalizeFloat(ref Token t, string buffer)
{
  assert(buffer[$-1] == 0);
  // The strto* functions only set errno on failure and never clear it.
  // Reset it so that an ERANGE left over from an earlier call is not
  // reported as an overflow of this literal.
  errno(0);
  // Float number is well-formed. Check suffixes and do conversion.
  switch (*p)
  {
  case 'f', 'F':
    t.kind = TOK.Float32;
    t.float_ = strtof(buffer.ptr, null);
    ++p;
    break;
  case 'L':
    t.kind = TOK.Float80;
    t.real_ = strtold(buffer.ptr, null);
    ++p;
    break;
  default:
    t.kind = TOK.Float64;
    t.double_ = strtod(buffer.ptr, null);
  }
  if (*p == 'i')
  {
    ++p;
    t.kind += 3; // Switch to imaginary counterpart.
    assert(t.kind == TOK.Imaginary32 ||
           t.kind == TOK.Imaginary64 ||
           t.kind == TOK.Imaginary80);
  }
  if (errno() == ERANGE)
    error(t.start, MID.OverflowFloatNumber);
  t.end = p;
}
2337 | |
/// Scans a special token sequence.
///
/// SpecialTokenSequence := "#line" Integer Filespec? EndOfLine
///
/// The integer and the filespec are stored as separate tokens in
/// t.tokLineNum and t.tokLineFilespec. The sequence is only evaluated
/// when the lexer is not inside a token string.
void scanSpecialTokenSequence(ref Token t)
{
  assert(*p == '#');
  t.kind = TOK.HashLine;
  t.setWhitespaceFlag();

  MID mid;                 // The error to report, if any.
  char* errorAtColumn = p; // Where the error is located.
  char* tokenEnd = ++p;    // Where the token ends; p is reset to this.

  if (p[0] != 'l' || p[1] != 'i' || p[2] != 'n' || p[3] != 'e')
  {
    mid = MID.ExpectedIdentifierSTLine;
    goto Lerr;
  }
  p += 3; // p points to the 'e' of "line" now.
  tokenEnd = p + 1;

  // TODO: #line58"path/file" is legal. Require spaces?
  //       State.Space could be used for that purpose.
  enum State
  { /+Space,+/ Integer, Filespec, End }

  State state = State.Integer;

  while (!isEndOfLine(++p))
  {
    if (isspace(*p))
      continue;
    if (state == State.Integer)
    {
      if (!isdigit(*p))
      {
        errorAtColumn = p;
        mid = MID.ExpectedIntegerAfterSTLine;
        goto Lerr;
      }
      auto numToken = new Token;
      t.tokLineNum = numToken;
      scan(*numToken);
      tokenEnd = p;
      if (numToken.kind != TOK.Int32 && numToken.kind != TOK.Uint32)
      {
        errorAtColumn = numToken.start;
        mid = MID.ExpectedIntegerAfterSTLine;
        goto Lerr;
      }
      --p; // Go one back because scan() advanced p past the integer.
      state = State.Filespec;
    }
    else if (state == State.Filespec && *p == '"')
    { // MID.ExpectedFilespec is deprecated; the filespec is optional.
      auto fileToken = new Token;
      t.tokLineFilespec = fileToken;
      fileToken.start = p;
      fileToken.kind = TOK.Filespec;
      fileToken.setWhitespaceFlag();
      while (*++p != '"')
      {
        if (isEndOfLine(p))
        {
          errorAtColumn = fileToken.start;
          mid = MID.UnterminatedFilespec;
          fileToken.end = p;
          tokenEnd = p;
          goto Lerr;
        }
        if (!isascii(*p))
          decodeUTF8(); // Skip the whole multi-byte character.
      }
      auto start = fileToken.start +1; // +1 skips '"'
      fileToken.str = start[0 .. p - start];
      fileToken.end = p + 1;
      tokenEnd = p + 1;
      state = State.End;
    }
    else/+ if (state == State.End)+/
    {
      mid = MID.UnterminatedSpecialToken;
      goto Lerr;
    }
  }
  assert(isEndOfLine(p));

  if (state == State.Integer)
  { // The line ended before an integer was found.
    errorAtColumn = p;
    mid = MID.ExpectedIntegerAfterSTLine;
    goto Lerr;
  }

  // Evaluate #line only when not in token string.
  if (!inTokenString && t.tokLineNum)
  {
    this.lineNum_hline = this.lineNum - t.tokLineNum.uint_ + 1;
    if (t.tokLineFilespec)
      newFilePath(t.tokLineFilespec.str);
  }
  p = tokenEnd;
  t.end = tokenEnd;

  return;
Lerr:
  p = tokenEnd;
  t.end = tokenEnd;
  error(errorAtColumn, mid);
}
2451 | |
/// Inserts an empty dummy token (TOK.Empty) before t.
///
/// Useful in the parsing phase for representing a node in the AST
/// that doesn't consume an actual token from the source text.
/// Returns: the new token. It starts and ends where the token
///          preceding t ends.
Token* insertEmptyTokenBefore(Token* t)
{
  assert(t !is null && t.prev !is null);
  assert(text.ptr <= t.start && t.start < end, Token.toString(t.kind));
  assert(text.ptr <= t.end && t.end <= end, Token.toString(t.kind));

  auto before = t.prev;
  auto empty = new Token;
  empty.kind = TOK.Empty;
  empty.start = before.end;
  empty.end = before.end;
  // Splice the new token in: before <-> empty <-> t
  empty.prev = before;
  empty.next = t;
  before.next = empty;
  t.prev = empty;
  return empty;
}
2473 | |
/// Returns the line number to be used in error reports, i.e.
/// lineNum reduced by the offset that was set by #line.
uint errorLineNumber(uint lineNum)
{
  auto offset = this.lineNum_hline;
  return lineNum - offset;
}
2479 | |
/// Reports an error located on the current line.
///
/// The current line number and line start are filled in and everything
/// is forwarded to error_().
/// Params:
///   columnPos = points to the character where the error is located.
///   msg = the message; formatted with the variadic arguments.
void error(char* columnPos, char[] msg, ...)
{
  auto currentLine = this.lineNum;
  auto currentLineBegin = this.lineBegin;
  error_(currentLine, currentLineBegin, columnPos, msg, _arguments, _argptr);
}
2485 | |
/// Reports an error located on the current line, using the message
/// that GetMsg() returns for a message ID.
/// Params:
///   columnPos = points to the character where the error is located.
///   mid = the ID of the message; formatted with the variadic arguments.
void error(char* columnPos, MID mid, ...)
{
  auto currentLine = this.lineNum;
  auto currentLineBegin = this.lineBegin;
  error_(currentLine, currentLineBegin, columnPos, GetMsg(mid), _arguments, _argptr);
}
2491 | |
/// Reports an error on an explicitly given line, using the message
/// that GetMsg() returns for a message ID.
/// Params:
///   lineNum = the line number.
///   lineBegin = points to the first character of that line.
///   columnPos = points to the character where the error is located.
///   mid = the ID of the message; formatted with the variadic arguments.
void error(uint lineNum, char* lineBegin, char* columnPos, MID mid, ...)
{
  auto msg = GetMsg(mid);
  error_(lineNum, lineBegin, columnPos, msg, _arguments, _argptr);
}
2497 | |
/// Builds a LexerError and records it.
///
/// The error is appended to this.errors and, if an InfoManager
/// is attached, to infoMan as well.
/// Params:
///   lineNum = the line number (adjusted via errorLineNumber()).
///   lineBegin = points to the first character of the current line.
///   columnPos = points to the character where the error is located.
///   msg = the message; used as the format string for the arguments.
///   _arguments = type information of the format arguments.
///   _argptr = pointer to the format arguments.
void error_(uint lineNum, char* lineBegin, char* columnPos, char[] msg,
            TypeInfo[] _arguments, Arg _argptr)
{
  auto reportedLine = this.errorLineNumber(lineNum);
  auto location = new Location(this.filePaths.setPath, reportedLine,
                               lineBegin, columnPos);
  auto report = new LexerError(location, Format(_arguments, _argptr, msg));
  errors ~= report;
  if (infoMan is null)
    return; // Nobody else to notify.
  infoMan ~= report;
}
2516 | |
/// Keeps requesting tokens until nextToken() reports TOK.EOF,
/// i.e. the whole source text has been scanned.
void scanAll()
{
  for (;;)
    if (nextToken() == TOK.EOF)
      break;
}
2523 | |
/// Returns the first token of the source text (this can be the EOF token).
///
/// The list starts with two tokens that are skipped here:
/// HEAD -> Newline -> First Token
Token* firstToken()
{
  auto leadingNewline = this.head.next;
  return leadingNewline.next;
}
2531 | |
/// Checks whether str is a valid D identifier.
///
/// An identifier is non-empty, doesn't start with a digit and consists
/// only of characters accepted by isident() or non-ASCII characters
/// accepted by isUniAlpha().
/// Returns: true if str is a valid identifier.
static bool isIdentifierString(char[] str)
{
  if (str.length == 0)
    return false;
  if (isdigit(str[0]))
    return false;

  size_t idx;
  do
  {
    auto c = dil.Unicode.decode(str, idx);
    if (c == ERROR_CHAR)
      return false; // Not decodable.
    if (isident(c))
      continue; // Ordinary identifier character.
    if (isascii(c) || !isUniAlpha(c))
      return false; // Neither an identifier char nor a Unicode alpha.
  } while (idx < str.length)
  return true;
}
2546 | |
/// Checks whether str is a keyword or a special token
/// (__FILE__, __LINE__ etc.)
///
/// Returns: true if str is a valid identifier that IdTable.inStatic()
/// finds with a kind other than TOK.Identifier.
static bool isReservedIdentifier(char[] str)
{
  if (!isIdentifierString(str))
    return false; // str is not a valid identifier.

  auto id = IdTable.inStatic(str);
  // Reserved only if it's in the table and not a normal identifier.
  return id !is null && id.kind != TOK.Identifier;
}
2559 | |
/// Returns true if the current character to be decoded is
/// a Unicode alpha character.
///
/// The UTF-8 sequence starting at 'p' is decoded on a local copy of
/// the pointer. No error is reported for malformed sequences; they
/// simply yield false.
///
/// The current pointer 'p' is not advanced if false is returned.
/// If true is returned, 'p' points to the last trail byte of the sequence.
bool isUnicodeAlpha()
{
  assert(!isascii(*p), "check for ASCII char before calling isUnicodeAlpha().");
  char* p = this.p;
  dchar d = *p;
  ++p; // Move to second byte.
  // Error if second byte is not a trail byte.
  if (!isTrailByte(*p))
    return false;
  // Check for overlong sequences.
  switch (d)
  {
  case 0xE0, // 11100000 100xxxxx
       0xF0, // 11110000 1000xxxx
       0xF8, // 11111000 10000xxx
       0xFC: // 11111100 100000xx
    if ((*p & d) == 0x80)
      return false;
    // Falls through to default; its test can't match these lead bytes.
  default:
    if ((d & 0xFE) == 0xC0) // 1100000x
      return false;
  }
  // Code snippets that are mixed in below.
  const char[] checkNextByte = "if (!isTrailByte(*++p))"
                               " return false;";
  const char[] appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";
  // Decode
  if ((d & 0b1110_0000) == 0b1100_0000)
  { // 110xxxxx 10xxxxxx
    d &= 0b0001_1111;
    mixin(appendSixBits);
  }
  else if ((d & 0b1111_0000) == 0b1110_0000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    d &= 0b0000_1111;
    mixin(appendSixBits ~
          checkNextByte ~ appendSixBits);
  }
  else if ((d & 0b1111_1000) == 0b1111_0000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    d &= 0b0000_0111;
    mixin(appendSixBits ~
          checkNextByte ~ appendSixBits ~
          checkNextByte ~ appendSixBits);
  }
  else
    // Longer sequences or a sequence starting with a trail byte.
    return false;

  assert(isTrailByte(*p));
  if (!isValidChar(d) || !isUniAlpha(d))
    return false;
  // Only advance pointer if this is a Unicode alpha character.
  this.p = p;
  return true;
}
2615 | |
/// Decodes the next UTF-8 sequence.
///
/// If the sequence is malformed, overlong or decodes to an invalid
/// character, an InvalidUTF8Sequence error listing the offending bytes
/// is reported and REPLACEMENT_CHAR is returned.
///
/// After a successful decode 'p' points to the last trail byte
/// of the sequence.
/// Returns: the decoded character or REPLACEMENT_CHAR.
dchar decodeUTF8()
{
  assert(!isascii(*p), "check for ASCII char before calling decodeUTF8().");
  char* p = this.p;
  dchar d = *p;

  ++p; // Move to second byte.
  // Error if second byte is not a trail byte.
  if (!isTrailByte(*p))
    goto Lerr2;

  // Check for overlong sequences.
  switch (d)
  {
  case 0xE0, // 11100000 100xxxxx
       0xF0, // 11110000 1000xxxx
       0xF8, // 11111000 10000xxx
       0xFC: // 11111100 100000xx
    if ((*p & d) == 0x80)
      goto Lerr;
    // Falls through to default; its test can't match these lead bytes.
  default:
    if ((d & 0xFE) == 0xC0) // 1100000x
      goto Lerr;
  }

  // Code snippets that are mixed in below.
  const char[] checkNextByte = "if (!isTrailByte(*++p))"
                               " goto Lerr2;";
  const char[] appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";

  // Decode
  if ((d & 0b1110_0000) == 0b1100_0000)
  { // 110xxxxx 10xxxxxx
    d &= 0b0001_1111;
    mixin(appendSixBits);
  }
  else if ((d & 0b1111_0000) == 0b1110_0000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    d &= 0b0000_1111;
    mixin(appendSixBits ~
          checkNextByte ~ appendSixBits);
  }
  else if ((d & 0b1111_1000) == 0b1111_0000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    d &= 0b0000_0111;
    mixin(appendSixBits ~
          checkNextByte ~ appendSixBits ~
          checkNextByte ~ appendSixBits);
  }
  else
    // 5 and 6 byte UTF-8 sequences are not allowed yet.
    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    goto Lerr;

  assert(isTrailByte(*p));

  if (!isValidChar(d))
  {
  Lerr:
    // Three cases:
    // *) the UTF-8 sequence was successfully decoded but the resulting
    //    character is invalid.
    //    p points to last trail byte in the sequence.
    // *) the UTF-8 sequence is overlong.
    //    p points to second byte in the sequence.
    // *) the UTF-8 sequence has more than 4 bytes or starts with
    //    a trail byte.
    //    p points to second byte in the sequence.
    assert(isTrailByte(*p));
    // Move to next ASCII character or lead byte of a UTF-8 sequence.
    while (p < (end-1) && isTrailByte(*p))
      ++p;
    --p;
    assert(!isTrailByte(p[1]));
    // p is on the last byte of the invalid sequence. formatBytes() treats
    // its second argument as exclusive, so pass p + 1 to include that byte.
    error(this.p, MID.InvalidUTF8Sequence, formatBytes(this.p, p + 1));
    goto Lreplace;
  Lerr2:
    // A trail byte was expected at p but not found.
    // The bytes before p form the incomplete sequence.
    // NOTE(review): on this path p is left on the byte following the
    // incomplete sequence, whereas every other path leaves it on the last
    // byte of the sequence. Confirm that callers don't skip that byte.
    error(this.p, MID.InvalidUTF8Sequence, formatBytes(this.p, p));
  Lreplace:
    d = REPLACEMENT_CHAR;
  }

  this.p = p;
  return d;
}
2699 | |
/// Encodes the character d as UTF-8 and appends the bytes to str.
///
/// Only 2, 3 and 4 byte sequences are produced; 5 and 6 byte
/// sequences are not supported and trigger assert(0).
/// Params:
///   str = the string to append to.
///   d = a valid, non-ASCII character.
static void encodeUTF8(ref char[] str, dchar d)
{
  assert(!isascii(d), "check for ASCII char before calling encodeUTF8().");
  assert(isValidChar(d), "check if character is valid before calling encodeUTF8().");

  char[4] buf = void;
  size_t len; // Number of bytes written to buf.
  if (d < 0x800)
  { // 110xxxxx 10xxxxxx
    buf[0] = 0xC0 | (d >> 6);
    buf[1] = 0x80 | (d & 0x3F);
    len = 2;
  }
  else if (d < 0x10000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    buf[0] = 0xE0 | (d >> 12);
    buf[1] = 0x80 | ((d >> 6) & 0x3F);
    buf[2] = 0x80 | (d & 0x3F);
    len = 3;
  }
  else if (d < 0x200000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    buf[0] = 0xF0 | (d >> 18);
    buf[1] = 0x80 | ((d >> 12) & 0x3F);
    buf[2] = 0x80 | ((d >> 6) & 0x3F);
    buf[3] = 0x80 | (d & 0x3F);
    len = 4;
  }
  else
    // There are no 5 and 6 byte UTF-8 sequences yet.
    assert(0);
  str ~= buf[0..len];
}
2752 | |
/// Formats the bytes between start and end as hexadecimal escapes.
///
/// Every byte is written as a backslash, an 'x' and exactly two uppercase
/// hex digits, so values below 0x10 are zero-padded (e.g. \x0A).
/// Params:
///   start = points to the first byte to be formatted.
///   end = points one past the last byte to be formatted.
/// Returns: e.g.: abc -> \x61\x62\x63
static char[] formatBytes(char* start, char* end)
{
  assert(start <= end);
  const char[] hexDigits = "0123456789ABCDEF";
  const formatLen = `\xXX`.length;
  auto strLen = end-start;
  // Every byte takes exactly formatLen characters.
  char[] result = new char[strLen*formatLen];
  char* dst = result.ptr;
  foreach (c; cast(ubyte[])start[0..strLen])
  {
    dst[0] = '\\';
    dst[1] = 'x';
    dst[2] = hexDigits[c >> 4];   // High nibble.
    dst[3] = hexDigits[c & 0x0F]; // Low nibble.
    dst += formatLen;
  }
  return result;
}
2765 | |
/// Looks for the first invalid UTF-8 sequence in str.
///
/// NOTE(review): presumably decode() advances p past a valid sequence and
/// leaves it in place when it returns ERROR_CHAR; verify in dil.Unicode.
/// Returns: a formatted string of the invalid sequence (e.g. \xC0\x80),
/// or an empty string if none was found.
static string findInvalidUTF8Sequence(string str)
{
  char* p = str.ptr;
  char* end = p + str.length;
  while (p < end)
  {
    if (decode(p, end) != ERROR_CHAR)
      continue;
    auto begin = p;
    // Step over the offending byte and all trail bytes that follow it.
    for (++p; p < end && isTrailByte(*p); ++p)
    {}
    return Lexer.formatBytes(begin, p);
  }
  assert(p == end);
  return "";
}
2785 } | |
2786 | |
/// Tests the lexer with a list of tokens.
///
/// The texts of all pairs are joined into one source string, which is
/// scanned and compared token by token with the table.
/// NOTE(review): only the text of each token is compared; Pair.kind is
/// used solely for deciding how the source string is joined.
unittest
{
  Stdout("Testing Lexer.\n");
  struct Pair
  {
    char[] tokenText;
    TOK kind;
  }
  static Pair[] pairs = [
    {"#!äöüß", TOK.Shebang},       {"\n", TOK.Newline},
    {"//çay", TOK.Comment},        {"\n", TOK.Newline},
                                   {"&", TOK.AndBinary},
    {"/*çağ*/", TOK.Comment},      {"&&", TOK.AndLogical},
    {"/+çak+/", TOK.Comment},      {"&=", TOK.AndAssign},
    {">", TOK.Greater},            {"+", TOK.Plus},
    {">=", TOK.GreaterEqual},      {"++", TOK.PlusPlus},
    {">>", TOK.RShift},            {"+=", TOK.PlusAssign},
    {">>=", TOK.RShiftAssign},     {"-", TOK.Minus},
    {">>>", TOK.URShift},          {"--", TOK.MinusMinus},
    {">>>=", TOK.URShiftAssign},   {"-=", TOK.MinusAssign},
    {"<", TOK.Less},               {"=", TOK.Assign},
    {"<=", TOK.LessEqual},         {"==", TOK.Equal},
    {"<>", TOK.LorG},              {"~", TOK.Tilde},
    {"<>=", TOK.LorEorG},          {"~=", TOK.CatAssign},
    {"<<", TOK.LShift},            {"*", TOK.Mul},
    {"<<=", TOK.LShiftAssign},     {"*=", TOK.MulAssign},
    {"!", TOK.Not},                {"/", TOK.Div},
    {"!=", TOK.NotEqual},          {"/=", TOK.DivAssign},
    {"!<", TOK.UorGorE},           {"^", TOK.Xor},
    {"!>", TOK.UorLorE},           {"^=", TOK.XorAssign},
    {"!<=", TOK.UorG},             {"%", TOK.Mod},
    {"!>=", TOK.UorL},             {"%=", TOK.ModAssign},
    {"!<>", TOK.UorE},             {"(", TOK.LParen},
    {"!<>=", TOK.Unordered},       {")", TOK.RParen},
    {".", TOK.Dot},                {"[", TOK.LBracket},
    {"..", TOK.Slice},             {"]", TOK.RBracket},
    {"...", TOK.Ellipses},         {"{", TOK.LBrace},
    {"|", TOK.OrBinary},           {"}", TOK.RBrace},
    {"||", TOK.OrLogical},         {":", TOK.Colon},
    {"|=", TOK.OrAssign},          {";", TOK.Semicolon},
    {"?", TOK.Question},           {",", TOK.Comma},
    {"$", TOK.Dollar},             {"cam", TOK.Identifier},
    {"çay", TOK.Identifier},       {".0", TOK.Float64},
    {"0", TOK.Int32},              {"\n", TOK.Newline},
    {"\r", TOK.Newline},           {"\r\n", TOK.Newline},
    {"\u2028", TOK.Newline},       {"\u2029", TOK.Newline}
  ];

  char[] src;

  // Build the source text. Tokens are separated by a space, except for
  // line comments and the shebang, which extend to the following newline.
  foreach (i, pair; pairs)
  {
    bool isLineComment = pair.kind == TOK.Comment && pair.tokenText[1] == '/';
    if (isLineComment || pair.kind == TOK.Shebang)
    {
      assert(pairs[i+1].kind == TOK.Newline); // Must be followed by a newline.
      src ~= pair.tokenText;
    }
    else
    {
      src ~= pair.tokenText;
      src ~= " ";
    }
  }

  auto lx = new Lexer(new SourceText("", src));
  auto listHead = lx.getTokens();

  // The list starts with the head token followed by a newline token.
  assert(listHead == lx.head);
  auto leadingNewline = listHead.next;
  assert(leadingNewline.kind == TOK.Newline);

  auto scanned = leadingNewline.next;
  uint idx;
  do
  {
    assert(idx < pairs.length);
    auto expectedText = pairs[idx].tokenText;
    assert(scanned.srcText == expectedText, Format("Scanned '{0}' but expected '{1}'", scanned.srcText, expectedText));
    idx++;
    scanned = scanned.next;
  } while (scanned.kind != TOK.EOF)
}
2864 | |
/// Tests the Lexer's peek() method.
///
/// Starting at the head token, peek() is called repeatedly and the kind
/// of the token it moves to is compared with the expected sequence.
unittest
{
  Stdout("Testing method Lexer.peek()\n");

  // A small source text.
  static TOK[] expectedKinds = [
    TOK.Newline, TOK.Unittest, TOK.LBrace, TOK.RBrace, TOK.EOF
  ];
  auto sourceText = new SourceText("", "unittest { }");
  auto lx = new Lexer(sourceText, null);
  auto next = lx.head;
  foreach (kind; expectedKinds)
  {
    lx.peek(next);
    assert(next.kind == kind);
  }

  // An empty source text.
  static TOK[] expectedKindsEmpty = [TOK.Newline, TOK.EOF];
  lx = new Lexer(new SourceText("", ""));
  next = lx.head;
  foreach (kind; expectedKindsEmpty)
  {
    lx.peek(next);
    assert(next.kind == kind);
  }
}
2891 | |
/// Placeholder for a test of number literals.
/// It contains no statements yet, only a list of literals.
unittest
{
  // Numbers unittest
  // TODO: nothing is tested here yet. The literals below look like
  // intended test inputs for the number scanner.
  // 0L 0ULi 0_L 0_UL 0x0U 0x0p2 0_Fi 0_e2 0_F 0_i
  // 0u 0U 0uL 0UL 0L 0LU 0Lu
  // 0Li 0f 0F 0fi 0Fi 0i
  // 0b_1_LU 0b1000u
  // 0x232Lu
}