comparison src/dil/lexer/Lexer.d @ 806:bcb74c9b895c

Moved out files in the trunk folder to the root.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sun, 09 Mar 2008 00:12:19 +0100
parents trunk/src/dil/lexer/Lexer.d@cb8040538772
children 49e32b5bc161
1 /++
2 Author: Aziz Köksal
3 License: GPL3
4 +/
5 module dil.lexer.Lexer;
6
7 import dil.lexer.Token;
8 import dil.lexer.Keywords;
9 import dil.lexer.Identifier;
10 import dil.lexer.IdTable;
11 import dil.Information;
12 import dil.Messages;
13 import dil.HtmlEntities;
14 import dil.CompilerInfo;
15 import dil.Unicode;
16 import dil.SourceText;
17 import dil.Time;
18 import common;
19
20 import tango.stdc.stdlib : strtof, strtod, strtold;
21 import tango.stdc.errno : errno, ERANGE;
22
23 public import dil.lexer.Funcs;
24
25 /// The Lexer analyzes the characters of a source text and
26 /// produces a doubly-linked list of tokens.
27 class Lexer
28 {
29 SourceText srcText; /// The source text.
30 char* p; /// Points to the current character in the source text.
31 char* end; /// Points one character past the end of the source text.
32
33 Token* head; /// The head of the doubly linked token list.
34 Token* tail; /// The tail of the linked list. Set in scan().
35 Token* token; /// Points to the current token in the token list.
36
37 // Members used for error messages:
38 InfoManager infoMan;
39 LexerError[] errors;
40 /// Always points to the first character of the current line.
41 char* lineBegin;
42 // Token* newline; /// Current newline token.
43 uint lineNum = 1; /// Current, actual source text line number.
44 uint lineNum_hline; /// Line number set by #line.
45 uint inTokenString; /// > 0 if inside q{ }
46 /// Holds the original file path and the modified one (by #line).
47 NewlineData.FilePaths* filePaths;
48
49 /// Construct a Lexer object.
50 /// Params:
51 /// srcText = the UTF-8 source code.
52 /// infoMan = used for collecting error messages.
53 this(SourceText srcText, InfoManager infoMan = null)
54 {
55 this.srcText = srcText;
56 this.infoMan = infoMan;
57
58 assert(text.length && text[$-1] == 0, "source text has no sentinel character");
59 this.p = text.ptr;
60 this.end = this.p + text.length;
61 this.lineBegin = this.p;
62
63 this.head = new Token;
64 this.head.kind = TOK.HEAD;
65 this.head.start = this.head.end = this.p;
66 this.token = this.head;
67 // Initialize this.filePaths.
68 newFilePath(this.srcText.filePath);
69 // Add a newline as the first token after the head.
70 auto newline = new Token;
71 newline.kind = TOK.Newline;
72 newline.setWhitespaceFlag();
73 newline.start = newline.end = this.p;
74 newline.newline.filePaths = this.filePaths;
75 newline.newline.oriLineNum = 1;
76 newline.newline.setLineNum = 0;
77 // Link in.
78 this.token.next = newline;
79 newline.prev = this.token;
80 this.token = newline;
81 // this.newline = newline;
82 scanShebang();
83 }
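// Editor's note: an illustrative usage sketch, not part of the original file.
// The way srcText is obtained is assumed here; the Lexer calls shown
// (scanAll(), firstToken(), nextToken()) are the ones declared further down.
// ---
// auto lexer = new Lexer(srcText); // srcText: a loaded SourceText instance
// lexer.scanAll(); // tokenize everything up to EOF
// for (auto tok = lexer.firstToken(); tok !is null; tok = tok.next)
// { /* inspect tok.kind and tok.srcText here */ }
// ---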
84
85 /// The destructor deletes the doubly-linked token list.
86 ~this()
87 {
88 auto token = head.next;
89 while (token !is null)
90 {
91 assert(token.kind == TOK.EOF ? token == tail && token.next is null : 1);
92 delete token.prev;
93 token = token.next;
94 }
95 delete tail;
96 }
97
98 char[] text()
99 {
100 return srcText.data;
101 }
102
103 /// The "shebang" may optionally appear once at the beginning of a file.
104 /// Regexp: #![^\EndOfLine]*
105 void scanShebang()
106 {
107 if (*p == '#' && p[1] == '!')
108 {
109 auto t = new Token;
110 t.kind = TOK.Shebang;
111 t.setWhitespaceFlag();
112 t.start = p;
113 ++p;
114 while (!isEndOfLine(++p))
115 isascii(*p) || decodeUTF8();
116 t.end = p;
117 this.token.next = t;
118 t.prev = this.token;
119 }
120 }
121
122 /// Sets the value of the special token.
123 void finalizeSpecialToken(ref Token t)
124 {
125 assert(t.srcText[0..2] == "__");
126 switch (t.kind)
127 {
128 case TOK.FILE:
129 t.str = this.filePaths.setPath;
130 break;
131 case TOK.LINE:
132 t.uint_ = this.errorLineNumber(this.lineNum);
133 break;
134 case TOK.DATE,
135 TOK.TIME,
136 TOK.TIMESTAMP:
137 auto time_str = Time.toString();
138 switch (t.kind)
139 {
140 case TOK.DATE:
141 time_str = Time.month_day(time_str) ~ ' ' ~ Time.year(time_str); break;
142 case TOK.TIME:
143 time_str = Time.time(time_str); break;
144 case TOK.TIMESTAMP:
145 break; // time_str is the timestamp.
146 default: assert(0);
147 }
148 time_str ~= '\0'; // Terminate with a zero.
149 t.str = time_str;
150 break;
151 case TOK.VENDOR:
152 t.str = VENDOR;
153 break;
154 case TOK.VERSION:
155 t.uint_ = VERSION_MAJOR*1000 + VERSION_MINOR;
156 break;
157 default:
158 assert(0);
159 }
160 }
161
162 /// Sets a new file path.
163 void newFilePath(char[] newPath)
164 {
165 auto paths = new NewlineData.FilePaths;
166 paths.oriPath = this.srcText.filePath;
167 paths.setPath = newPath;
168 this.filePaths = paths;
169 }
170
171 private void setLineBegin(char* p)
172 {
173 // Check that we can look behind one character.
174 assert((p-1) >= text.ptr && p < end);
175 // Check that previous character is a newline.
176 assert(isNewlineEnd(p - 1));
177 this.lineBegin = p;
178 }
179
180 /// Scans the next token in the source text.
181 ///
182 /// Creates a new token if t.next is null and appends it to the list.
183 private void scanNext(ref Token* t)
184 {
185 assert(t !is null);
186 if (t.next)
187 {
188 t = t.next;
189 // if (t.kind == TOK.Newline)
190 // this.newline = t;
191 }
192 else if (t != this.tail)
193 {
194 Token* new_t = new Token;
195 scan(*new_t);
196 new_t.prev = t;
197 t.next = new_t;
198 t = new_t;
199 }
200 }
201
202 /// Advance t one token forward.
203 void peek(ref Token* t)
204 {
205 scanNext(t);
206 }
207
208 /// Advance to the next token in the source text.
209 TOK nextToken()
210 {
211 scanNext(this.token);
212 return this.token.kind;
213 }
214
215 /// Returns true if p points to the last character of a Newline.
216 bool isNewlineEnd(char* p)
217 {
218 if (*p == '\n' || *p == '\r')
219 return true;
220 if (*p == LS[2] || *p == PS[2])
221 if ((p-2) >= text.ptr)
222 if (p[-1] == LS[1] && p[-2] == LS[0])
223 return true;
224 return false;
225 }
226
227 /// The main method which recognizes the characters that make up a token.
228 ///
229 /// Complicated tokens are scanned in separate methods.
230 public void scan(ref Token t)
231 in
232 {
233 assert(text.ptr <= p && p < end);
234 }
235 out
236 {
237 assert(text.ptr <= t.start && t.start < end, Token.toString(t.kind));
238 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.kind));
239 }
240 body
241 {
242 // Scan whitespace.
243 if (isspace(*p))
244 {
245 t.ws = p;
246 while (isspace(*++p))
247 {}
248 }
249
250 // Scan a token.
251 uint c = *p;
252 {
253 t.start = p;
254 // Newline.
255 switch (*p)
256 {
257 case '\r':
258 if (p[1] == '\n')
259 ++p;
260 case '\n':
261 assert(isNewlineEnd(p));
262 ++p;
263 ++lineNum;
264 setLineBegin(p);
265 // this.newline = &t;
266 t.kind = TOK.Newline;
267 t.setWhitespaceFlag();
268 t.newline.filePaths = this.filePaths;
269 t.newline.oriLineNum = lineNum;
270 t.newline.setLineNum = lineNum_hline;
271 t.end = p;
272 return;
273 default:
274 if (isUnicodeNewline(p))
275 {
276 ++p; ++p;
277 goto case '\n';
278 }
279 }
280 // Identifier or string literal.
281 if (isidbeg(c))
282 {
283 if (c == 'r' && p[1] == '"' && ++p)
284 return scanRawStringLiteral(t);
285 if (c == 'x' && p[1] == '"')
286 return scanHexStringLiteral(t);
287 version(D2)
288 {
289 if (c == 'q' && p[1] == '"')
290 return scanDelimitedStringLiteral(t);
291 if (c == 'q' && p[1] == '{')
292 return scanTokenStringLiteral(t);
293 }
294 // Scan identifier.
295 Lidentifier:
296 do
297 { c = *++p; }
298 while (isident(c) || !isascii(c) && isUnicodeAlpha())
299
300 t.end = p;
301
302 auto id = IdTable.lookup(t.srcText);
303 t.kind = id.kind;
304 t.ident = id;
305
306 if (t.kind == TOK.Identifier || t.isKeyword)
307 return;
308 else if (t.isSpecialToken)
309 finalizeSpecialToken(t);
310 else if (t.kind == TOK.EOF)
311 {
312 tail = &t;
313 assert(t.srcText == "__EOF__");
314 }
315 else
316 assert(0, "unexpected token type: " ~ Token.toString(t.kind));
317 return;
318 }
319
320 if (isdigit(c))
321 return scanNumber(t);
322
323 if (c == '/')
324 {
325 c = *++p;
326 switch(c)
327 {
328 case '=':
329 ++p;
330 t.kind = TOK.DivAssign;
331 t.end = p;
332 return;
333 case '+':
334 return scanNestedComment(t);
335 case '*':
336 return scanBlockComment(t);
337 case '/':
338 while (!isEndOfLine(++p))
339 isascii(*p) || decodeUTF8();
340 t.kind = TOK.Comment;
341 t.setWhitespaceFlag();
342 t.end = p;
343 return;
344 default:
345 t.kind = TOK.Div;
346 t.end = p;
347 return;
348 }
349 }
350
351 switch (c)
352 {
353 case '\'':
354 return scanCharacterLiteral(t);
355 case '`':
356 return scanRawStringLiteral(t);
357 case '"':
358 return scanNormalStringLiteral(t);
359 case '\\':
360 char[] buffer;
361 do
362 {
363 bool isBinary;
364 c = scanEscapeSequence(isBinary);
365 if (isascii(c) || isBinary)
366 buffer ~= c;
367 else
368 encodeUTF8(buffer, c);
369 } while (*p == '\\')
370 buffer ~= 0;
371 t.kind = TOK.String;
372 t.str = buffer;
373 t.end = p;
374 return;
375 case '>': /* > >= >> >>= >>> >>>= */
376 c = *++p;
377 switch (c)
378 {
379 case '=':
380 t.kind = TOK.GreaterEqual;
381 goto Lcommon;
382 case '>':
383 if (p[1] == '>')
384 {
385 ++p;
386 if (p[1] == '=')
387 { ++p;
388 t.kind = TOK.URShiftAssign;
389 }
390 else
391 t.kind = TOK.URShift;
392 }
393 else if (p[1] == '=')
394 {
395 ++p;
396 t.kind = TOK.RShiftAssign;
397 }
398 else
399 t.kind = TOK.RShift;
400 goto Lcommon;
401 default:
402 t.kind = TOK.Greater;
403 goto Lcommon2;
404 }
405 assert(0);
406 case '<': /* < <= <> <>= << <<= */
407 c = *++p;
408 switch (c)
409 {
410 case '=':
411 t.kind = TOK.LessEqual;
412 goto Lcommon;
413 case '<':
414 if (p[1] == '=') {
415 ++p;
416 t.kind = TOK.LShiftAssign;
417 }
418 else
419 t.kind = TOK.LShift;
420 goto Lcommon;
421 case '>':
422 if (p[1] == '=') {
423 ++p;
424 t.kind = TOK.LorEorG;
425 }
426 else
427 t.kind = TOK.LorG;
428 goto Lcommon;
429 default:
430 t.kind = TOK.Less;
431 goto Lcommon2;
432 }
433 assert(0);
434 case '!': /* ! !< !> !<= !>= !<> !<>= */
435 c = *++p;
436 switch (c)
437 {
438 case '<':
439 c = *++p;
440 if (c == '>')
441 {
442 if (p[1] == '=') {
443 ++p;
444 t.kind = TOK.Unordered;
445 }
446 else
447 t.kind = TOK.UorE;
448 }
449 else if (c == '=')
450 {
451 t.kind = TOK.UorG;
452 }
453 else {
454 t.kind = TOK.UorGorE;
455 goto Lcommon2;
456 }
457 goto Lcommon;
458 case '>':
459 if (p[1] == '=')
460 {
461 ++p;
462 t.kind = TOK.UorL;
463 }
464 else
465 t.kind = TOK.UorLorE;
466 goto Lcommon;
467 case '=':
468 t.kind = TOK.NotEqual;
469 goto Lcommon;
470 default:
471 t.kind = TOK.Not;
472 goto Lcommon2;
473 }
474 assert(0);
475 case '.': /* . .[0-9] .. ... */
476 if (p[1] == '.')
477 {
478 ++p;
479 if (p[1] == '.') {
480 ++p;
481 t.kind = TOK.Ellipses;
482 }
483 else
484 t.kind = TOK.Slice;
485 }
486 else if (isdigit(p[1]))
487 {
488 return scanReal(t);
489 }
490 else
491 t.kind = TOK.Dot;
492 goto Lcommon;
493 case '|': /* | || |= */
494 c = *++p;
495 if (c == '=')
496 t.kind = TOK.OrAssign;
497 else if (c == '|')
498 t.kind = TOK.OrLogical;
499 else {
500 t.kind = TOK.OrBinary;
501 goto Lcommon2;
502 }
503 goto Lcommon;
504 case '&': /* & && &= */
505 c = *++p;
506 if (c == '=')
507 t.kind = TOK.AndAssign;
508 else if (c == '&')
509 t.kind = TOK.AndLogical;
510 else {
511 t.kind = TOK.AndBinary;
512 goto Lcommon2;
513 }
514 goto Lcommon;
515 case '+': /* + ++ += */
516 c = *++p;
517 if (c == '=')
518 t.kind = TOK.PlusAssign;
519 else if (c == '+')
520 t.kind = TOK.PlusPlus;
521 else {
522 t.kind = TOK.Plus;
523 goto Lcommon2;
524 }
525 goto Lcommon;
526 case '-': /* - -- -= */
527 c = *++p;
528 if (c == '=')
529 t.kind = TOK.MinusAssign;
530 else if (c == '-')
531 t.kind = TOK.MinusMinus;
532 else {
533 t.kind = TOK.Minus;
534 goto Lcommon2;
535 }
536 goto Lcommon;
537 case '=': /* = == */
538 if (p[1] == '=') {
539 ++p;
540 t.kind = TOK.Equal;
541 }
542 else
543 t.kind = TOK.Assign;
544 goto Lcommon;
545 case '~': /* ~ ~= */
546 if (p[1] == '=') {
547 ++p;
548 t.kind = TOK.CatAssign;
549 }
550 else
551 t.kind = TOK.Tilde;
552 goto Lcommon;
553 case '*': /* * *= */
554 if (p[1] == '=') {
555 ++p;
556 t.kind = TOK.MulAssign;
557 }
558 else
559 t.kind = TOK.Mul;
560 goto Lcommon;
561 case '^': /* ^ ^= */
562 if (p[1] == '=') {
563 ++p;
564 t.kind = TOK.XorAssign;
565 }
566 else
567 t.kind = TOK.Xor;
568 goto Lcommon;
569 case '%': /* % %= */
570 if (p[1] == '=') {
571 ++p;
572 t.kind = TOK.ModAssign;
573 }
574 else
575 t.kind = TOK.Mod;
576 goto Lcommon;
577 // Single character tokens:
578 case '(':
579 t.kind = TOK.LParen;
580 goto Lcommon;
581 case ')':
582 t.kind = TOK.RParen;
583 goto Lcommon;
584 case '[':
585 t.kind = TOK.LBracket;
586 goto Lcommon;
587 case ']':
588 t.kind = TOK.RBracket;
589 goto Lcommon;
590 case '{':
591 t.kind = TOK.LBrace;
592 goto Lcommon;
593 case '}':
594 t.kind = TOK.RBrace;
595 goto Lcommon;
596 case ':':
597 t.kind = TOK.Colon;
598 goto Lcommon;
599 case ';':
600 t.kind = TOK.Semicolon;
601 goto Lcommon;
602 case '?':
603 t.kind = TOK.Question;
604 goto Lcommon;
605 case ',':
606 t.kind = TOK.Comma;
607 goto Lcommon;
608 case '$':
609 t.kind = TOK.Dollar;
610 Lcommon:
611 ++p;
612 Lcommon2:
613 t.end = p;
614 return;
615 case '#':
616 return scanSpecialTokenSequence(t);
617 default:
618 }
619
620 // Check for EOF
621 if (isEOF(c))
622 {
623 assert(isEOF(*p), ""~*p);
624 t.kind = TOK.EOF;
625 t.end = p;
626 tail = &t;
627 assert(t.start == t.end);
628 return;
629 }
630
631 if (!isascii(c))
632 {
633 c = decodeUTF8();
634 if (isUniAlpha(c))
635 goto Lidentifier;
636 }
637
638 error(t.start, MID.IllegalCharacter, cast(dchar)c);
639
640 ++p;
641 t.kind = TOK.Illegal;
642 t.setWhitespaceFlag();
643 t.dchar_ = c;
644 t.end = p;
645 return;
646 }
647 }
648
649 /// Converts a string literal to an integer.
650 template toUint(char[] T)
651 {
652 static assert(0 < T.length && T.length <= 4);
653 static if (T.length == 1)
654 const uint toUint = T[0];
655 else
656 const uint toUint = (T[0] << ((T.length-1)*8)) | toUint!(T[1..$]);
657 }
658 static assert(toUint!("\xAA\xBB\xCC\xDD") == 0xAABBCCDD);
659
660 /// Constructs case statements. E.g.:
661 /// ---
662 /// // case_!("<", "Less", "Lcommon") ->
663 /// case 60u:
664 /// t.kind = TOK.Less;
665 /// goto Lcommon;
666 /// ---
667 /// Note: Can't use this yet due to a $(DMDBUG 1534, bug) in DMD.
668 template case_(char[] str, char[] kind, char[] label)
669 {
670 const char[] case_ =
671 `case `~toUint!(str).stringof~`:`
672 `t.kind = TOK.`~kind~`;`
673 `goto `~label~`;`;
674 }
675 //pragma(msg, case_!("<", "Less", "Lcommon"));
676
677 template case_L4(char[] str, TOK kind)
678 {
679 const char[] case_L4 = case_!(str, kind, "Lcommon_4");
680 }
681
682 template case_L3(char[] str, TOK kind)
683 {
684 const char[] case_L3 = case_!(str, kind, "Lcommon_3");
685 }
686
687 template case_L2(char[] str, TOK kind)
688 {
689 const char[] case_L2 = case_!(str, kind, "Lcommon_2");
690 }
691
692 template case_L1(char[] str, TOK kind)
693 {
694 const char[] case_L1 = case_!(str, kind, "Lcommon");
695 }
696
697 /// An alternative scan method.
698 /// Profiling shows it's a bit slower.
699 public void scan_(ref Token t)
700 in
701 {
702 assert(text.ptr <= p && p < end);
703 }
704 out
705 {
706 assert(text.ptr <= t.start && t.start < end, Token.toString(t.kind));
707 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.kind));
708 }
709 body
710 {
711 // Scan whitespace.
712 if (isspace(*p))
713 {
714 t.ws = p;
715 while (isspace(*++p))
716 {}
717 }
718
719 // Scan a token.
720 t.start = p;
721 // Newline.
722 switch (*p)
723 {
724 case '\r':
725 if (p[1] == '\n')
726 ++p;
727 case '\n':
728 assert(isNewlineEnd(p));
729 ++p;
730 ++lineNum;
731 setLineBegin(p);
732 // this.newline = &t;
733 t.kind = TOK.Newline;
734 t.setWhitespaceFlag();
735 t.newline.filePaths = this.filePaths;
736 t.newline.oriLineNum = lineNum;
737 t.newline.setLineNum = lineNum_hline;
738 t.end = p;
739 return;
740 default:
741 if (isUnicodeNewline(p))
742 {
743 ++p; ++p;
744 goto case '\n';
745 }
746 }
747
748 uint c = *p;
749 assert(end - p != 0);
750 switch (end - p)
751 {
752 case 1:
753 goto L1character;
754 case 2:
755 c <<= 8; c |= p[1];
756 goto L2characters;
757 case 3:
758 c <<= 8; c |= p[1]; c <<= 8; c |= p[2];
759 goto L3characters;
760 default:
761 version(BigEndian)
762 c = *cast(uint*)p;
763 else
764 {
765 c <<= 8; c |= p[1]; c <<= 8; c |= p[2]; c <<= 8; c |= p[3];
766 /+
767 c = *cast(uint*)p;
768 asm
769 {
770 mov EDX, c;
771 bswap EDX;
772 mov c, EDX;
773 }
774 +/
775 }
776 }
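// Editor's note (added illustration): the switch above packs up to four source
// bytes into c in big-endian order, matching the toUint template defined
// earlier in this class, so whole operators can be matched with one integer test:
static assert(toUint!(">>>=") == 0x3E3E3E3D); // '>' == 0x3E, '=' == 0x3D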
777
778 // 4 character tokens.
779 switch (c)
780 {
781 case toUint!(">>>="):
782 t.kind = TOK.URShiftAssign;
783 goto Lcommon_4;
784 case toUint!("!<>="):
785 t.kind = TOK.Unordered;
786 Lcommon_4:
787 p += 4;
788 t.end = p;
789 return;
790 default:
791 }
792
793 c >>>= 8;
794 L3characters:
795 assert(p == t.start);
796 // 3 character tokens.
797 switch (c)
798 {
799 case toUint!(">>="):
800 t.kind = TOK.RShiftAssign;
801 goto Lcommon_3;
802 case toUint!(">>>"):
803 t.kind = TOK.URShift;
804 goto Lcommon_3;
805 case toUint!("<>="):
806 t.kind = TOK.LorEorG;
807 goto Lcommon_3;
808 case toUint!("<<="):
809 t.kind = TOK.LShiftAssign;
810 goto Lcommon_3;
811 case toUint!("!<="):
812 t.kind = TOK.UorG;
813 goto Lcommon_3;
814 case toUint!("!>="):
815 t.kind = TOK.UorL;
816 goto Lcommon_3;
817 case toUint!("!<>"):
818 t.kind = TOK.UorE;
819 goto Lcommon_3;
820 case toUint!("..."):
821 t.kind = TOK.Ellipses;
822 Lcommon_3:
823 p += 3;
824 t.end = p;
825 return;
826 default:
827 }
828
829 c >>>= 8;
830 L2characters:
831 assert(p == t.start);
832 // 2 character tokens.
833 switch (c)
834 {
835 case toUint!("/+"):
836 ++p; // Skip /
837 return scanNestedComment(t);
838 case toUint!("/*"):
839 ++p; // Skip /
840 return scanBlockComment(t);
841 case toUint!("//"):
842 ++p; // Skip /
843 assert(*p == '/');
844 while (!isEndOfLine(++p))
845 isascii(*p) || decodeUTF8();
846 t.kind = TOK.Comment;
847 t.setWhitespaceFlag();
848 t.end = p;
849 return;
850 case toUint!(">="):
851 t.kind = TOK.GreaterEqual;
852 goto Lcommon_2;
853 case toUint!(">>"):
854 t.kind = TOK.RShift;
855 goto Lcommon_2;
856 case toUint!("<<"):
857 t.kind = TOK.LShift;
858 goto Lcommon_2;
859 case toUint!("<="):
860 t.kind = TOK.LessEqual;
861 goto Lcommon_2;
862 case toUint!("<>"):
863 t.kind = TOK.LorG;
864 goto Lcommon_2;
865 case toUint!("!<"):
866 t.kind = TOK.UorGorE;
867 goto Lcommon_2;
868 case toUint!("!>"):
869 t.kind = TOK.UorLorE;
870 goto Lcommon_2;
871 case toUint!("!="):
872 t.kind = TOK.NotEqual;
873 goto Lcommon_2;
874 case toUint!(".."):
875 t.kind = TOK.Slice;
876 goto Lcommon_2;
877 case toUint!("&&"):
878 t.kind = TOK.AndLogical;
879 goto Lcommon_2;
880 case toUint!("&="):
881 t.kind = TOK.AndAssign;
882 goto Lcommon_2;
883 case toUint!("||"):
884 t.kind = TOK.OrLogical;
885 goto Lcommon_2;
886 case toUint!("|="):
887 t.kind = TOK.OrAssign;
888 goto Lcommon_2;
889 case toUint!("++"):
890 t.kind = TOK.PlusPlus;
891 goto Lcommon_2;
892 case toUint!("+="):
893 t.kind = TOK.PlusAssign;
894 goto Lcommon_2;
895 case toUint!("--"):
896 t.kind = TOK.MinusMinus;
897 goto Lcommon_2;
898 case toUint!("-="):
899 t.kind = TOK.MinusAssign;
900 goto Lcommon_2;
901 case toUint!("=="):
902 t.kind = TOK.Equal;
903 goto Lcommon_2;
904 case toUint!("~="):
905 t.kind = TOK.CatAssign;
906 goto Lcommon_2;
907 case toUint!("*="):
908 t.kind = TOK.MulAssign;
909 goto Lcommon_2;
910 case toUint!("/="):
911 t.kind = TOK.DivAssign;
912 goto Lcommon_2;
913 case toUint!("^="):
914 t.kind = TOK.XorAssign;
915 goto Lcommon_2;
916 case toUint!("%="):
917 t.kind = TOK.ModAssign;
918 Lcommon_2:
919 p += 2;
920 t.end = p;
921 return;
922 default:
923 }
924
925 c >>>= 8;
926 L1character:
927 assert(p == t.start);
928 assert(*p == c, Format("p={0},c={1}", *p, cast(dchar)c));
929 // 1 character tokens.
930 // TODO: consider storing the token type in ptable.
931 switch (c)
932 {
933 case '\'':
934 return scanCharacterLiteral(t);
935 case '`':
936 return scanRawStringLiteral(t);
937 case '"':
938 return scanNormalStringLiteral(t);
939 case '\\':
940 char[] buffer;
941 do
942 {
943 bool isBinary;
944 c = scanEscapeSequence(isBinary);
945 if (isascii(c) || isBinary)
946 buffer ~= c;
947 else
948 encodeUTF8(buffer, c);
949 } while (*p == '\\')
950 buffer ~= 0;
951 t.kind = TOK.String;
952 t.str = buffer;
953 t.end = p;
954 return;
955 case '<':
956 t.kind = TOK.Less;
957 goto Lcommon;
958 case '>':
959 t.kind = TOK.Greater;
960 goto Lcommon;
961 case '^':
962 t.kind = TOK.Xor;
963 goto Lcommon;
964 case '!':
965 t.kind = TOK.Not;
966 goto Lcommon;
967 case '.':
968 if (isdigit(p[1]))
969 return scanReal(t);
970 t.kind = TOK.Dot;
971 goto Lcommon;
972 case '&':
973 t.kind = TOK.AndBinary;
974 goto Lcommon;
975 case '|':
976 t.kind = TOK.OrBinary;
977 goto Lcommon;
978 case '+':
979 t.kind = TOK.Plus;
980 goto Lcommon;
981 case '-':
982 t.kind = TOK.Minus;
983 goto Lcommon;
984 case '=':
985 t.kind = TOK.Assign;
986 goto Lcommon;
987 case '~':
988 t.kind = TOK.Tilde;
989 goto Lcommon;
990 case '*':
991 t.kind = TOK.Mul;
992 goto Lcommon;
993 case '/':
994 t.kind = TOK.Div;
995 goto Lcommon;
996 case '%':
997 t.kind = TOK.Mod;
998 goto Lcommon;
999 case '(':
1000 t.kind = TOK.LParen;
1001 goto Lcommon;
1002 case ')':
1003 t.kind = TOK.RParen;
1004 goto Lcommon;
1005 case '[':
1006 t.kind = TOK.LBracket;
1007 goto Lcommon;
1008 case ']':
1009 t.kind = TOK.RBracket;
1010 goto Lcommon;
1011 case '{':
1012 t.kind = TOK.LBrace;
1013 goto Lcommon;
1014 case '}':
1015 t.kind = TOK.RBrace;
1016 goto Lcommon;
1017 case ':':
1018 t.kind = TOK.Colon;
1019 goto Lcommon;
1020 case ';':
1021 t.kind = TOK.Semicolon;
1022 goto Lcommon;
1023 case '?':
1024 t.kind = TOK.Question;
1025 goto Lcommon;
1026 case ',':
1027 t.kind = TOK.Comma;
1028 goto Lcommon;
1029 case '$':
1030 t.kind = TOK.Dollar;
1031 Lcommon:
1032 ++p;
1033 t.end = p;
1034 return;
1035 case '#':
1036 return scanSpecialTokenSequence(t);
1037 default:
1038 }
1039
1040 assert(p == t.start);
1041 assert(*p == c);
1042
1043 // TODO: consider moving isidbeg() and isdigit() up.
1044 if (isidbeg(c))
1045 {
1046 if (c == 'r' && p[1] == '"' && ++p)
1047 return scanRawStringLiteral(t);
1048 if (c == 'x' && p[1] == '"')
1049 return scanHexStringLiteral(t);
1050 version(D2)
1051 {
1052 if (c == 'q' && p[1] == '"')
1053 return scanDelimitedStringLiteral(t);
1054 if (c == 'q' && p[1] == '{')
1055 return scanTokenStringLiteral(t);
1056 }
1057 // Scan identifier.
1058 Lidentifier:
1059 do
1060 { c = *++p; }
1061 while (isident(c) || !isascii(c) && isUnicodeAlpha())
1062
1063 t.end = p;
1064
1065 auto id = IdTable.lookup(t.srcText);
1066 t.kind = id.kind;
1067 t.ident = id;
1068
1069 if (t.kind == TOK.Identifier || t.isKeyword)
1070 return;
1071 else if (t.isSpecialToken)
1072 finalizeSpecialToken(t);
1073 else if (t.kind == TOK.EOF)
1074 {
1075 tail = &t;
1076 assert(t.srcText == "__EOF__");
1077 }
1078 else
1079 assert(0, "unexpected token type: " ~ Token.toString(t.kind));
1080 return;
1081 }
1082
1083 if (isdigit(c))
1084 return scanNumber(t);
1085
1086 // Check for EOF
1087 if (isEOF(c))
1088 {
1089 assert(isEOF(*p), *p~"");
1090 t.kind = TOK.EOF;
1091 t.end = p;
1092 tail = &t;
1093 assert(t.start == t.end);
1094 return;
1095 }
1096
1097 if (!isascii(c))
1098 {
1099 c = decodeUTF8();
1100 if (isUniAlpha(c))
1101 goto Lidentifier;
1102 }
1103
1104 error(t.start, MID.IllegalCharacter, cast(dchar)c);
1105
1106 ++p;
1107 t.kind = TOK.Illegal;
1108 t.setWhitespaceFlag();
1109 t.dchar_ = c;
1110 t.end = p;
1111 return;
1112 }
1113
1114 /// Scans a block comment.
1115 ///
1116 /// BlockComment := "/*" AnyChar* "*/"
1117 void scanBlockComment(ref Token t)
1118 {
1119 assert(p[-1] == '/' && *p == '*');
1120 auto tokenLineNum = lineNum;
1121 auto tokenLineBegin = lineBegin;
1122 Loop:
1123 while (1)
1124 {
1125 switch (*++p)
1126 {
1127 case '*':
1128 if (p[1] != '/')
1129 continue;
1130 p += 2;
1131 break Loop;
1132 case '\r':
1133 if (p[1] == '\n')
1134 ++p;
1135 case '\n':
1136 assert(isNewlineEnd(p));
1137 ++lineNum;
1138 setLineBegin(p+1);
1139 break;
1140 default:
1141 if (!isascii(*p))
1142 {
1143 if (isUnicodeNewlineChar(decodeUTF8()))
1144 goto case '\n';
1145 }
1146 else if (isEOF(*p))
1147 {
1148 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedBlockComment);
1149 break Loop;
1150 }
1151 }
1152 }
1153 t.kind = TOK.Comment;
1154 t.setWhitespaceFlag();
1155 t.end = p;
1156 return;
1157 }
1158
1159 /// Scans a nested comment.
1160 ///
1161 /// NestedComment := "/+" (AnyChar* | NestedComment) "+/"
1162 void scanNestedComment(ref Token t)
1163 {
1164 assert(p[-1] == '/' && *p == '+');
1165 auto tokenLineNum = lineNum;
1166 auto tokenLineBegin = lineBegin;
1167 uint level = 1;
1168 Loop:
1169 while (1)
1170 {
1171 switch (*++p)
1172 {
1173 case '/':
1174 if (p[1] == '+')
1175 ++p, ++level;
1176 continue;
1177 case '+':
1178 if (p[1] != '/')
1179 continue;
1180 ++p;
1181 if (--level != 0)
1182 continue;
1183 ++p;
1184 break Loop;
1185 case '\r':
1186 if (p[1] == '\n')
1187 ++p;
1188 case '\n':
1189 assert(isNewlineEnd(p));
1190 ++lineNum;
1191 setLineBegin(p+1);
1192 continue;
1193 default:
1194 if (!isascii(*p))
1195 {
1196 if (isUnicodeNewlineChar(decodeUTF8()))
1197 goto case '\n';
1198 }
1199 else if (isEOF(*p))
1200 {
1201 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedNestedComment);
1202 break Loop;
1203 }
1204 }
1205 }
1206 t.kind = TOK.Comment;
1207 t.setWhitespaceFlag();
1208 t.end = p;
1209 return;
1210 }
1211
1212 /// Scans the postfix character of a string literal.
1213 ///
1214 /// PostfixChar := "c" | "w" | "d"
1215 char scanPostfix()
1216 {
1217 assert(p[-1] == '"' || p[-1] == '`' ||
1218 { version(D2) return p[-1] == '}';
1219 else return 0; }()
1220 );
1221 switch (*p)
1222 {
1223 case 'c':
1224 case 'w':
1225 case 'd':
1226 return *p++;
1227 default:
1228 return 0;
1229 }
1230 assert(0);
1231 }
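// Editor's note (added illustration, not in the original): the postfix picks
// the string element type at the use site, e.g.
// ---
// "abc"c // char[] literal; scanPostfix() returns 'c'
// "abc"w // wchar[] literal; returns 'w'
// "abc"d // dchar[] literal; returns 'd'
// "abc" // no postfix; returns 0
// ---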
1232
1233 /// Scans a normal string literal.
1234 ///
1235 /// NormalStringLiteral := "\"" Char* "\""
1236 void scanNormalStringLiteral(ref Token t)
1237 {
1238 assert(*p == '"');
1239 auto tokenLineNum = lineNum;
1240 auto tokenLineBegin = lineBegin;
1241 t.kind = TOK.String;
1242 char[] buffer;
1243 uint c;
1244 while (1)
1245 {
1246 c = *++p;
1247 switch (c)
1248 {
1249 case '"':
1250 ++p;
1251 t.pf = scanPostfix();
1252 Lreturn:
1253 t.str = buffer ~ '\0';
1254 t.end = p;
1255 return;
1256 case '\\':
1257 bool isBinary;
1258 c = scanEscapeSequence(isBinary);
1259 --p;
1260 if (isascii(c) || isBinary)
1261 buffer ~= c;
1262 else
1263 encodeUTF8(buffer, c);
1264 continue;
1265 case '\r':
1266 if (p[1] == '\n')
1267 ++p;
1268 case '\n':
1269 assert(isNewlineEnd(p));
1270 c = '\n'; // Convert Newline to \n.
1271 ++lineNum;
1272 setLineBegin(p+1);
1273 break;
1274 case 0, _Z_:
1275 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedString);
1276 goto Lreturn;
1277 default:
1278 if (!isascii(c))
1279 {
1280 c = decodeUTF8();
1281 if (isUnicodeNewlineChar(c))
1282 goto case '\n';
1283 encodeUTF8(buffer, c);
1284 continue;
1285 }
1286 }
1287 assert(isascii(c));
1288 buffer ~= c;
1289 }
1290 assert(0);
1291 }
1292
1293 /// Scans a character literal.
1294 ///
1295 /// CharLiteral := "'" Char "'"
1296 void scanCharacterLiteral(ref Token t)
1297 {
1298 assert(*p == '\'');
1299 ++p;
1300 t.kind = TOK.CharLiteral;
1301 switch (*p)
1302 {
1303 case '\\':
1304 bool notused;
1305 t.dchar_ = scanEscapeSequence(notused);
1306 break;
1307 case '\'':
1308 error(t.start, MID.EmptyCharacterLiteral);
1309 break;
1310 default:
1311 if (isEndOfLine(p))
1312 break;
1313 uint c = *p;
1314 if (!isascii(c))
1315 c = decodeUTF8();
1316 t.dchar_ = c;
1317 ++p;
1318 }
1319
1320 if (*p == '\'')
1321 ++p;
1322 else
1323 error(t.start, MID.UnterminatedCharacterLiteral);
1324 t.end = p;
1325 }
1326
1327 /// Scans a raw string literal.
1328 ///
1329 /// RawStringLiteral := "r\"" AnyChar* "\"" | "`" AnyChar* "`"
1330 void scanRawStringLiteral(ref Token t)
1331 {
1332 assert(*p == '`' || *p == '"' && p[-1] == 'r');
1333 auto tokenLineNum = lineNum;
1334 auto tokenLineBegin = lineBegin;
1335 t.kind = TOK.String;
1336 uint delim = *p;
1337 char[] buffer;
1338 uint c;
1339 while (1)
1340 {
1341 c = *++p;
1342 switch (c)
1343 {
1344 case '\r':
1345 if (p[1] == '\n')
1346 ++p;
1347 case '\n':
1348 assert(isNewlineEnd(p));
1349 c = '\n'; // Convert Newline to '\n'.
1350 ++lineNum;
1351 setLineBegin(p+1);
1352 break;
1353 case '`':
1354 case '"':
1355 if (c == delim)
1356 {
1357 ++p;
1358 t.pf = scanPostfix();
1359 Lreturn:
1360 t.str = buffer ~ '\0';
1361 t.end = p;
1362 return;
1363 }
1364 break;
1365 case 0, _Z_:
1366 error(tokenLineNum, tokenLineBegin, t.start,
1367 delim == 'r' ? MID.UnterminatedRawString : MID.UnterminatedBackQuoteString);
1368 goto Lreturn;
1369 default:
1370 if (!isascii(c))
1371 {
1372 c = decodeUTF8();
1373 if (isUnicodeNewlineChar(c))
1374 goto case '\n';
1375 encodeUTF8(buffer, c);
1376 continue;
1377 }
1378 }
1379 assert(isascii(c));
1380 buffer ~= c;
1381 }
1382 assert(0);
1383 }
1384
1385 /// Scans a hexadecimal string literal.
1386 ///
1387 /// HexStringLiteral := "x\"" (HexChar HexChar)* "\""
1388 void scanHexStringLiteral(ref Token t)
1389 {
1390 assert(p[0] == 'x' && p[1] == '"');
1391 t.kind = TOK.String;
1392
1393 auto tokenLineNum = lineNum;
1394 auto tokenLineBegin = lineBegin;
1395
1396 uint c;
1397 ubyte[] buffer;
1398 ubyte h; // hex number
1399 uint n; // number of hex digits
1400
1401 ++p;
1402 assert(*p == '"');
1403 while (1)
1404 {
1405 c = *++p;
1406 switch (c)
1407 {
1408 case '"':
1409 if (n & 1)
1410 error(tokenLineNum, tokenLineBegin, t.start, MID.OddNumberOfDigitsInHexString);
1411 ++p;
1412 t.pf = scanPostfix();
1413 Lreturn:
1414 t.str = cast(string) (buffer ~= 0);
1415 t.end = p;
1416 return;
1417 case '\r':
1418 if (p[1] == '\n')
1419 ++p;
1420 case '\n':
1421 assert(isNewlineEnd(p));
1422 ++lineNum;
1423 setLineBegin(p+1);
1424 continue;
1425 default:
1426 if (ishexad(c))
1427 {
1428 if (c <= '9')
1429 c -= '0';
1430 else if (c <= 'F')
1431 c -= 'A' - 10;
1432 else
1433 c -= 'a' - 10;
1434
1435 if (n & 1)
1436 {
1437 h <<= 4;
1438 h |= c;
1439 buffer ~= h;
1440 }
1441 else
1442 h = cast(ubyte)c;
1443 ++n;
1444 continue;
1445 }
1446 else if (isspace(c))
1447 continue; // Skip spaces.
1448 else if (isEOF(c))
1449 {
1450 error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedHexString);
1451 t.pf = 0;
1452 goto Lreturn;
1453 }
1454 else
1455 {
1456 auto errorAt = p;
1457 if (!isascii(c))
1458 {
1459 c = decodeUTF8();
1460 if (isUnicodeNewlineChar(c))
1461 goto case '\n';
1462 }
1463 error(errorAt, MID.NonHexCharInHexString, cast(dchar)c);
1464 }
1465 }
1466 }
1467 assert(0);
1468 }
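// Editor's note (added illustration, not in the original): hex string literals
// pair hexadecimal digits into raw bytes and ignore whitespace, e.g.
// ---
// x"0A 1B2C" // t.str holds the bytes 0x0A, 0x1B, 0x2C
// x"123" // error: odd number of digits
// ---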
1469
1470 version(DDoc)
1471 {
1472 /// Scans a delimited string literal.
1473 void scanDelimitedStringLiteral(ref Token t);
1474 /// Scans a token string literal.
1475 ///
1476 /// TokenStringLiteral := "q{" Token* "}"
1477 void scanTokenStringLiteral(ref Token t);
1478 }
1479 else
1480 version(D2)
1481 {
1482 void scanDelimitedStringLiteral(ref Token t)
1483 {
1484 assert(p[0] == 'q' && p[1] == '"');
1485 t.kind = TOK.String;
1486
1487 auto tokenLineNum = lineNum;
1488 auto tokenLineBegin = lineBegin;
1489
1490 char[] buffer;
1491 dchar opening_delim = 0, // 0 if no nested delimiter or '[', '(', '<', '{'
1492 closing_delim; // Will be ']', ')', '>', '}',
1493 // the first character of an identifier or
1494 // any other Unicode/ASCII character.
1495 char[] str_delim; // Identifier delimiter.
1496 uint level = 1; // Counter for nestable delimiters.
1497
1498 ++p; ++p; // Skip q"
1499 uint c = *p;
1500 switch (c)
1501 {
1502 case '(':
1503 opening_delim = c;
1504 closing_delim = ')'; // c + 1
1505 break;
1506 case '[', '<', '{':
1507 opening_delim = c;
1508 closing_delim = c + 2; // Get to closing counterpart. Feature of ASCII table.
1509 break;
1510 default:
1511 dchar scanNewline()
1512 {
1513 switch (*p)
1514 {
1515 case '\r':
1516 if (p[1] == '\n')
1517 ++p;
1518 case '\n':
1519 assert(isNewlineEnd(p));
1520 ++p;
1521 ++lineNum;
1522 setLineBegin(p);
1523 return '\n';
1524 default:
1525 if (isUnicodeNewline(p))
1526 {
1527 ++p; ++p;
1528 goto case '\n';
1529 }
1530 }
1531 return 0;
1532 }
1533 // Skip leading newlines:
1534 while (scanNewline() != 0)
1535 {}
1536 assert(!isNewline(p));
1537
1538 char* begin = p;
1539 c = *p;
1540 closing_delim = c;
1541 // TODO: Check for non-printable characters?
1542 if (!isascii(c))
1543 {
1544 closing_delim = decodeUTF8();
1545 if (!isUniAlpha(closing_delim))
1546 break; // Not an identifier.
1547 }
1548 else if (!isidbeg(c))
1549 break; // Not an identifier.
1550
1551 // Parse Identifier + EndOfLine
1552 do
1553 { c = *++p; }
1554 while (isident(c) || !isascii(c) && isUnicodeAlpha())
1555 // Store identifier
1556 str_delim = begin[0..p-begin];
1557 // Scan newline
1558 if (scanNewline() == '\n')
1559 --p; // Go back one because of "c = *++p;" in main loop.
1560 else
1561 {
1562 // TODO: error(p, MID.ExpectedNewlineAfterIdentDelim);
1563 }
1564 }
1565
1566 bool checkStringDelim(char* p)
1567 {
1568 assert(str_delim.length != 0);
1569 if (buffer[$-1] == '\n' && // Last character copied to buffer must be '\n'.
1570 end-p >= str_delim.length && // Check remaining length.
1571 p[0..str_delim.length] == str_delim) // Compare.
1572 return true;
1573 return false;
1574 }
1575
1576 while (1)
1577 {
1578 c = *++p;
1579 switch (c)
1580 {
1581 case '\r':
1582 if (p[1] == '\n')
1583 ++p;
1584 case '\n':
1585 assert(isNewlineEnd(p));
1586 c = '\n'; // Convert Newline to '\n'.
1587 ++lineNum;
1588 setLineBegin(p+1);
1589 break;
1590 case 0, _Z_:
1591 // TODO: error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedDelimitedString);
1592 goto Lreturn3;
1593 default:
1594 if (!isascii(c))
1595 {
1596 auto begin = p;
1597 c = decodeUTF8();
1598 if (isUnicodeNewlineChar(c))
1599 goto case '\n';
1600 if (c == closing_delim)
1601 {
1602 if (str_delim.length)
1603 {
1604 if (checkStringDelim(begin))
1605 {
1606 p = begin + str_delim.length;
1607 goto Lreturn2;
1608 }
1609 }
1610 else
1611 {
1612 assert(level == 1);
1613 --level;
1614 goto Lreturn;
1615 }
1616 }
1617 encodeUTF8(buffer, c);
1618 continue;
1619 }
1620 else
1621 {
1622 if (c == opening_delim)
1623 ++level;
1624 else if (c == closing_delim)
1625 {
1626 if (str_delim.length)
1627 {
1628 if (checkStringDelim(p))
1629 {
1630 p += str_delim.length;
1631 goto Lreturn2;
1632 }
1633 }
1634 else if (--level == 0)
1635 goto Lreturn;
1636 }
1637 }
1638 }
1639 assert(isascii(c));
1640 buffer ~= c;
1641 }
1642 Lreturn: // Character delimiter.
1643 assert(c == closing_delim);
1644 assert(level == 0);
1645 ++p; // Skip closing delimiter.
1646 Lreturn2: // String delimiter.
1647 if (*p == '"')
1648 ++p;
1649 else
1650 {
1651 // TODO: error(p, MID.ExpectedDblQuoteAfterDelim, str_delim.length ? str_delim : closing_delim~"");
1652 }
1653
1654 t.pf = scanPostfix();
1655 Lreturn3: // Error.
1656 t.str = buffer ~ '\0';
1657 t.end = p;
1658 }
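// Editor's note (added illustration, not in the original): examples of the
// delimiter forms handled above.
// ---
// q"(foo(bar))" // nestable delimiter: yields "foo(bar)"
// q"[a[0]]" // likewise with brackets: yields "a[0]"
// q"EOS
// one line of text
// EOS" // identifier delimiter: yields "one line of text\n"
// ---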
1659
1660 void scanTokenStringLiteral(ref Token t)
1661 {
1662 assert(p[0] == 'q' && p[1] == '{');
1663 t.kind = TOK.String;
1664
1665 auto tokenLineNum = lineNum;
1666 auto tokenLineBegin = lineBegin;
1667
1668 // A guard against changes to particular members:
1669 // this.lineNum_hline and this.errorPath
1670 ++inTokenString;
1671
1672 uint lineNum = this.lineNum;
1673 uint level = 1;
1674
1675 ++p; ++p; // Skip q{
1676
1677 auto prev_t = &t;
1678 Token* token;
1679 while (1)
1680 {
1681 token = new Token;
1682 scan(*token);
1683 // Save the tokens in a doubly linked list.
1684 // Could be useful for various tools.
1685 token.prev = prev_t;
1686 prev_t.next = token;
1687 prev_t = token;
1688 switch (token.kind)
1689 {
1690 case TOK.LBrace:
1691 ++level;
1692 continue;
1693 case TOK.RBrace:
1694 if (--level == 0)
1695 {
1696 t.tok_str = t.next;
1697 t.next = null;
1698 break;
1699 }
1700 continue;
1701 case TOK.EOF:
1702 // TODO: error(tokenLineNum, tokenLineBegin, t.start, MID.UnterminatedTokenString);
1703 t.tok_str = t.next;
1704 t.next = token;
1705 break;
1706 default:
1707 continue;
1708 }
1709 break; // Exit loop.
1710 }
1711
1712 assert(token.kind == TOK.RBrace || token.kind == TOK.EOF);
1713 assert(token.kind == TOK.RBrace && t.next is null ||
1714 token.kind == TOK.EOF && t.next !is null);
1715
1716 char[] buffer;
1717 // token points to } or EOF
1718 if (token.kind == TOK.EOF)
1719 {
1720 t.end = token.start;
1721 buffer = t.srcText[2..$].dup ~ '\0';
1722 }
1723 else
1724 {
1725 // Assign to buffer before scanPostfix().
1726 t.end = p;
1727 buffer = t.srcText[2..$-1].dup ~ '\0';
1728 t.pf = scanPostfix();
1729 t.end = p; // Assign again because of postfix.
1730 }
1731 // Convert newlines to '\n'.
1732 if (lineNum != this.lineNum)
1733 {
1734 assert(buffer[$-1] == '\0');
1735 uint i, j;
1736 for (; i < buffer.length; ++i)
1737 switch (buffer[i])
1738 {
1739 case '\r':
1740 if (buffer[i+1] == '\n')
1741 ++i;
1742 case '\n':
1743 assert(isNewlineEnd(buffer.ptr + i));
1744 buffer[j++] = '\n'; // Convert Newline to '\n'.
1745 break;
1746 default:
1747 if (isUnicodeNewline(buffer.ptr + i))
1748 {
1749 ++i; ++i;
1750 goto case '\n';
1751 }
1752 buffer[j++] = buffer[i]; // Copy.
1753 }
1754 buffer.length = j; // Adjust length.
1755 }
1756 assert(buffer[$-1] == '\0');
1757 t.str = buffer;
1758
1759 --inTokenString;
1760 }
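// Editor's note (added illustration, not in the original): a token string such
// as q{ foo(bar); } stores " foo(bar); " in t.str (with newlines normalized to
// '\n') and keeps the scanned inner tokens reachable through t.tok_str.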
1761 } // version(D2)
1762
1763 /// Scans an escape sequence.
1764 ///
1765 /// EscapeSequence := "\" (Octal{1,3} | ("x" Hex{2}) |
1766 /// ("u" Hex{4}) | ("U" Hex{8}) |
1767 /// "'" | "\"" | "\\" | "?" | "a" |
1768 /// "b" | "f" | "n" | "r" | "t" | "v")
1769 /// Params:
1770 /// isBinary = set to true for octal and hexadecimal escapes.
1771 /// Returns: the escape value.
1772 dchar scanEscapeSequence(ref bool isBinary)
1773 out(result)
1774 { assert(isValidChar(result)); }
1775 body
1776 {
1777 assert(*p == '\\');
1778
1779 auto sequenceStart = p; // Used for error reporting.
1780
1781 ++p;
1782 uint c = char2ev(*p);
1783 if (c)
1784 {
1785 ++p;
1786 return c;
1787 }
1788
1789 uint digits = 2;
1790
1791 switch (*p)
1792 {
1793 case 'x':
1794 isBinary = true;
1795 case_Unicode:
1796 assert(c == 0);
1797 assert(digits == 2 || digits == 4 || digits == 8);
1798 while (1)
1799 {
1800 ++p;
1801 if (ishexad(*p))
1802 {
1803 c *= 16;
1804 if (*p <= '9')
1805 c += *p - '0';
1806 else if (*p <= 'F')
1807 c += *p - 'A' + 10;
1808 else
1809 c += *p - 'a' + 10;
1810
1811 if (--digits == 0)
1812 {
1813 ++p;
1814 if (isValidChar(c))
1815 return c; // Return valid escape value.
1816
1817 error(sequenceStart, MID.InvalidUnicodeEscapeSequence,
1818 sequenceStart[0..p-sequenceStart]);
1819 break;
1820 }
1821 continue;
1822 }
1823
1824 error(sequenceStart, MID.InsufficientHexDigits,
1825 sequenceStart[0..p-sequenceStart]);
1826 break;
1827 }
1828 break;
1829 case 'u':
1830 digits = 4;
1831 goto case_Unicode;
1832 case 'U':
1833 digits = 8;
1834 goto case_Unicode;
1835 default:
1836 if (isoctal(*p))
1837 {
1838 isBinary = true;
1839 assert(c == 0);
1840 c += *p - '0';
1841 ++p;
1842 if (!isoctal(*p))
1843 return c;
1844 c *= 8;
1845 c += *p - '0';
1846 ++p;
1847 if (!isoctal(*p))
1848 return c;
1849 c *= 8;
1850 c += *p - '0';
1851 ++p;
1852 if (c > 0xFF)
1853 error(sequenceStart, MSG.InvalidOctalEscapeSequence,
1854 sequenceStart[0..p-sequenceStart]);
1855 return c; // Return valid escape value.
1856 }
1857 else if(*p == '&')
1858 {
1859 if (isalpha(*++p))
1860 {
1861 auto begin = p;
1862 while (isalnum(*++p))
1863 {}
1864
1865 if (*p == ';')
1866 {
1867 // Pass entity excluding '&' and ';'.
1868 c = entity2Unicode(begin[0..p - begin]);
1869 ++p; // Skip ;
1870 if (c != 0xFFFF)
1871 return c; // Return valid escape value.
1872 else
1873 error(sequenceStart, MID.UndefinedHTMLEntity, sequenceStart[0 .. p - sequenceStart]);
1874 }
1875 else
1876 error(sequenceStart, MID.UnterminatedHTMLEntity, sequenceStart[0 .. p - sequenceStart]);
1877 }
1878 else
1879 error(sequenceStart, MID.InvalidBeginHTMLEntity);
1880 }
1881 else if (isEndOfLine(p))
1882 error(sequenceStart, MID.UndefinedEscapeSequence,
1883 isEOF(*p) ? `\EOF` : `\NewLine`);
1884 else
1885 {
1886 char[] str = `\`;
1887 if (isascii(c))
1888 str ~= *p;
1889 else
1890 encodeUTF8(str, decodeUTF8());
1891 ++p;
1892 // TODO: check for unprintable character?
1893 error(sequenceStart, MID.UndefinedEscapeSequence, str);
1894 }
1895 }
1896 return REPLACEMENT_CHAR; // Error: return replacement character.
1897 }
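// Editor's note (added illustration, not in the original): a few escape
// sequences and the values produced above.
// ---
// \n // simple escape resolved by char2ev()
// \x41 // hex escape: returns 'A' (0x41), isBinary = true
// \101 // octal escape: returns 'A' (65), isBinary = true
// \u20AC // returns U+20AC, isBinary stays false
// \&amp; // named HTML entity: resolved via entity2Unicode()
// ---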
1898
1899 /// Scans a number literal.
1900 ///
1901 /// $(PRE
1902 /// IntegerLiteral := (Dec|Hex|Bin|Oct)Suffix?
1903 /// Dec := (0|[1-9][0-9_]*)
1904 /// Hex := 0[xX][_]*[0-9a-zA-Z][0-9a-zA-Z_]*
1905 /// Bin := 0[bB][_]*[01][01_]*
1906 /// Oct := 0[0-7_]*
1907 /// Suffix := (L[uU]?|[uU]L?)
1908 /// )
1909 /// Invalid: "0b_", "0x_", "._" etc.
1910 void scanNumber(ref Token t)
1911 {
1912 ulong ulong_;
1913 bool overflow;
1914 bool isDecimal;
1915 size_t digits;
1916
1917 if (*p != '0')
1918 goto LscanInteger;
1919 ++p; // skip zero
1920 // check for xX bB ...
1921 switch (*p)
1922 {
1923 case 'x','X':
1924 goto LscanHex;
1925 case 'b','B':
1926 goto LscanBinary;
1927 case 'L':
1928 if (p[1] == 'i')
1929 goto LscanReal; // 0Li
1930 break; // 0L
1931 case '.':
1932 if (p[1] == '.')
1933 break; // 0..
1934 // 0.
1935 case 'i','f','F', // Imaginary and float literal suffixes.
1936 'e', 'E': // Float exponent.
1937 goto LscanReal;
1938 default:
1939 if (*p == '_')
1940 goto LscanOctal; // 0_
1941 else if (isdigit(*p))
1942 {
1943 if (*p == '8' || *p == '9')
1944 goto Loctal_hasDecimalDigits; // 08 or 09
1945 else
1946 goto Loctal_enter_loop; // 0[0-7]
1947 }
1948 }
1949
1950 // Number 0
1951 assert(p[-1] == '0');
1952 assert(*p != '_' && !isdigit(*p));
1953 assert(ulong_ == 0);
1954 isDecimal = true;
1955 goto Lfinalize;
1956
1957 LscanInteger:
1958 assert(*p != 0 && isdigit(*p));
1959 isDecimal = true;
1960 goto Lenter_loop_int;
1961 while (1)
1962 {
1963 if (*++p == '_')
1964 continue;
1965 if (!isdigit(*p))
1966 break;
1967 Lenter_loop_int:
1968 if (ulong_ < ulong.max/10 || (ulong_ == ulong.max/10 && *p <= '5'))
1969 {
1970 ulong_ *= 10;
1971 ulong_ += *p - '0';
1972 continue;
1973 }
1974 // Overflow: skip following digits.
1975 overflow = true;
1976 while (isdigit(*++p)) {}
1977 break;
1978 }
1979
1980 // The number could be a float, so check overflow below.
1981 switch (*p)
1982 {
1983 case '.':
1984 if (p[1] != '.')
1985 goto LscanReal;
1986 break;
1987 case 'L':
1988 if (p[1] != 'i')
1989 break;
1990 case 'i', 'f', 'F', 'e', 'E':
1991 goto LscanReal;
1992 default:
1993 }
1994
1995 if (overflow)
1996 error(t.start, MID.OverflowDecimalNumber);
1997
1998 assert((isdigit(p[-1]) || p[-1] == '_') && !isdigit(*p) && *p != '_');
1999 goto Lfinalize;
2000
2001 LscanHex:
2002 assert(digits == 0);
2003 assert(*p == 'x' || *p == 'X');
2004 while (1)
2005 {
2006 if (*++p == '_')
2007 continue;
2008 if (!ishexad(*p))
2009 break;
2010 ++digits;
2011 ulong_ *= 16;
2012 if (*p <= '9')
2013 ulong_ += *p - '0';
2014 else if (*p <= 'F')
2015 ulong_ += *p - 'A' + 10;
2016 else
2017 ulong_ += *p - 'a' + 10;
2018 }
2019
2020 assert(ishexad(p[-1]) || p[-1] == '_' || p[-1] == 'x' || p[-1] == 'X');
2021 assert(!ishexad(*p) && *p != '_');
2022
2023 switch (*p)
2024 {
2025 case '.':
2026 if (p[1] == '.')
2027 break;
2028 case 'p', 'P':
2029 return scanHexReal(t);
2030 default:
2031 }
2032
2033 if (digits == 0 || digits > 16)
2034 error(t.start, digits == 0 ? MID.NoDigitsInHexNumber : MID.OverflowHexNumber);
2035
2036 goto Lfinalize;
2037
2038 LscanBinary:
2039 assert(digits == 0);
2040 assert(*p == 'b' || *p == 'B');
2041 while (1)
2042 {
2043 if (*++p == '0')
2044 {
2045 ++digits;
2046 ulong_ *= 2;
2047 }
2048 else if (*p == '1')
2049 {
2050 ++digits;
2051 ulong_ *= 2;
2052 ulong_ += *p - '0';
2053 }
2054 else if (*p == '_')
2055 continue;
2056 else
2057 break;
2058 }
2059
2060 if (digits == 0 || digits > 64)
2061 error(t.start, digits == 0 ? MID.NoDigitsInBinNumber : MID.OverflowBinaryNumber);
2062
2063 assert(p[-1] == '0' || p[-1] == '1' || p[-1] == '_' || p[-1] == 'b' || p[-1] == 'B', p[-1] ~ "");
2064 assert( !(*p == '0' || *p == '1' || *p == '_') );
2065 goto Lfinalize;
2066
2067 LscanOctal:
2068 assert(*p == '_');
2069 while (1)
2070 {
2071 if (*++p == '_')
2072 continue;
2073 if (!isoctal(*p))
2074 break;
2075 Loctal_enter_loop:
2076 if (ulong_ < ulong.max/2 || (ulong_ == ulong.max/2 && *p <= '1'))
2077 {
2078 ulong_ *= 8;
2079 ulong_ += *p - '0';
2080 continue;
2081 }
2082 // Overflow: skip following digits.
2083 overflow = true;
2084 while (isoctal(*++p)) {}
2085 break;
2086 }
2087
2088 bool hasDecimalDigits;
2089 if (isdigit(*p))
2090 {
2091 Loctal_hasDecimalDigits:
2092 hasDecimalDigits = true;
2093 while (isdigit(*++p)) {}
2094 }
2095
2096 // The number could be a float, so check errors below.
2097 switch (*p)
2098 {
2099 case '.':
2100 if (p[1] != '.')
2101 goto LscanReal;
2102 break;
2103 case 'L':
2104 if (p[1] != 'i')
2105 break;
2106 case 'i', 'f', 'F', 'e', 'E':
2107 goto LscanReal;
2108 default:
2109 }
2110
2111 if (hasDecimalDigits)
2112 error(t.start, MID.OctalNumberHasDecimals);
2113
2114 if (overflow)
2115 error(t.start, MID.OverflowOctalNumber);
2116 // goto Lfinalize;
2117
2118 Lfinalize:
2119 enum Suffix
2120 {
2121 None = 0,
2122 Unsigned = 1,
2123 Long = 2
2124 }
2125
2126 // Scan optional suffix: L, Lu, LU, u, uL, U or UL.
2127 Suffix suffix;
2128 while (1)
2129 {
2130 switch (*p)
2131 {
2132 case 'L':
2133 if (suffix & Suffix.Long)
2134 break;
2135 suffix |= Suffix.Long;
2136 ++p;
2137 continue;
2138 case 'u', 'U':
2139 if (suffix & Suffix.Unsigned)
2140 break;
2141 suffix |= Suffix.Unsigned;
2142 ++p;
2143 continue;
2144 default:
2145 break;
2146 }
2147 break;
2148 }
2149
2150 // Determine type of Integer.
2151 switch (suffix)
2152 {
2153 case Suffix.None:
2154 if (ulong_ & 0x8000_0000_0000_0000)
2155 {
2156 if (isDecimal)
2157 error(t.start, MID.OverflowDecimalSign);
2158 t.kind = TOK.Uint64;
2159 }
2160 else if (ulong_ & 0xFFFF_FFFF_0000_0000)
2161 t.kind = TOK.Int64;
2162 else if (ulong_ & 0x8000_0000)
2163 t.kind = isDecimal ? TOK.Int64 : TOK.Uint32;
2164 else
2165 t.kind = TOK.Int32;
2166 break;
2167 case Suffix.Unsigned:
2168 if (ulong_ & 0xFFFF_FFFF_0000_0000)
2169 t.kind = TOK.Uint64;
2170 else
2171 t.kind = TOK.Uint32;
2172 break;
2173 case Suffix.Long:
2174 if (ulong_ & 0x8000_0000_0000_0000)
2175 {
2176 if (isDecimal)
2177 error(t.start, MID.OverflowDecimalSign);
2178 t.kind = TOK.Uint64;
2179 }
2180 else
2181 t.kind = TOK.Int64;
2182 break;
2183 case Suffix.Unsigned | Suffix.Long:
2184 t.kind = TOK.Uint64;
2185 break;
2186 default:
2187 assert(0);
2188 }
2189 t.ulong_ = ulong_;
2190 t.end = p;
2191 return;
2192 LscanReal:
2193 scanReal(t);
2194 return;
2195 }
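// Editor's note (added illustration, not in the original): how Lfinalize maps
// value and suffix to a token kind.
// ---
// 2147483647 // fits in 31 bits, no suffix -> TOK.Int32
// 2147483648 // bit 31 set, decimal -> TOK.Int64
// 0x8000_0000 // bit 31 set, non-decimal -> TOK.Uint32
// 123u // Unsigned suffix -> TOK.Uint32
// 123UL // Unsigned and Long -> TOK.Uint64
// ---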
2196
2197 /// Scans a floating point number literal.
2198 ///
2199 /// $(PRE
2200 /// FloatLiteral := Float[fFL]?i?
2201 /// Float := DecFloat | HexFloat
2202 /// DecFloat := ([0-9][0-9_]*[.][0-9_]*DecExponent?) |
2203 /// [.][0-9][0-9_]*DecExponent? | [0-9][0-9_]*DecExponent
2204 /// DecExponent := [eE][+-]?[0-9][0-9_]*
2205 /// HexFloat := 0[xX](HexDigits[.]HexDigits |
2206 /// [.][0-9a-zA-Z]HexDigits? |
2207 /// HexDigits)HexExponent
2208 /// HexExponent := [pP][+-]?[0-9][0-9_]*
2209 /// )
2210 void scanReal(ref Token t)
2211 {
2212 if (*p == '.')
2213 {
2214 assert(p[1] != '.');
2215 // This function was called by scan() or scanNumber().
2216 while (isdigit(*++p) || *p == '_') {}
2217 }
2218 else
2219 // This function was called by scanNumber().
2220 assert(delegate ()
2221 {
2222 switch (*p)
2223 {
2224 case 'L':
2225 if (p[1] != 'i')
2226 return false;
2227 case 'i', 'f', 'F', 'e', 'E':
2228 return true;
2229 default:
2230 }
2231 return false;
2232 }()
2233 );
2234
2235 // Scan exponent.
2236 if (*p == 'e' || *p == 'E')
2237 {
2238 ++p;
2239 if (*p == '-' || *p == '+')
2240 ++p;
2241 if (isdigit(*p))
2242 while (isdigit(*++p) || *p == '_') {}
2243 else
2244 error(t.start, MID.FloatExpMustStartWithDigit);
2245 }
2246
2247 // Copy whole number and remove underscores from buffer.
2248 char[] buffer = t.start[0..p-t.start].dup;
2249 uint j;
2250 foreach (c; buffer)
2251 if (c != '_')
2252 buffer[j++] = c;
2253 buffer.length = j; // Adjust length.
2254 buffer ~= 0; // Terminate for C functions.
2255
2256 finalizeFloat(t, buffer);
2257 }
2258
2259 /// Scans a hexadecimal floating point number literal.
2260 void scanHexReal(ref Token t)
2261 {
2262 assert(*p == '.' || *p == 'p' || *p == 'P');
2263 MID mid;
2264 if (*p == '.')
2265 while (ishexad(*++p) || *p == '_')
2266 {}
2267 // Decimal exponent is required.
2268 if (*p != 'p' && *p != 'P')
2269 {
2270 mid = MID.HexFloatExponentRequired;
2271 goto Lerr;
2272 }
2273 // Scan exponent
2274 assert(*p == 'p' || *p == 'P');
2275 ++p;
2276 if (*p == '+' || *p == '-')
2277 ++p;
2278 if (!isdigit(*p))
2279 {
2280 mid = MID.HexFloatExpMustStartWithDigit;
2281 goto Lerr;
2282 }
2283 while (isdigit(*++p) || *p == '_')
2284 {}
2285 // Copy whole number and remove underscores from buffer.
2286 char[] buffer = t.start[0..p-t.start].dup;
2287 uint j;
2288 foreach (c; buffer)
2289 if (c != '_')
2290 buffer[j++] = c;
2291 buffer.length = j; // Adjust length.
2292 buffer ~= 0; // Terminate for C functions.
2293 finalizeFloat(t, buffer);
2294 return;
2295 Lerr:
2296 t.kind = TOK.Float32;
2297 t.end = p;
2298 error(t.start, mid);
2299 }
2300
2301 /// Sets the value of the token.
2302 /// Params:
2303 /// t = receives the value.
2304 /// buffer = the well-formed float number.
2305 void finalizeFloat(ref Token t, string buffer)
2306 {
2307 assert(buffer[$-1] == 0);
2308 // Float number is well-formed. Check suffixes and do conversion.
2309 switch (*p)
2310 {
2311 case 'f', 'F':
2312 t.kind = TOK.Float32;
2313 t.float_ = strtof(buffer.ptr, null);
2314 ++p;
2315 break;
2316 case 'L':
2317 t.kind = TOK.Float80;
2318 t.real_ = strtold(buffer.ptr, null);
2319 ++p;
2320 break;
2321 default:
2322 t.kind = TOK.Float64;
2323 t.double_ = strtod(buffer.ptr, null);
2324 }
2325 if (*p == 'i')
2326 {
2327 ++p;
2328 t.kind += 3; // Switch to imaginary counterpart.
2329 assert(t.kind == TOK.Imaginary32 ||
2330 t.kind == TOK.Imaginary64 ||
2331 t.kind == TOK.Imaginary80);
2332 }
2333 if (errno() == ERANGE)
2334 error(t.start, MID.OverflowFloatNumber);
2335 t.end = p;
2336 }
2337
2338 /// Scans a special token sequence.
2339 ///
2340 /// SpecialTokenSequence := "#line" Integer Filespec? EndOfLine
2341 void scanSpecialTokenSequence(ref Token t)
2342 {
2343 assert(*p == '#');
2344 t.kind = TOK.HashLine;
2345 t.setWhitespaceFlag();
2346
2347 MID mid;
2348 char* errorAtColumn = p;
2349 char* tokenEnd = ++p;
2350
2351 if (!(p[0] == 'l' && p[1] == 'i' && p[2] == 'n' && p[3] == 'e'))
2352 {
2353 mid = MID.ExpectedIdentifierSTLine;
2354 goto Lerr;
2355 }
2356 p += 3;
2357 tokenEnd = p + 1;
2358
2359 // TODO: #line58"path/file" is legal. Require spaces?
2360 // State.Space could be used for that purpose.
2361 enum State
2362 { /+Space,+/ Integer, Filespec, End }
2363
2364 State state = State.Integer;
2365
2366 while (!isEndOfLine(++p))
2367 {
2368 if (isspace(*p))
2369 continue;
2370 if (state == State.Integer)
2371 {
2372 if (!isdigit(*p))
2373 {
2374 errorAtColumn = p;
2375 mid = MID.ExpectedIntegerAfterSTLine;
2376 goto Lerr;
2377 }
2378 t.tokLineNum = new Token;
2379 scan(*t.tokLineNum);
2380 tokenEnd = p;
2381 if (t.tokLineNum.kind != TOK.Int32 && t.tokLineNum.kind != TOK.Uint32)
2382 {
2383 errorAtColumn = t.tokLineNum.start;
2384 mid = MID.ExpectedIntegerAfterSTLine;
2385 goto Lerr;
2386 }
2387 --p; // Go one back because scan() advanced p past the integer.
2388 state = State.Filespec;
2389 }
2390 else if (state == State.Filespec && *p == '"')
2391 { // MID.ExpectedFilespec is deprecated.
2392 // if (*p != '"')
2393 // {
2394 // errorAtColumn = p;
2395 // mid = MID.ExpectedFilespec;
2396 // goto Lerr;
2397 // }
2398 t.tokLineFilespec = new Token;
2399 t.tokLineFilespec.start = p;
2400 t.tokLineFilespec.kind = TOK.Filespec;
2401 t.tokLineFilespec.setWhitespaceFlag();
2402 while (*++p != '"')
2403 {
2404 if (isEndOfLine(p))
2405 {
2406 errorAtColumn = t.tokLineFilespec.start;
2407 mid = MID.UnterminatedFilespec;
2408 t.tokLineFilespec.end = p;
2409 tokenEnd = p;
2410 goto Lerr;
2411 }
2412 isascii(*p) || decodeUTF8();
2413 }
2414 auto start = t.tokLineFilespec.start +1; // +1 skips '"'
2415 t.tokLineFilespec.str = start[0 .. p - start];
2416 t.tokLineFilespec.end = p + 1;
2417 tokenEnd = p + 1;
2418 state = State.End;
2419 }
2420 else/+ if (state == State.End)+/
2421 {
2422 mid = MID.UnterminatedSpecialToken;
2423 goto Lerr;
2424 }
2425 }
2426 assert(isEndOfLine(p));
2427
2428 if (state == State.Integer)
2429 {
2430 errorAtColumn = p;
2431 mid = MID.ExpectedIntegerAfterSTLine;
2432 goto Lerr;
2433 }
2434
2435 // Evaluate #line only when not in token string.
2436 if (!inTokenString && t.tokLineNum)
2437 {
2438 this.lineNum_hline = this.lineNum - t.tokLineNum.uint_ + 1;
2439 if (t.tokLineFilespec)
2440 newFilePath(t.tokLineFilespec.str);
2441 }
2442 p = tokenEnd;
2443 t.end = tokenEnd;
2444
2445 return;
2446 Lerr:
2447 p = tokenEnd;
2448 t.end = tokenEnd;
2449 error(errorAtColumn, mid);
2450 }
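// Editor's note (added illustration, not in the original): if "#line 42"
// appears on physical line 10, lineNum_hline becomes 10 - 42 + 1, so the next
// physical line (11) is reported by errorLineNumber() as 11 - (10 - 42 + 1) = 42.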
2451
2452 /// Inserts an empty dummy token (TOK.Empty) before t.
2453 ///
2454 /// Useful in the parsing phase for representing a node in the AST
2455 /// that doesn't consume an actual token from the source text.
2456 Token* insertEmptyTokenBefore(Token* t)
2457 {
2458 assert(t !is null && t.prev !is null);
2459 assert(text.ptr <= t.start && t.start < end, Token.toString(t.kind));
2460 assert(text.ptr <= t.end && t.end <= end, Token.toString(t.kind));
2461
2462 auto prev_t = t.prev;
2463 auto new_t = new Token;
2464 new_t.kind = TOK.Empty;
2465 new_t.start = new_t.end = prev_t.end;
2466 // Link in new token.
2467 prev_t.next = new_t;
2468 new_t.prev = prev_t;
2469 new_t.next = t;
2470 t.prev = new_t;
2471 return new_t;
2472 }
2473
2474 /// Returns the error line number.
2475 uint errorLineNumber(uint lineNum)
2476 {
2477 return lineNum - this.lineNum_hline;
2478 }
2479
2480 /// Forwards error parameters.
2481 void error(char* columnPos, char[] msg, ...)
2482 {
2483 error_(this.lineNum, this.lineBegin, columnPos, msg, _arguments, _argptr);
2484 }
2485
2486 /// ditto
2487 void error(char* columnPos, MID mid, ...)
2488 {
2489 error_(this.lineNum, this.lineBegin, columnPos, GetMsg(mid), _arguments, _argptr);
2490 }
2491
2492 /// ditto
2493 void error(uint lineNum, char* lineBegin, char* columnPos, MID mid, ...)
2494 {
2495 error_(lineNum, lineBegin, columnPos, GetMsg(mid), _arguments, _argptr);
2496 }
2497
2498 /// Creates an error report and appends it to a list.
2499 /// Params:
2500 /// lineNum = the line number.
2501 /// lineBegin = points to the first character of the current line.
2502 /// columnPos = points to the character where the error is located.
2503 /// msg = the message.
2504 void error_(uint lineNum, char* lineBegin, char* columnPos, char[] msg,
2505 TypeInfo[] _arguments, Arg _argptr)
2506 {
2507 lineNum = this.errorLineNumber(lineNum);
2508 auto errorPath = this.filePaths.setPath;
2509 auto location = new Location(errorPath, lineNum, lineBegin, columnPos);
2510 msg = Format(_arguments, _argptr, msg);
2511 auto error = new LexerError(location, msg);
2512 errors ~= error;
2513 if (infoMan !is null)
2514 infoMan ~= error;
2515 }
2516
2517 /// Scans the whole source text until EOF is encountered.
2518 void scanAll()
2519 {
2520 while (nextToken() != TOK.EOF)
2521 {}
2522 }
2523
2524 /// Returns the first token of the source text.
2525 /// This can be the EOF token.
2526 /// Structure: HEAD -> Newline -> First Token
2527 Token* firstToken()
2528 {
2529 return this.head.next.next;
2530 }
2531
2532 /// Returns true if str is a valid D identifier.
2533 static bool isIdentifierString(char[] str)
2534 {
2535 if (str.length == 0 || isdigit(str[0]))
2536 return false;
2537 size_t idx;
2538 do
2539 {
2540 auto c = dil.Unicode.decode(str, idx);
2541 if (c == ERROR_CHAR || !(isident(c) || !isascii(c) && isUniAlpha(c)))
2542 return false;
2543 } while (idx < str.length)
2544 return true;
2545 }
2546
2547 /// Returns true if str is a keyword or a special token (__FILE__, __LINE__ etc.)
2548 static bool isReservedIdentifier(char[] str)
2549 {
2550 if (!isIdentifierString(str))
2551 return false; // str is not a valid identifier.
2552
2553 auto id = IdTable.inStatic(str);
2554 if (id is null || id.kind == TOK.Identifier)
2555 return false; // str is not in the table or a normal identifier.
2556
2557 return true;
2558 }
2559
2560 /// Returns true if the current character to be decoded is
2561 /// a Unicode alpha character.
2562 ///
2563 /// The current pointer 'p' is not advanced if false is returned.
2564 bool isUnicodeAlpha()
2565 {
2566 assert(!isascii(*p), "check for ASCII char before calling isUnicodeAlpha().");
2567 char* p = this.p;
2568 dchar d = *p;
2569 ++p; // Move to second byte.
2570 // Error if second byte is not a trail byte.
2571 if (!isTrailByte(*p))
2572 return false;
2573 // Check for overlong sequences.
2574 switch (d)
2575 {
2576 case 0xE0, 0xF0, 0xF8, 0xFC:
2577 if ((*p & d) == 0x80)
2578 return false;
2579 default:
2580 if ((d & 0xFE) == 0xC0) // 1100000x
2581 return false;
2582 }
2583 const char[] checkNextByte = "if (!isTrailByte(*++p))"
2584 " return false;";
2585 const char[] appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";
2586 // Decode
2587 if ((d & 0b1110_0000) == 0b1100_0000)
2588 {
2589 d &= 0b0001_1111;
2590 mixin(appendSixBits);
2591 }
2592 else if ((d & 0b1111_0000) == 0b1110_0000)
2593 {
2594 d &= 0b0000_1111;
2595 mixin(appendSixBits ~
2596 checkNextByte ~ appendSixBits);
2597 }
2598 else if ((d & 0b1111_1000) == 0b1111_0000)
2599 {
2600 d &= 0b0000_0111;
2601 mixin(appendSixBits ~
2602 checkNextByte ~ appendSixBits ~
2603 checkNextByte ~ appendSixBits);
2604 }
2605 else
2606 return false;
2607
2608 assert(isTrailByte(*p));
2609 if (!isValidChar(d) || !isUniAlpha(d))
2610 return false;
2611 // Only advance pointer if this is a Unicode alpha character.
2612 this.p = p;
2613 return true;
2614 }
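  // Example: with this.p on the first byte of "ç" (0xC3 0xA7), isUnicodeAlpha()
  // decodes U+00E7, which is a Unicode alpha, so it advances this.p to the
  // trail byte 0xA7 and returns true. For a non-alpha character such as
  // "→" (U+2192) it returns false and this.p stays where it was.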
2615
2616 /// Decodes the next UTF-8 sequence and returns the code point. Returns REPLACEMENT_CHAR and reports an error if the sequence is invalid.
2617 dchar decodeUTF8()
2618 {
2619 assert(!isascii(*p), "check for ASCII char before calling decodeUTF8().");
2620 char* p = this.p;
2621 dchar d = *p;
2622
2623 ++p; // Move to second byte.
2624 // Error if second byte is not a trail byte.
2625 if (!isTrailByte(*p))
2626 goto Lerr2;
2627
2628 // Check for overlong sequences.
2629 switch (d)
2630 {
2631 case 0xE0, // 11100000 100xxxxx
2632 0xF0, // 11110000 1000xxxx
2633 0xF8, // 11111000 10000xxx
2634 0xFC: // 11111100 100000xx
2635 if ((*p & d) == 0x80)
2636 goto Lerr;
2637 default:
2638 if ((d & 0xFE) == 0xC0) // 1100000x
2639 goto Lerr;
2640 }
2641
2642 const char[] checkNextByte = "if (!isTrailByte(*++p))"
2643 " goto Lerr2;";
2644 const char[] appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";
2645
2646 // Decode
2647 if ((d & 0b1110_0000) == 0b1100_0000)
2648 { // 110xxxxx 10xxxxxx
2649 d &= 0b0001_1111;
2650 mixin(appendSixBits);
2651 }
2652 else if ((d & 0b1111_0000) == 0b1110_0000)
2653 { // 1110xxxx 10xxxxxx 10xxxxxx
2654 d &= 0b0000_1111;
2655 mixin(appendSixBits ~
2656 checkNextByte ~ appendSixBits);
2657 }
2658 else if ((d & 0b1111_1000) == 0b1111_0000)
2659 { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2660 d &= 0b0000_0111;
2661 mixin(appendSixBits ~
2662 checkNextByte ~ appendSixBits ~
2663 checkNextByte ~ appendSixBits);
2664 }
2665 else
2666 // 5 and 6 byte UTF-8 sequences are not allowed yet.
2667 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2668 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2669 goto Lerr;
2670
2671 assert(isTrailByte(*p));
2672
2673 if (!isValidChar(d))
2674 {
2675 Lerr:
2676 // Three cases:
2677 // *) the UTF-8 sequence was successfully decoded but the resulting
2678 // character is invalid.
2679 // p points to last trail byte in the sequence.
2680 // *) the UTF-8 sequence is overlong.
2681 // p points to second byte in the sequence.
2682 // *) the UTF-8 sequence has more than 4 bytes or starts with
2683 // a trail byte.
2684 // p points to second byte in the sequence.
2685 assert(isTrailByte(*p));
2686 // Move to next ASCII character or lead byte of a UTF-8 sequence.
2687 while (p < (end-1) && isTrailByte(*p))
2688 ++p;
2689 --p;
2690 assert(!isTrailByte(p[1]));
2691 Lerr2:
2692 d = REPLACEMENT_CHAR;
2693 error(this.p, MID.InvalidUTF8Sequence, formatBytes(this.p, p));
2694 }
2695
2696 this.p = p;
2697 return d;
2698 }
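  // Worked example of the decoding arithmetic for the two-byte sequence
  // 0xC3 0xA7 ("ç", U+00E7):
  //   d = 0xC3;                        // 1100_0011 matches the 110xxxxx branch.
  //   d &= 0b0001_1111;                // d == 0b0_0011 == 0x03
  //   d = d << 6 | 0xA7 & 0b0011_1111; // 0xC0 | 0x27 == 0xE7 == U+00E7
  // Afterwards this.p is left on the trail byte 0xA7.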
2699
2700 /// Encodes the character d and appends it to str.
2701 static void encodeUTF8(ref char[] str, dchar d)
2702 {
2703 assert(!isascii(d), "check for ASCII char before calling encodeUTF8().");
2704 assert(isValidChar(d), "check if character is valid before calling encodeUTF8().");
2705
2706 char[6] b = void;
2707 if (d < 0x800)
2708 {
2709 b[0] = 0xC0 | (d >> 6);
2710 b[1] = 0x80 | (d & 0x3F);
2711 str ~= b[0..2];
2712 }
2713 else if (d < 0x10000)
2714 {
2715 b[0] = 0xE0 | (d >> 12);
2716 b[1] = 0x80 | ((d >> 6) & 0x3F);
2717 b[2] = 0x80 | (d & 0x3F);
2718 str ~= b[0..3];
2719 }
2720 else if (d < 0x200000)
2721 {
2722 b[0] = 0xF0 | (d >> 18);
2723 b[1] = 0x80 | ((d >> 12) & 0x3F);
2724 b[2] = 0x80 | ((d >> 6) & 0x3F);
2725 b[3] = 0x80 | (d & 0x3F);
2726 str ~= b[0..4];
2727 }
2728 /+ // There are no 5 and 6 byte UTF-8 sequences yet.
2729 else if (d < 0x4000000)
2730 {
2731 b[0] = 0xF8 | (d >> 24);
2732 b[1] = 0x80 | ((d >> 18) & 0x3F);
2733 b[2] = 0x80 | ((d >> 12) & 0x3F);
2734 b[3] = 0x80 | ((d >> 6) & 0x3F);
2735 b[4] = 0x80 | (d & 0x3F);
2736 str ~= b[0..5];
2737 }
2738 else if (d < 0x80000000)
2739 {
2740 b[0] = 0xFC | (d >> 30);
2741 b[1] = 0x80 | ((d >> 24) & 0x3F);
2742 b[2] = 0x80 | ((d >> 18) & 0x3F);
2743 b[3] = 0x80 | ((d >> 12) & 0x3F);
2744 b[4] = 0x80 | ((d >> 6) & 0x3F);
2745 b[5] = 0x80 | (d & 0x3F);
2746 str ~= b[0..6];
2747 }
2748 +/
2749 else
2750 assert(0);
2751 }
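  // A minimal sketch checking the two-byte case: U+00E7 ("ç") must encode to
  // the bytes 0xC3 0xA7.
  unittest
  {
    char[] s;
    encodeUTF8(s, cast(dchar)0x00E7);
    assert(s == "\xC3\xA7");
  }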
2752
2753 /// Formats the bytes between start and end.
2754 /// Returns: the hex-escaped bytes, e.g.: abc -> \x61\x62\x63
2755 static char[] formatBytes(char* start, char* end)
2756 {
2757 auto strLen = end-start;
2758 const formatLen = `\xXX`.length;
2759 char[] result = new char[strLen*formatLen]; // Reserve space.
2760 result.length = 0;
2761 foreach (c; cast(ubyte[])start[0..strLen])
2762 result ~= Format("\\x{:X}", c);
2763 return result;
2764 }
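  // Sketch matching the Returns example above: "abc" formats to \x61\x62\x63.
  unittest
  {
    char[] s = "abc";
    assert(formatBytes(s.ptr, s.ptr + s.length) == `\x61\x62\x63`);
  }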
2765
2766 /// Searches for an invalid UTF-8 sequence in str.
2767 /// Returns: a formatted string of the invalid sequence (e.g. \xC0\x80).
2768 static string findInvalidUTF8Sequence(string str)
2769 {
2770 char* p = str.ptr, end = p + str.length;
2771 while (p < end)
2772 {
2773 if (decode(p, end) == ERROR_CHAR)
2774 {
2775 auto begin = p;
2776 // Skip trail-bytes.
2777 while (++p < end && isTrailByte(*p))
2778 {}
2779 return Lexer.formatBytes(begin, p);
2780 }
2781 }
2782 assert(p == end);
2783 return "";
2784 }
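  // Sketch (assumes dil.Unicode.decode reports the overlong sequence that the
  // documentation above uses as its example):
  unittest
  {
    assert(findInvalidUTF8Sequence("abc") == "");
    assert(findInvalidUTF8Sequence("ab\xC0\x80cd") == `\xC0\x80`);
  }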
2785 }
2786
2787 /// Tests the lexer with a list of tokens.
2788 unittest
2789 {
2790 Stdout("Testing Lexer.\n");
2791 struct Pair
2792 {
2793 char[] tokenText;
2794 TOK kind;
2795 }
2796 static Pair[] pairs = [
2797 {"#!äöüß", TOK.Shebang}, {"\n", TOK.Newline},
2798 {"//çay", TOK.Comment}, {"\n", TOK.Newline},
2799 {"&", TOK.AndBinary},
2800 {"/*çağ*/", TOK.Comment}, {"&&", TOK.AndLogical},
2801 {"/+çak+/", TOK.Comment}, {"&=", TOK.AndAssign},
2802 {">", TOK.Greater}, {"+", TOK.Plus},
2803 {">=", TOK.GreaterEqual}, {"++", TOK.PlusPlus},
2804 {">>", TOK.RShift}, {"+=", TOK.PlusAssign},
2805 {">>=", TOK.RShiftAssign}, {"-", TOK.Minus},
2806 {">>>", TOK.URShift}, {"--", TOK.MinusMinus},
2807 {">>>=", TOK.URShiftAssign}, {"-=", TOK.MinusAssign},
2808 {"<", TOK.Less}, {"=", TOK.Assign},
2809 {"<=", TOK.LessEqual}, {"==", TOK.Equal},
2810 {"<>", TOK.LorG}, {"~", TOK.Tilde},
2811 {"<>=", TOK.LorEorG}, {"~=", TOK.CatAssign},
2812 {"<<", TOK.LShift}, {"*", TOK.Mul},
2813 {"<<=", TOK.LShiftAssign}, {"*=", TOK.MulAssign},
2814 {"!", TOK.Not}, {"/", TOK.Div},
2815 {"!=", TOK.NotEqual}, {"/=", TOK.DivAssign},
2816 {"!<", TOK.UorGorE}, {"^", TOK.Xor},
2817 {"!>", TOK.UorLorE}, {"^=", TOK.XorAssign},
2818 {"!<=", TOK.UorG}, {"%", TOK.Mod},
2819 {"!>=", TOK.UorL}, {"%=", TOK.ModAssign},
2820 {"!<>", TOK.UorE}, {"(", TOK.LParen},
2821 {"!<>=", TOK.Unordered}, {")", TOK.RParen},
2822 {".", TOK.Dot}, {"[", TOK.LBracket},
2823 {"..", TOK.Slice}, {"]", TOK.RBracket},
2824 {"...", TOK.Ellipses}, {"{", TOK.LBrace},
2825 {"|", TOK.OrBinary}, {"}", TOK.RBrace},
2826 {"||", TOK.OrLogical}, {":", TOK.Colon},
2827 {"|=", TOK.OrAssign}, {";", TOK.Semicolon},
2828 {"?", TOK.Question}, {",", TOK.Comma},
2829 {"$", TOK.Dollar}, {"cam", TOK.Identifier},
2830 {"çay", TOK.Identifier}, {".0", TOK.Float64},
2831 {"0", TOK.Int32}, {"\n", TOK.Newline},
2832 {"\r", TOK.Newline}, {"\r\n", TOK.Newline},
2833 {"\u2028", TOK.Newline}, {"\u2029", TOK.Newline}
2834 ];
2835
2836 char[] src;
2837
2838 // Join all token texts into a single string.
2839 foreach (i, pair; pairs)
2840 if (pair.kind == TOK.Comment && pair.tokenText[1] == '/' || // Line comment.
2841 pair.kind == TOK.Shebang)
2842 {
2843 assert(pairs[i+1].kind == TOK.Newline); // Must be followed by a newline.
2844 src ~= pair.tokenText;
2845 }
2846 else
2847 src ~= pair.tokenText ~ " ";
2848
2849 auto lx = new Lexer(new SourceText("", src));
2850 auto token = lx.getTokens();
2851
2852 uint i;
2853 assert(token == lx.head);
2854 assert(token.next.kind == TOK.Newline);
2855 token = token.next.next;
2856 do
2857 {
2858 assert(i < pairs.length);
2859 assert(token.srcText == pairs[i].tokenText, Format("Scanned '{0}' but expected '{1}'", token.srcText, pairs[i].tokenText));
2860 ++i;
2861 token = token.next;
2862 } while (token.kind != TOK.EOF)
2863 }
2864
2865 /// Tests the Lexer's peek() method.
2866 unittest
2867 {
2868 Stdout("Testing method Lexer.peek()\n");
2869 auto sourceText = new SourceText("", "unittest { }");
2870 auto lx = new Lexer(sourceText, null);
2871
2872 auto next = lx.head;
2873 lx.peek(next);
2874 assert(next.kind == TOK.Newline);
2875 lx.peek(next);
2876 assert(next.kind == TOK.Unittest);
2877 lx.peek(next);
2878 assert(next.kind == TOK.LBrace);
2879 lx.peek(next);
2880 assert(next.kind == TOK.RBrace);
2881 lx.peek(next);
2882 assert(next.kind == TOK.EOF);
2883
2884 lx = new Lexer(new SourceText("", ""));
2885 next = lx.head;
2886 lx.peek(next);
2887 assert(next.kind == TOK.Newline);
2888 lx.peek(next);
2889 assert(next.kind == TOK.EOF);
2890 }
2891
2892 unittest
2893 {
2894 // Numbers unittest: sample literals (listed only; not scanned here).
2895 // 0L 0ULi 0_L 0_UL 0x0U 0x0p2 0_Fi 0_e2 0_F 0_i
2896 // 0u 0U 0uL 0UL 0L 0LU 0Lu
2897 // 0Li 0f 0F 0fi 0Fi 0i
2898 // 0b_1_LU 0b1000u
2899 // 0x232Lu
2900 }