Mercurial > projects > ldc
comparison dmd/lexer.c @ 1:c53b6e3fe49a trunk
[svn r5] Initial commit. Most things are very rough.
author | lindquist |
---|---|
date | Sat, 01 Sep 2007 21:43:27 +0200 |
parents | |
children | 788401029ecf |
comparison
equal
deleted
inserted
replaced
0:a9e71648e74d | 1:c53b6e3fe49a |
---|---|
1 | |
2 // Compiler implementation of the D programming language | |
3 // Copyright (c) 1999-2007 by Digital Mars | |
4 // All Rights Reserved | |
5 // written by Walter Bright | |
6 // http://www.digitalmars.com | |
7 // License for redistribution is by either the Artistic License | |
8 // in artistic.txt, or the GNU General Public License in gnu.txt. | |
9 // See the included readme.txt for details. | |
10 | |
11 /* Lexical Analyzer */ | |
12 | |
13 #include <stdio.h> | |
14 #include <string.h> | |
15 #include <ctype.h> | |
16 #include <stdarg.h> | |
17 #include <errno.h> | |
18 #include <wchar.h> | |
19 #include <stdlib.h> | |
20 #include <assert.h> | |
21 #include <sys/time.h> | |
22 | |
23 #ifdef IN_GCC | |
24 | |
25 #include <time.h> | |
26 #include "mem.h" | |
27 | |
28 #else | |
29 | |
30 #if __GNUC__ | |
31 #include <time.h> | |
32 #endif | |
33 | |
34 #if IN_LLVM | |
35 #include "mem.h" | |
36 #elif _WIN32 | |
37 #include "..\root\mem.h" | |
38 #else | |
39 #include "../root/mem.h" | |
40 #endif | |
41 #endif | |
42 | |
43 #include "stringtable.h" | |
44 | |
45 #include "lexer.h" | |
46 #include "utf.h" | |
47 #include "identifier.h" | |
48 #include "id.h" | |
49 #include "module.h" | |
50 | |
51 #if _WIN32 && __DMC__ | |
52 // from \dm\src\include\setlocal.h | |
53 extern "C" char * __cdecl __locale_decpoint; | |
54 #endif | |
55 | |
56 extern int HtmlNamedEntity(unsigned char *p, int length); | |
57 | |
58 #define LS 0x2028 // UTF line separator | |
59 #define PS 0x2029 // UTF paragraph separator | |
60 | |
61 /******************************************** | |
62 * Do our own char maps | |
63 */ | |
64 | |
/* Per-byte classification table. Each entry is an OR of the CM* bits below;
 * every entry is zero until cmtable_init() has been called.
 */
static unsigned char cmtable[256];

const int CMoctal  = 0x1;	// byte is one of '0'..'7'
const int CMhex    = 0x2;	// byte is a hexadecimal digit
const int CMidchar = 0x4;	// byte is alphanumeric or '_'

/* Table-driven predicates: each returns its flag bit (non-zero) when the
 * byte belongs to the class, and 0 otherwise.
 */
inline unsigned char isoctal(unsigned char c)
{
    return cmtable[c] & CMoctal;
}

inline unsigned char ishex(unsigned char c)
{
    return cmtable[c] & CMhex;
}

inline unsigned char isidchar(unsigned char c)
{
    return cmtable[c] & CMidchar;
}

/* Fill in cmtable for all 256 byte values. Flags are OR-ed into the existing
 * entries, so calling this more than once is harmless.
 */
static void cmtable_init()
{
    const unsigned count = sizeof(cmtable) / sizeof(cmtable[0]);
    unsigned c = 0;

    while (c < count)
    {
	unsigned char flags = 0;

	if (c >= '0' && c <= '7')
	    flags |= CMoctal;

	const bool hexLetter = (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
	if (hexLetter || isdigit(c))
	    flags |= CMhex;

	if (c == '_' || isalnum(c))
	    flags |= CMidchar;

	cmtable[c] |= flags;
	++c;
    }
}
87 | |
88 | |
89 /************************* Token **********************************************/ | |
90 | |
91 char *Token::tochars[TOKMAX]; | |
92 | |
93 void *Token::operator new(size_t size) | |
94 { Token *t; | |
95 | |
96 if (Lexer::freelist) | |
97 { | |
98 t = Lexer::freelist; | |
99 Lexer::freelist = t->next; | |
100 return t; | |
101 } | |
102 | |
103 return ::operator new(size); | |
104 } | |
105 | |
106 #ifdef DEBUG | |
107 void Token::print() | |
108 { | |
109 fprintf(stdmsg, "%s\n", toChars()); | |
110 } | |
111 #endif | |
112 | |
113 char *Token::toChars() | |
114 { char *p; | |
115 static char buffer[3 + 3 * sizeof(value) + 1]; | |
116 | |
117 p = buffer; | |
118 switch (value) | |
119 { | |
120 case TOKint32v: | |
121 #if IN_GCC | |
122 sprintf(buffer,"%d",(d_int32)int64value); | |
123 #else | |
124 sprintf(buffer,"%d",int32value); | |
125 #endif | |
126 break; | |
127 | |
128 case TOKuns32v: | |
129 case TOKcharv: | |
130 case TOKwcharv: | |
131 case TOKdcharv: | |
132 #if IN_GCC | |
133 sprintf(buffer,"%uU",(d_uns32)uns64value); | |
134 #else | |
135 sprintf(buffer,"%uU",uns32value); | |
136 #endif | |
137 break; | |
138 | |
139 case TOKint64v: | |
140 sprintf(buffer,"%jdL",int64value); | |
141 break; | |
142 | |
143 case TOKuns64v: | |
144 sprintf(buffer,"%juUL",uns64value); | |
145 break; | |
146 | |
147 #if IN_GCC | |
148 case TOKfloat32v: | |
149 case TOKfloat64v: | |
150 case TOKfloat80v: | |
151 float80value.format(buffer, sizeof(buffer)); | |
152 break; | |
153 case TOKimaginary32v: | |
154 case TOKimaginary64v: | |
155 case TOKimaginary80v: | |
156 float80value.format(buffer, sizeof(buffer)); | |
157 // %% buffer | |
158 strcat(buffer, "i"); | |
159 break; | |
160 #else | |
161 case TOKfloat32v: | |
162 sprintf(buffer,"%Lgf", float80value); | |
163 break; | |
164 | |
165 case TOKfloat64v: | |
166 sprintf(buffer,"%Lg", float80value); | |
167 break; | |
168 | |
169 case TOKfloat80v: | |
170 sprintf(buffer,"%LgL", float80value); | |
171 break; | |
172 | |
173 case TOKimaginary32v: | |
174 sprintf(buffer,"%Lgfi", float80value); | |
175 break; | |
176 | |
177 case TOKimaginary64v: | |
178 sprintf(buffer,"%Lgi", float80value); | |
179 break; | |
180 | |
181 case TOKimaginary80v: | |
182 sprintf(buffer,"%LgLi", float80value); | |
183 break; | |
184 #endif | |
185 | |
186 case TOKstring: | |
187 #if CSTRINGS | |
188 p = string; | |
189 #else | |
190 { OutBuffer buf; | |
191 | |
192 buf.writeByte('"'); | |
193 for (size_t i = 0; i < len; ) | |
194 { unsigned c; | |
195 | |
196 utf_decodeChar((unsigned char *)ustring, len, &i, &c); | |
197 switch (c) | |
198 { | |
199 case 0: | |
200 break; | |
201 | |
202 case '"': | |
203 case '\\': | |
204 buf.writeByte('\\'); | |
205 default: | |
206 if (isprint(c)) | |
207 buf.writeByte(c); | |
208 else if (c <= 0x7F) | |
209 buf.printf("\\x%02x", c); | |
210 else if (c <= 0xFFFF) | |
211 buf.printf("\\u%04x", c); | |
212 else | |
213 buf.printf("\\U%08x", c); | |
214 continue; | |
215 } | |
216 break; | |
217 } | |
218 buf.writeByte('"'); | |
219 if (postfix) | |
220 buf.writeByte('"'); | |
221 buf.writeByte(0); | |
222 p = (char *)buf.extractData(); | |
223 } | |
224 #endif | |
225 break; | |
226 | |
227 case TOKidentifier: | |
228 case TOKenum: | |
229 case TOKstruct: | |
230 case TOKimport: | |
231 CASE_BASIC_TYPES: | |
232 p = ident->toChars(); | |
233 break; | |
234 | |
235 default: | |
236 p = toChars(value); | |
237 break; | |
238 } | |
239 return p; | |
240 } | |
241 | |
242 char *Token::toChars(enum TOK value) | |
243 { char *p; | |
244 static char buffer[3 + 3 * sizeof(value) + 1]; | |
245 | |
246 p = tochars[value]; | |
247 if (!p) | |
248 { sprintf(buffer,"TOK%d",value); | |
249 p = buffer; | |
250 } | |
251 return p; | |
252 } | |
253 | |
254 /*************************** Lexer ********************************************/ | |
255 | |
256 Token *Lexer::freelist = NULL; | |
257 StringTable Lexer::stringtable; | |
258 OutBuffer Lexer::stringbuffer; | |
259 | |
260 Lexer::Lexer(Module *mod, | |
261 unsigned char *base, unsigned begoffset, unsigned endoffset, | |
262 int doDocComment, int commentToken) | |
263 : loc(mod, 1) | |
264 { | |
265 //printf("Lexer::Lexer(%p,%d)\n",base,length); | |
266 //printf("lexer.mod = %p, %p\n", mod, this->loc.mod); | |
267 memset(&token,0,sizeof(token)); | |
268 this->base = base; | |
269 this->end = base + endoffset; | |
270 p = base + begoffset; | |
271 this->mod = mod; | |
272 this->doDocComment = doDocComment; | |
273 this->anyToken = 0; | |
274 this->commentToken = commentToken; | |
275 //initKeywords(); | |
276 | |
277 /* If first line starts with '#!', ignore the line | |
278 */ | |
279 | |
280 if (p[0] == '#' && p[1] =='!') | |
281 { | |
282 p += 2; | |
283 while (1) | |
284 { unsigned char c = *p; | |
285 switch (c) | |
286 { | |
287 case '\n': | |
288 p++; | |
289 break; | |
290 | |
291 case '\r': | |
292 p++; | |
293 if (*p == '\n') | |
294 p++; | |
295 break; | |
296 | |
297 case 0: | |
298 case 0x1A: | |
299 break; | |
300 | |
301 default: | |
302 if (c & 0x80) | |
303 { unsigned u = decodeUTF(); | |
304 if (u == PS || u == LS) | |
305 break; | |
306 } | |
307 p++; | |
308 continue; | |
309 } | |
310 break; | |
311 } | |
312 loc.linnum = 2; | |
313 } | |
314 } | |
315 | |
316 | |
317 void Lexer::error(const char *format, ...) | |
318 { | |
319 if (mod && !global.gag) | |
320 { | |
321 char *p = loc.toChars(); | |
322 if (*p) | |
323 fprintf(stdmsg, "%s: ", p); | |
324 mem.free(p); | |
325 | |
326 va_list ap; | |
327 va_start(ap, format); | |
328 vfprintf(stdmsg, format, ap); | |
329 va_end(ap); | |
330 | |
331 fprintf(stdmsg, "\n"); | |
332 fflush(stdmsg); | |
333 | |
334 if (global.errors >= 20) // moderate blizzard of cascading messages | |
335 fatal(); | |
336 } | |
337 global.errors++; | |
338 } | |
339 | |
340 void Lexer::error(Loc loc, const char *format, ...) | |
341 { | |
342 if (mod && !global.gag) | |
343 { | |
344 char *p = loc.toChars(); | |
345 if (*p) | |
346 fprintf(stdmsg, "%s: ", p); | |
347 mem.free(p); | |
348 | |
349 va_list ap; | |
350 va_start(ap, format); | |
351 vfprintf(stdmsg, format, ap); | |
352 va_end(ap); | |
353 | |
354 fprintf(stdmsg, "\n"); | |
355 fflush(stdmsg); | |
356 | |
357 if (global.errors >= 20) // moderate blizzard of cascading messages | |
358 fatal(); | |
359 } | |
360 global.errors++; | |
361 } | |
362 | |
363 TOK Lexer::nextToken() | |
364 { Token *t; | |
365 | |
366 if (token.next) | |
367 { | |
368 t = token.next; | |
369 memcpy(&token,t,sizeof(Token)); | |
370 t->next = freelist; | |
371 freelist = t; | |
372 } | |
373 else | |
374 { | |
375 scan(&token); | |
376 } | |
377 //token.print(); | |
378 return token.value; | |
379 } | |
380 | |
381 Token *Lexer::peek(Token *ct) | |
382 { Token *t; | |
383 | |
384 if (ct->next) | |
385 t = ct->next; | |
386 else | |
387 { | |
388 t = new Token(); | |
389 scan(t); | |
390 t->next = NULL; | |
391 ct->next = t; | |
392 } | |
393 return t; | |
394 } | |
395 | |
396 /********************************* | |
397 * tk is on the opening (. | |
398 * Look ahead and return token that is past the closing ). | |
399 */ | |
400 | |
401 Token *Lexer::peekPastParen(Token *tk) | |
402 { | |
403 //printf("peekPastParen()\n"); | |
404 int parens = 1; | |
405 int curlynest = 0; | |
406 while (1) | |
407 { | |
408 tk = peek(tk); | |
409 //tk->print(); | |
410 switch (tk->value) | |
411 { | |
412 case TOKlparen: | |
413 parens++; | |
414 continue; | |
415 | |
416 case TOKrparen: | |
417 --parens; | |
418 if (parens) | |
419 continue; | |
420 tk = peek(tk); | |
421 break; | |
422 | |
423 case TOKlcurly: | |
424 curlynest++; | |
425 continue; | |
426 | |
427 case TOKrcurly: | |
428 if (--curlynest >= 0) | |
429 continue; | |
430 break; | |
431 | |
432 case TOKsemicolon: | |
433 if (curlynest) | |
434 continue; | |
435 break; | |
436 | |
437 case TOKeof: | |
438 break; | |
439 | |
440 default: | |
441 continue; | |
442 } | |
443 return tk; | |
444 } | |
445 } | |
446 | |
447 /********************************** | |
448 * Determine if string is a valid Identifier. | |
449 * Placed here because of commonality with Lexer functionality. | |
450 * Returns: | |
451 * 0 invalid | |
452 */ | |
453 | |
454 int Lexer::isValidIdentifier(char *p) | |
455 { | |
456 size_t len; | |
457 size_t idx; | |
458 | |
459 if (!p || !*p) | |
460 goto Linvalid; | |
461 | |
462 if (isdigit(*p)) | |
463 goto Linvalid; | |
464 | |
465 len = strlen(p); | |
466 idx = 0; | |
467 while (p[idx]) | |
468 { dchar_t dc; | |
469 | |
470 char *q = utf_decodeChar((unsigned char *)p, len, &idx, &dc); | |
471 if (q) | |
472 goto Linvalid; | |
473 | |
474 if (!((dc >= 0x80 && isUniAlpha(dc)) || isalnum(dc) || dc == '_')) | |
475 goto Linvalid; | |
476 } | |
477 return 1; | |
478 | |
479 Linvalid: | |
480 return 0; | |
481 } | |
482 | |
483 /**************************** | |
484 * Turn next token in buffer into a token. | |
485 */ | |
486 | |
// Lexer::scan -- read the next token starting at p and describe it in *t.
// t->ptr is set to the first character of the token and t->value to its
// kind. White space is skipped; comments are skipped too unless commentToken
// is set, in which case they are returned as TOKcomment. When doDocComment is
// set, "/**", "/++" and "///" comments are handed to getDocComment().
// loc.linnum is advanced for each line break consumed here.
487 void Lexer::scan(Token *t) | |
488 { | |
// lastLine: line number on entry. linnum: line on which the comment being
// skipped started. getDocComment() receives whether the two are equal.
489 unsigned lastLine = loc.linnum; | |
490 unsigned linnum; | |
491 | |
492 t->blockComment = NULL; | |
493 t->lineComment = NULL; | |
494 while (1) | |
495 { | |
496 t->ptr = p; | |
497 //printf("p = %p, *p = '%c'\n",p,*p); | |
498 switch (*p) | |
499 { | |
500 case 0: | |
501 case 0x1A: | |
502 t->value = TOKeof; // end of file | |
503 return; | |
504 | |
505 case ' ': | |
506 case '\t': | |
507 case '\v': | |
508 case '\f': | |
509 p++; | |
510 continue; // skip white space | |
511 | |
512 case '\r': | |
513 p++; | |
// For CR LF the line is counted when the '\n' is reached on the next pass.
514 if (*p != '\n') // if CR stands by itself | |
515 loc.linnum++; | |
516 continue; // skip white space | |
517 | |
518 case '\n': | |
519 p++; | |
520 loc.linnum++; | |
521 continue; // skip white space | |
522 | |
523 case '0': case '1': case '2': case '3': case '4': | |
524 case '5': case '6': case '7': case '8': case '9': | |
525 t->value = number(t); | |
526 return; | |
527 | |
528 #if CSTRINGS | |
529 case '\'': | |
530 t->value = charConstant(t, 0); | |
531 return; | |
532 | |
533 case '"': | |
534 t->value = stringConstant(t,0); | |
535 return; | |
536 | |
537 case 'l': | |
538 case 'L': | |
539 if (p[1] == '\'') | |
540 { | |
541 p++; | |
542 t->value = charConstant(t, 1); | |
543 return; | |
544 } | |
545 else if (p[1] == '"') | |
546 { | |
547 p++; | |
548 t->value = stringConstant(t, 1); | |
549 return; | |
550 } | |
551 #else | |
552 case '\'': | |
553 t->value = charConstant(t,0); | |
554 return; | |
555 | |
// 'r' immediately followed by '"' starts a wysiwyg string; any other 'r'
// starts an identifier. After p++ this case falls through to the '`' case
// with *p being the quote that must close the string.
556 case 'r': | |
557 if (p[1] != '"') | |
558 goto case_ident; | |
559 p++; | |
560 case '`': | |
561 t->value = wysiwygStringConstant(t, *p); | |
562 return; | |
563 | |
// 'x' immediately followed by '"' starts a hex string; otherwise an identifier.
564 case 'x': | |
565 if (p[1] != '"') | |
566 goto case_ident; | |
567 p++; | |
568 t->value = hexStringConstant(t); | |
569 return; | |
570 | |
571 | |
572 case '"': | |
573 t->value = escapeStringConstant(t,0); | |
574 return; | |
575 | |
// A run of adjacent backslash escapes is collected into one TOKstring whose
// text is copied out of stringbuffer into freshly allocated memory.
576 case '\\': // escaped string literal | |
577 { unsigned c; | |
578 | |
579 stringbuffer.reset(); | |
580 do | |
581 { | |
582 p++; | |
583 c = escapeSequence(); | |
584 stringbuffer.writeUTF8(c); | |
585 } while (*p == '\\'); | |
// len excludes the terminating 0 byte written just below.
586 t->len = stringbuffer.offset; | |
587 stringbuffer.writeByte(0); | |
588 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); | |
589 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
590 t->postfix = 0; | |
591 t->value = TOKstring; | |
592 return; | |
593 } | |
594 | |
595 case 'l': | |
596 case 'L': | |
597 #endif | |
598 case 'a': case 'b': case 'c': case 'd': case 'e': | |
599 case 'f': case 'g': case 'h': case 'i': case 'j': | |
600 case 'k': case 'm': case 'n': case 'o': | |
601 case 'p': case 'q': /*case 'r':*/ case 's': case 't': | |
602 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': | |
603 case 'z': | |
604 case 'A': case 'B': case 'C': case 'D': case 'E': | |
605 case 'F': case 'G': case 'H': case 'I': case 'J': | |
606 case 'K': case 'M': case 'N': case 'O': | |
607 case 'P': case 'Q': case 'R': case 'S': case 'T': | |
608 case 'U': case 'V': case 'W': case 'X': case 'Y': | |
609 case 'Z': | |
610 case '_': | |
// Identifiers and keywords. Also reached by goto from the 'r' and 'x' cases
// and from the default case when a non-ASCII isUniAlpha() character is seen.
611 case_ident: | |
612 { unsigned char c; | |
613 StringValue *sv; | |
614 Identifier *id; | |
615 | |
// Consume while the next character is an ASCII identifier character or a
// non-ASCII character accepted by isUniAlpha().
616 do | |
617 { | |
618 c = *++p; | |
619 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF()))); | |
// Look the spelling up in the shared string table; an Identifier is created
// the first time a spelling is seen. The token kind comes from id->value.
620 sv = stringtable.update((char *)t->ptr, p - t->ptr); | |
621 id = (Identifier *) sv->ptrvalue; | |
622 if (!id) | |
623 { id = new Identifier(sv->lstring.string,TOKidentifier); | |
624 sv->ptrvalue = id; | |
625 } | |
626 t->ident = id; | |
627 t->value = (enum TOK) id->value; | |
628 anyToken = 1; | |
// Identifiers starting with '_' are compared against the special names
// Id::FILE, Id::LINE, Id::DATE, Id::TIME, Id::VENDOR, Id::TIMESTAMP and
// Id::VERSIONX, and are replaced by a string or integer literal on a match.
629 if (*t->ptr == '_') // if special identifier token | |
630 { | |
631 static char date[11+1]; | |
632 static char time[8+1]; | |
633 static char timestamp[24+1]; | |
634 | |
// The three buffers are filled once, from the text returned by ctime().
// NOTE: the locals 't' and 'p' below shadow the Token parameter and the
// member p for the duration of this inner block.
635 if (!date[0]) // lazy evaluation | |
636 { time_t t; | |
637 char *p; | |
638 | |
639 ::time(&t); | |
640 p = ctime(&t); | |
641 assert(p); | |
642 sprintf(date, "%.6s %.4s", p + 4, p + 20); | |
643 sprintf(time, "%.8s", p + 11); | |
644 sprintf(timestamp, "%.24s", p); | |
645 } | |
646 | |
647 if (mod && id == Id::FILE) | |
648 { | |
649 t->ustring = (unsigned char *)(loc.filename ? loc.filename : mod->ident->toChars()); | |
650 goto Lstring; | |
651 } | |
652 else if (mod && id == Id::LINE) | |
653 { | |
654 t->value = TOKint64v; | |
655 t->uns64value = loc.linnum; | |
656 } | |
657 else if (id == Id::DATE) | |
658 { | |
659 t->ustring = (unsigned char *)date; | |
660 goto Lstring; | |
661 } | |
662 else if (id == Id::TIME) | |
663 { | |
664 t->ustring = (unsigned char *)time; | |
665 goto Lstring; | |
666 } | |
667 else if (id == Id::VENDOR) | |
668 { | |
669 t->ustring = (unsigned char *)"Digital Mars D"; | |
670 goto Lstring; | |
671 } | |
672 else if (id == Id::TIMESTAMP) | |
673 { | |
674 t->ustring = (unsigned char *)timestamp; | |
// Shared tail for all string-valued special tokens (target of the gotos above).
675 Lstring: | |
676 t->value = TOKstring; | |
677 Llen: | |
678 t->postfix = 0; | |
679 t->len = strlen((char *)t->ustring); | |
680 } | |
681 else if (id == Id::VERSIONX) | |
682 { unsigned major = 0; | |
683 unsigned minor = 0; | |
684 | |
// Parse global.version, skipping its first character, as digits separated by
// '.'; the result is major * 1000 + minor. Parsing stops at the first
// character that is neither a digit nor '.'.
685 for (char *p = global.version + 1; 1; p++) | |
686 { | |
687 char c = *p; | |
688 if (isdigit(c)) | |
689 minor = minor * 10 + c - '0'; | |
690 else if (c == '.') | |
691 { major = minor; | |
692 minor = 0; | |
693 } | |
694 else | |
695 break; | |
696 } | |
697 t->value = TOKint64v; | |
698 t->uns64value = major * 1000 + minor; | |
699 } | |
700 } | |
701 //printf("t->value = %d\n",t->value); | |
702 return; | |
703 } | |
704 | |
// '/' starts "/=", a block comment, a line comment, a nesting "/+" comment,
// or is the plain divide operator.
705 case '/': | |
706 p++; | |
707 switch (*p) | |
708 { | |
709 case '=': | |
710 p++; | |
711 t->value = TOKdivass; | |
712 return; | |
713 | |
714 case '*': | |
715 p++; | |
716 linnum = loc.linnum; | |
// Outer loop: repeat until a '/' is found that really closes the comment.
// Inner loop: advance to the next '/', counting line breaks on the way.
717 while (1) | |
718 { | |
719 while (1) | |
720 { unsigned char c = *p; | |
721 switch (c) | |
722 { | |
723 case '/': | |
724 break; | |
725 | |
726 case '\n': | |
727 loc.linnum++; | |
728 p++; | |
729 continue; | |
730 | |
731 case '\r': | |
732 p++; | |
733 if (*p != '\n') | |
734 loc.linnum++; | |
735 continue; | |
736 | |
737 case 0: | |
738 case 0x1A: | |
739 error("unterminated /* */ comment"); | |
740 p = end; | |
741 t->value = TOKeof; | |
742 return; | |
743 | |
744 default: | |
745 if (c & 0x80) | |
746 { unsigned u = decodeUTF(); | |
747 if (u == PS || u == LS) | |
748 loc.linnum++; | |
749 } | |
750 p++; | |
751 continue; | |
752 } | |
753 break; | |
754 } | |
755 p++; | |
// The '/' just consumed closes the comment only if it is preceded by '*',
// and that '*' is not the one belonging to the opening "/*" (i.e. "/*/" does
// not close the comment).
756 if (p[-2] == '*' && p - 3 != t->ptr) | |
757 break; | |
758 } | |
759 if (commentToken) | |
760 { | |
761 t->value = TOKcomment; | |
762 return; | |
763 } | |
764 else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr) | |
765 { // if /** but not /**/ | |
766 getDocComment(t, lastLine == linnum); | |
767 } | |
768 continue; | |
769 | |
770 case '/': // do // style comments | |
771 linnum = loc.linnum; | |
// The pre-increment steps past the second '/' on the first iteration. The
// loop exits with p on the last character of the line ending, which is
// consumed after the loop.
772 while (1) | |
773 { unsigned char c = *++p; | |
774 switch (c) | |
775 { | |
776 case '\n': | |
777 break; | |
778 | |
779 case '\r': | |
780 if (p[1] == '\n') | |
781 p++; | |
782 break; | |
783 | |
784 case 0: | |
785 case 0x1A: | |
786 if (commentToken) | |
787 { | |
788 p = end; | |
789 t->value = TOKcomment; | |
790 return; | |
791 } | |
792 if (doDocComment && t->ptr[2] == '/') | |
793 getDocComment(t, lastLine == linnum); | |
794 p = end; | |
795 t->value = TOKeof; | |
796 return; | |
797 | |
798 default: | |
799 if (c & 0x80) | |
800 { unsigned u = decodeUTF(); | |
801 if (u == PS || u == LS) | |
802 break; | |
803 } | |
804 continue; | |
805 } | |
806 break; | |
807 } | |
808 | |
809 if (commentToken) | |
810 { | |
811 p++; | |
812 loc.linnum++; | |
813 t->value = TOKcomment; | |
814 return; | |
815 } | |
816 if (doDocComment && t->ptr[2] == '/') | |
817 getDocComment(t, lastLine == linnum); | |
818 | |
819 p++; | |
820 loc.linnum++; | |
821 continue; | |
822 | |
823 case '+': | |
// 'nest' counts "/+" openers that have not yet been closed by "+/".
824 { int nest; | |
825 | |
826 linnum = loc.linnum; | |
827 p++; | |
828 nest = 1; | |
829 while (1) | |
830 { unsigned char c = *p; | |
831 switch (c) | |
832 { | |
833 case '/': | |
834 p++; | |
835 if (*p == '+') | |
836 { | |
837 p++; | |
838 nest++; | |
839 } | |
840 continue; | |
841 | |
842 case '+': | |
843 p++; | |
844 if (*p == '/') | |
845 { | |
846 p++; | |
847 if (--nest == 0) | |
848 break; | |
849 } | |
850 continue; | |
851 | |
852 case '\r': | |
853 p++; | |
854 if (*p != '\n') | |
855 loc.linnum++; | |
856 continue; | |
857 | |
858 case '\n': | |
859 loc.linnum++; | |
860 p++; | |
861 continue; | |
862 | |
863 case 0: | |
864 case 0x1A: | |
865 error("unterminated /+ +/ comment"); | |
866 p = end; | |
867 t->value = TOKeof; | |
868 return; | |
869 | |
870 default: | |
871 if (c & 0x80) | |
872 { unsigned u = decodeUTF(); | |
873 if (u == PS || u == LS) | |
874 loc.linnum++; | |
875 } | |
876 p++; | |
877 continue; | |
878 } | |
879 break; | |
880 } | |
881 if (commentToken) | |
882 { | |
883 t->value = TOKcomment; | |
884 return; | |
885 } | |
886 if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr) | |
887 { // if /++ but not /++/ | |
888 getDocComment(t, lastLine == linnum); | |
889 } | |
890 continue; | |
891 } | |
892 } | |
// Not "/=" and not a comment: plain divide (p is already past the '/').
893 t->value = TOKdiv; | |
894 return; | |
895 | |
// '.' is the start of a floating point literal (".5"), "..", "...", or a dot.
896 case '.': | |
897 p++; | |
898 if (isdigit(*p)) | |
899 { /* Note that we don't allow ._1 and ._ as being | |
900 * valid floating point numbers. | |
901 */ | |
// Step back onto the '.' so inreal() sees the whole literal.
902 p--; | |
903 t->value = inreal(t); | |
904 } | |
905 else if (p[0] == '.') | |
906 { | |
907 if (p[1] == '.') | |
908 { p += 2; | |
909 t->value = TOKdotdotdot; | |
910 } | |
911 else | |
912 { p++; | |
913 t->value = TOKslice; | |
914 } | |
915 } | |
916 else | |
917 t->value = TOKdot; | |
918 return; | |
919 | |
920 case '&': | |
921 p++; | |
922 if (*p == '=') | |
923 { p++; | |
924 t->value = TOKandass; | |
925 } | |
926 else if (*p == '&') | |
927 { p++; | |
928 t->value = TOKandand; | |
929 } | |
930 else | |
931 t->value = TOKand; | |
932 return; | |
933 | |
934 case '|': | |
935 p++; | |
936 if (*p == '=') | |
937 { p++; | |
938 t->value = TOKorass; | |
939 } | |
940 else if (*p == '|') | |
941 { p++; | |
942 t->value = TOKoror; | |
943 } | |
944 else | |
945 t->value = TOKor; | |
946 return; | |
947 | |
948 case '-': | |
949 p++; | |
950 if (*p == '=') | |
951 { p++; | |
952 t->value = TOKminass; | |
953 } | |
954 #if 0 | |
955 else if (*p == '>') | |
956 { p++; | |
957 t->value = TOKarrow; | |
958 } | |
959 #endif | |
960 else if (*p == '-') | |
961 { p++; | |
962 t->value = TOKminusminus; | |
963 } | |
964 else | |
965 t->value = TOKmin; | |
966 return; | |
967 | |
968 case '+': | |
969 p++; | |
970 if (*p == '=') | |
971 { p++; | |
972 t->value = TOKaddass; | |
973 } | |
974 else if (*p == '+') | |
975 { p++; | |
976 t->value = TOKplusplus; | |
977 } | |
978 else | |
979 t->value = TOKadd; | |
980 return; | |
981 | |
982 case '<': | |
983 p++; | |
984 if (*p == '=') | |
985 { p++; | |
986 t->value = TOKle; // <= | |
987 } | |
988 else if (*p == '<') | |
989 { p++; | |
990 if (*p == '=') | |
991 { p++; | |
992 t->value = TOKshlass; // <<= | |
993 } | |
994 else | |
995 t->value = TOKshl; // << | |
996 } | |
997 else if (*p == '>') | |
998 { p++; | |
999 if (*p == '=') | |
1000 { p++; | |
1001 t->value = TOKleg; // <>= | |
1002 } | |
1003 else | |
1004 t->value = TOKlg; // <> | |
1005 } | |
1006 else | |
1007 t->value = TOKlt; // < | |
1008 return; | |
1009 | |
1010 case '>': | |
1011 p++; | |
1012 if (*p == '=') | |
1013 { p++; | |
1014 t->value = TOKge; // >= | |
1015 } | |
1016 else if (*p == '>') | |
1017 { p++; | |
1018 if (*p == '=') | |
1019 { p++; | |
1020 t->value = TOKshrass; // >>= | |
1021 } | |
1022 else if (*p == '>') | |
1023 { p++; | |
1024 if (*p == '=') | |
1025 { p++; | |
1026 t->value = TOKushrass; // >>>= | |
1027 } | |
1028 else | |
1029 t->value = TOKushr; // >>> | |
1030 } | |
1031 else | |
1032 t->value = TOKshr; // >> | |
1033 } | |
1034 else | |
1035 t->value = TOKgt; // > | |
1036 return; | |
1037 | |
1038 case '!': | |
1039 p++; | |
1040 if (*p == '=') | |
1041 { p++; | |
// "!==" is only recognised when global.params.Dversion is 1.
1042 if (*p == '=' && global.params.Dversion == 1) | |
1043 { p++; | |
1044 t->value = TOKnotidentity; // !== | |
1045 } | |
1046 else | |
1047 t->value = TOKnotequal; // != | |
1048 } | |
1049 else if (*p == '<') | |
1050 { p++; | |
1051 if (*p == '>') | |
1052 { p++; | |
1053 if (*p == '=') | |
1054 { p++; | |
1055 t->value = TOKunord; // !<>= | |
1056 } | |
1057 else | |
1058 t->value = TOKue; // !<> | |
1059 } | |
1060 else if (*p == '=') | |
1061 { p++; | |
1062 t->value = TOKug; // !<= | |
1063 } | |
1064 else | |
1065 t->value = TOKuge; // !< | |
1066 } | |
1067 else if (*p == '>') | |
1068 { p++; | |
1069 if (*p == '=') | |
1070 { p++; | |
1071 t->value = TOKul; // !>= | |
1072 } | |
1073 else | |
1074 t->value = TOKule; // !> | |
1075 } | |
1076 else | |
1077 t->value = TOKnot; // ! | |
1078 return; | |
1079 | |
1080 case '=': | |
1081 p++; | |
1082 if (*p == '=') | |
1083 { p++; | |
// "===" is only recognised when global.params.Dversion is 1.
1084 if (*p == '=' && global.params.Dversion == 1) | |
1085 { p++; | |
1086 t->value = TOKidentity; // === | |
1087 } | |
1088 else | |
1089 t->value = TOKequal; // == | |
1090 } | |
1091 else | |
1092 t->value = TOKassign; // = | |
1093 return; | |
1094 | |
1095 case '~': | |
1096 p++; | |
1097 if (*p == '=') | |
1098 { p++; | |
1099 t->value = TOKcatass; // ~= | |
1100 } | |
1101 else | |
1102 t->value = TOKtilde; // ~ | |
1103 return; | |
1104 | |
// SINGLE(c, tok): a one-character token; expands to a complete case that
// consumes the character and returns.
1105 #define SINGLE(c,tok) case c: p++; t->value = tok; return; | |
1106 | |
1107 SINGLE('(', TOKlparen) | |
1108 SINGLE(')', TOKrparen) | |
1109 SINGLE('[', TOKlbracket) | |
1110 SINGLE(']', TOKrbracket) | |
1111 SINGLE('{', TOKlcurly) | |
1112 SINGLE('}', TOKrcurly) | |
1113 SINGLE('?', TOKquestion) | |
1114 SINGLE(',', TOKcomma) | |
1115 SINGLE(';', TOKsemicolon) | |
1116 SINGLE(':', TOKcolon) | |
1117 SINGLE('$', TOKdollar) | |
1118 | |
1119 #undef SINGLE | |
1120 | |
// DOUBLE(c1, tok1, c2, tok2): c1 alone yields tok1; c1 followed by c2 yields
// tok2. Expands to a complete case that returns.
1121 #define DOUBLE(c1,tok1,c2,tok2) \ | |
1122 case c1: \ | |
1123 p++; \ | |
1124 if (*p == c2) \ | |
1125 { p++; \ | |
1126 t->value = tok2; \ | |
1127 } \ | |
1128 else \ | |
1129 t->value = tok1; \ | |
1130 return; | |
1131 | |
1132 DOUBLE('*', TOKmul, '=', TOKmulass) | |
1133 DOUBLE('%', TOKmod, '=', TOKmodass) | |
1134 DOUBLE('^', TOKxor, '=', TOKxorass) | |
1135 | |
1136 #undef DOUBLE | |
1137 | |
// '#': consume it, hand the rest to pragma(), then keep scanning for a token.
1138 case '#': | |
1139 p++; | |
1140 pragma(); | |
1141 continue; | |
1142 | |
// Anything else: a non-ASCII character may start an identifier or be a
// Unicode line/paragraph separator; every other character is reported as
// unsupported and skipped.
1143 default: | |
1144 { unsigned char c = *p; | |
1145 | |
1146 if (c & 0x80) | |
1147 { unsigned u = decodeUTF(); | |
1148 | |
1149 // Check for start of unicode identifier | |
1150 if (isUniAlpha(u)) | |
1151 goto case_ident; | |
1152 | |
1153 if (u == PS || u == LS) | |
1154 { | |
1155 loc.linnum++; | |
1156 p++; | |
1157 continue; | |
1158 } | |
1159 } | |
1160 if (isprint(c)) | |
1161 error("unsupported char '%c'", c); | |
1162 else | |
1163 error("unsupported char 0x%02x", c); | |
1164 p++; | |
1165 continue; | |
1166 } | |
1167 } | |
1168 } | |
1169 } | |
1170 | |
1171 /******************************************* | |
1172 * Parse escape sequence. | |
1173 */ | |
1174 | |
// Lexer::escapeSequence -- decode one escape sequence.
// On entry p points at the character that follows the backslash. Returns the
// character value the sequence stands for. For recognised sequences p is
// advanced past the sequence; malformed ones are reported through error().
// At end of input (0 or 0x1A) nothing is consumed and '\\' is returned.
1175 unsigned Lexer::escapeSequence() | |
1176 { unsigned c; | |
1177 int n; | |
1178 int ndigits; | |
1179 | |
1180 c = *p; | |
1181 switch (c) | |
1182 { | |
// These four stand for themselves: c already holds the result.
1183 case '\'': | |
1184 case '"': | |
1185 case '?': | |
1186 case '\\': | |
1187 Lconsume: | |
1188 p++; | |
1189 break; | |
1190 | |
// Single-letter escapes: set the control character and consume the letter.
1191 case 'a': c = 7; goto Lconsume; | |
1192 case 'b': c = 8; goto Lconsume; | |
1193 case 'f': c = 12; goto Lconsume; | |
1194 case 'n': c = 10; goto Lconsume; | |
1195 case 'r': c = 13; goto Lconsume; | |
1196 case 't': c = 9; goto Lconsume; | |
1197 case 'v': c = 11; goto Lconsume; | |
1198 | |
// \uXXXX, \UXXXXXXXX and \xXX share the hex-digit reader at Lhex; ndigits is
// the number of hex digits each form expects.
1199 case 'u': | |
1200 ndigits = 4; | |
1201 goto Lhex; | |
1202 case 'U': | |
1203 ndigits = 8; | |
1204 goto Lhex; | |
1205 case 'x': | |
1206 ndigits = 2; | |
1207 Lhex: | |
1208 p++; | |
1209 c = *p; | |
1210 if (ishex(c)) | |
1211 { unsigned v; | |
1212 | |
1213 n = 0; | |
1214 v = 0; | |
// Accumulate up to ndigits hex digits in v; a shorter run is an error but
// the digits read so far are still used.
1215 while (1) | |
1216 { | |
1217 if (isdigit(c)) | |
1218 c -= '0'; | |
1219 else if (islower(c)) | |
1220 c -= 'a' - 10; | |
1221 else | |
1222 c -= 'A' - 10; | |
1223 v = v * 16 + c; | |
1224 c = *++p; | |
1225 if (++n == ndigits) | |
1226 break; | |
1227 if (!ishex(c)) | |
1228 { error("escape hex sequence has %d hex digits instead of %d", n, ndigits); | |
1229 break; | |
1230 } | |
1231 } | |
// Only the \u and \U forms are checked with utf_isValidDchar(); \x is not.
1232 if (ndigits != 2 && !utf_isValidDchar(v)) | |
1233 error("invalid UTF character \\U%08x", v); | |
1234 c = v; | |
1235 } | |
1236 else | |
1237 error("undefined escape hex sequence \\%c\n",c); | |
1238 break; | |
1239 | |
// \&name; -- the text between '&' and ';' is looked up with HtmlNamedEntity().
1240 case '&': // named character entity | |
1241 for (unsigned char *idstart = ++p; 1; p++) | |
1242 { | |
1243 switch (*p) | |
1244 { | |
1245 case ';': | |
1246 c = HtmlNamedEntity(idstart, p - idstart); | |
// A result equal to ~0 is treated as "name not found"; a space is
// substituted after reporting the error.
1247 if (c == ~0) | |
1248 { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart); | |
1249 c = ' '; | |
1250 } | |
1251 p++; | |
1252 break; | |
1253 | |
1254 default: | |
1255 if (isalpha(*p) || | |
1256 (p != idstart + 1 && isdigit(*p))) | |
1257 continue; | |
1258 error("unterminated named entity"); | |
1259 break; | |
1260 } | |
1261 break; | |
1262 } | |
1263 break; | |
1264 | |
1265 case 0: | |
1266 case 0x1A: // end of file | |
1267 c = '\\'; | |
1268 break; | |
1269 | |
1270 default: | |
1271 if (isoctal(c)) | |
// Up to three octal digits. v is an unsigned char, so the accumulated value
// is truncated to 8 bits.
1272 { unsigned char v; | |
1273 | |
1274 n = 0; | |
1275 v = 0; | |
1276 do | |
1277 { | |
1278 v = v * 8 + (c - '0'); | |
1279 c = *++p; | |
1280 } while (++n < 3 && isoctal(c)); | |
1281 c = v; | |
1282 } | |
1283 else | |
1284 error("undefined escape sequence \\%c\n",c); | |
1285 break; | |
1286 } | |
1287 return c; | |
1288 } | |
1289 | |
1290 /************************************** | |
1291 */ | |
1292 | |
1293 TOK Lexer::wysiwygStringConstant(Token *t, int tc) | |
1294 { unsigned c; | |
1295 Loc start = loc; | |
1296 | |
1297 p++; | |
1298 stringbuffer.reset(); | |
1299 while (1) | |
1300 { | |
1301 c = *p++; | |
1302 switch (c) | |
1303 { | |
1304 case '\n': | |
1305 loc.linnum++; | |
1306 break; | |
1307 | |
1308 case '\r': | |
1309 if (*p == '\n') | |
1310 continue; // ignore | |
1311 c = '\n'; // treat EndOfLine as \n character | |
1312 loc.linnum++; | |
1313 break; | |
1314 | |
1315 case 0: | |
1316 case 0x1A: | |
1317 error("unterminated string constant starting at %s", start.toChars()); | |
1318 t->ustring = (unsigned char *)""; | |
1319 t->len = 0; | |
1320 t->postfix = 0; | |
1321 return TOKstring; | |
1322 | |
1323 case '"': | |
1324 case '`': | |
1325 if (c == tc) | |
1326 { | |
1327 t->len = stringbuffer.offset; | |
1328 stringbuffer.writeByte(0); | |
1329 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); | |
1330 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1331 stringPostfix(t); | |
1332 return TOKstring; | |
1333 } | |
1334 break; | |
1335 | |
1336 default: | |
1337 if (c & 0x80) | |
1338 { p--; | |
1339 unsigned u = decodeUTF(); | |
1340 p++; | |
1341 if (u == PS || u == LS) | |
1342 loc.linnum++; | |
1343 stringbuffer.writeUTF8(u); | |
1344 continue; | |
1345 } | |
1346 break; | |
1347 } | |
1348 stringbuffer.writeByte(c); | |
1349 } | |
1350 } | |
1351 | |
1352 /************************************** | |
1353 * Lex hex strings: | |
1354 * x"0A ae 34FE BD" | |
1355 */ | |
1356 | |
1357 TOK Lexer::hexStringConstant(Token *t) | |
1358 { unsigned c; | |
1359 Loc start = loc; | |
1360 unsigned n = 0; | |
1361 unsigned v; | |
1362 | |
1363 p++; | |
1364 stringbuffer.reset(); | |
1365 while (1) | |
1366 { | |
1367 c = *p++; | |
1368 switch (c) | |
1369 { | |
1370 case ' ': | |
1371 case '\t': | |
1372 case '\v': | |
1373 case '\f': | |
1374 continue; // skip white space | |
1375 | |
1376 case '\r': | |
1377 if (*p == '\n') | |
1378 continue; // ignore | |
1379 // Treat isolated '\r' as if it were a '\n' | |
1380 case '\n': | |
1381 loc.linnum++; | |
1382 continue; | |
1383 | |
1384 case 0: | |
1385 case 0x1A: | |
1386 error("unterminated string constant starting at %s", start.toChars()); | |
1387 t->ustring = (unsigned char *)""; | |
1388 t->len = 0; | |
1389 t->postfix = 0; | |
1390 return TOKstring; | |
1391 | |
1392 case '"': | |
1393 if (n & 1) | |
1394 { error("odd number (%d) of hex characters in hex string", n); | |
1395 stringbuffer.writeByte(v); | |
1396 } | |
1397 t->len = stringbuffer.offset; | |
1398 stringbuffer.writeByte(0); | |
1399 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); | |
1400 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1401 stringPostfix(t); | |
1402 return TOKstring; | |
1403 | |
1404 default: | |
1405 if (c >= '0' && c <= '9') | |
1406 c -= '0'; | |
1407 else if (c >= 'a' && c <= 'f') | |
1408 c -= 'a' - 10; | |
1409 else if (c >= 'A' && c <= 'F') | |
1410 c -= 'A' - 10; | |
1411 else if (c & 0x80) | |
1412 { p--; | |
1413 unsigned u = decodeUTF(); | |
1414 p++; | |
1415 if (u == PS || u == LS) | |
1416 loc.linnum++; | |
1417 else | |
1418 error("non-hex character \\u%x", u); | |
1419 } | |
1420 else | |
1421 error("non-hex character '%c'", c); | |
1422 if (n & 1) | |
1423 { v = (v << 4) | c; | |
1424 stringbuffer.writeByte(v); | |
1425 } | |
1426 else | |
1427 v = c; | |
1428 n++; | |
1429 break; | |
1430 } | |
1431 } | |
1432 } | |
1433 | |
1434 /************************************** | |
1435 */ | |
1436 | |
1437 TOK Lexer::escapeStringConstant(Token *t, int wide) | |
1438 { unsigned c; | |
1439 Loc start = loc; | |
1440 | |
1441 p++; | |
1442 stringbuffer.reset(); | |
1443 while (1) | |
1444 { | |
1445 c = *p++; | |
1446 switch (c) | |
1447 { | |
1448 case '\\': | |
1449 switch (*p) | |
1450 { | |
1451 case 'u': | |
1452 case 'U': | |
1453 case '&': | |
1454 c = escapeSequence(); | |
1455 stringbuffer.writeUTF8(c); | |
1456 continue; | |
1457 | |
1458 default: | |
1459 c = escapeSequence(); | |
1460 break; | |
1461 } | |
1462 break; | |
1463 | |
1464 case '\n': | |
1465 loc.linnum++; | |
1466 break; | |
1467 | |
1468 case '\r': | |
1469 if (*p == '\n') | |
1470 continue; // ignore | |
1471 c = '\n'; // treat EndOfLine as \n character | |
1472 loc.linnum++; | |
1473 break; | |
1474 | |
1475 case '"': | |
1476 t->len = stringbuffer.offset; | |
1477 stringbuffer.writeByte(0); | |
1478 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); | |
1479 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1480 stringPostfix(t); | |
1481 return TOKstring; | |
1482 | |
1483 case 0: | |
1484 case 0x1A: | |
1485 p--; | |
1486 error("unterminated string constant starting at %s", start.toChars()); | |
1487 t->ustring = (unsigned char *)""; | |
1488 t->len = 0; | |
1489 t->postfix = 0; | |
1490 return TOKstring; | |
1491 | |
1492 default: | |
1493 if (c & 0x80) | |
1494 { | |
1495 p--; | |
1496 c = decodeUTF(); | |
1497 if (c == LS || c == PS) | |
1498 { c = '\n'; | |
1499 loc.linnum++; | |
1500 } | |
1501 p++; | |
1502 stringbuffer.writeUTF8(c); | |
1503 continue; | |
1504 } | |
1505 break; | |
1506 } | |
1507 stringbuffer.writeByte(c); | |
1508 } | |
1509 } | |
1510 | |
1511 /************************************** | |
1512 */ | |
1513 | |
1514 TOK Lexer::charConstant(Token *t, int wide) | |
1515 { | |
1516 unsigned c; | |
1517 TOK tk = TOKcharv; | |
1518 | |
1519 //printf("Lexer::charConstant\n"); | |
1520 p++; | |
1521 c = *p++; | |
1522 switch (c) | |
1523 { | |
1524 case '\\': | |
1525 switch (*p) | |
1526 { | |
1527 case 'u': | |
1528 t->uns64value = escapeSequence(); | |
1529 tk = TOKwcharv; | |
1530 break; | |
1531 | |
1532 case 'U': | |
1533 case '&': | |
1534 t->uns64value = escapeSequence(); | |
1535 tk = TOKdcharv; | |
1536 break; | |
1537 | |
1538 default: | |
1539 t->uns64value = escapeSequence(); | |
1540 break; | |
1541 } | |
1542 break; | |
1543 | |
1544 case '\n': | |
1545 L1: | |
1546 loc.linnum++; | |
1547 case '\r': | |
1548 case 0: | |
1549 case 0x1A: | |
1550 case '\'': | |
1551 error("unterminated character constant"); | |
1552 return tk; | |
1553 | |
1554 default: | |
1555 if (c & 0x80) | |
1556 { | |
1557 p--; | |
1558 c = decodeUTF(); | |
1559 p++; | |
1560 if (c == LS || c == PS) | |
1561 goto L1; | |
1562 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) | |
1563 tk = TOKwcharv; | |
1564 else | |
1565 tk = TOKdcharv; | |
1566 } | |
1567 t->uns64value = c; | |
1568 break; | |
1569 } | |
1570 | |
1571 if (*p != '\'') | |
1572 { error("unterminated character constant"); | |
1573 return tk; | |
1574 } | |
1575 p++; | |
1576 return tk; | |
1577 } | |
1578 | |
1579 /*************************************** | |
1580 * Get postfix of string literal. | |
1581 */ | |
1582 | |
1583 void Lexer::stringPostfix(Token *t) | |
1584 { | |
1585 switch (*p) | |
1586 { | |
1587 case 'c': | |
1588 case 'w': | |
1589 case 'd': | |
1590 t->postfix = *p; | |
1591 p++; | |
1592 break; | |
1593 | |
1594 default: | |
1595 t->postfix = 0; | |
1596 break; | |
1597 } | |
1598 } | |
1599 | |
/***************************************
 * Read \u or \U unicode sequence
 * Input:
 *      u       'u' or 'U'
 * Returns:
 *      the value of the hex digits read (8 are expected after 'U',
 *      otherwise 4)
 *
 * Note: this function is compiled out (#if 0).
 */

#if 0
unsigned Lexer::wchar(unsigned u)
{
    const unsigned ndigits = (u == 'U') ? 8 : 4;
    unsigned result = 0;
    unsigned count = 0;

    while (1)
    {
        ++p;                            // step over the previous character
        if (count == ndigits)
            break;

        unsigned char digit = *p;
        if (!ishex(digit))
        {   error("\\%c sequence must be followed by %d hex characters", u, ndigits);
            break;
        }

        // Convert the hex character to its numeric value.
        if (isdigit(digit))
            digit -= '0';
        else if (islower(digit))
            digit -= 'a' - 10;
        else
            digit -= 'A' - 10;

        result = (result << 4) | digit;
        count++;
    }
    return result;
}
#endif
1638 | |
1639 /************************************** | |
1640 * Read in a number. | |
1641 * If it's an integer, store it in tok.TKutok.Vlong. | |
1642 * integers can be decimal, octal or hex | |
1643 * Handle the suffixes U, UL, LU, L, etc. | |
1644 * If it's double, store it in tok.TKutok.Vdouble. | |
1645 * Returns: | |
1646 * TKnum | |
1647 * TKdouble,... | |
1648 */ | |
1649 | |
1650 TOK Lexer::number(Token *t) | |
1651 { | |
1652 // We use a state machine to collect numbers | |
1653 enum STATE { STATE_initial, STATE_0, STATE_decimal, STATE_octal, STATE_octale, | |
1654 STATE_hex, STATE_binary, STATE_hex0, STATE_binary0, | |
1655 STATE_hexh, STATE_error }; | |
1656 enum STATE state; | |
1657 | |
1658 enum FLAGS | |
1659 { FLAGS_decimal = 1, // decimal | |
1660 FLAGS_unsigned = 2, // u or U suffix | |
1661 FLAGS_long = 4, // l or L suffix | |
1662 }; | |
1663 enum FLAGS flags = FLAGS_decimal; | |
1664 | |
1665 int i; | |
1666 int base; | |
1667 unsigned c; | |
1668 unsigned char *start; | |
1669 TOK result; | |
1670 | |
1671 //printf("Lexer::number()\n"); | |
1672 state = STATE_initial; | |
1673 base = 0; | |
1674 stringbuffer.reset(); | |
1675 start = p; | |
1676 while (1) | |
1677 { | |
1678 c = *p; | |
1679 switch (state) | |
1680 { | |
1681 case STATE_initial: // opening state | |
1682 if (c == '0') | |
1683 state = STATE_0; | |
1684 else | |
1685 state = STATE_decimal; | |
1686 break; | |
1687 | |
1688 case STATE_0: | |
1689 flags = (FLAGS) (flags & ~FLAGS_decimal); | |
1690 switch (c) | |
1691 { | |
1692 #if ZEROH | |
1693 case 'H': // 0h | |
1694 case 'h': | |
1695 goto hexh; | |
1696 #endif | |
1697 case 'X': | |
1698 case 'x': | |
1699 state = STATE_hex0; | |
1700 break; | |
1701 | |
1702 case '.': | |
1703 if (p[1] == '.') // .. is a separate token | |
1704 goto done; | |
1705 case 'i': | |
1706 case 'f': | |
1707 case 'F': | |
1708 goto real; | |
1709 #if ZEROH | |
1710 case 'E': | |
1711 case 'e': | |
1712 goto case_hex; | |
1713 #endif | |
1714 case 'B': | |
1715 case 'b': | |
1716 state = STATE_binary0; | |
1717 break; | |
1718 | |
1719 case '0': case '1': case '2': case '3': | |
1720 case '4': case '5': case '6': case '7': | |
1721 state = STATE_octal; | |
1722 break; | |
1723 | |
1724 #if ZEROH | |
1725 case '8': case '9': case 'A': | |
1726 case 'C': case 'D': case 'F': | |
1727 case 'a': case 'c': case 'd': case 'f': | |
1728 case_hex: | |
1729 state = STATE_hexh; | |
1730 break; | |
1731 #endif | |
1732 case '_': | |
1733 state = STATE_octal; | |
1734 p++; | |
1735 continue; | |
1736 | |
1737 case 'L': | |
1738 if (p[1] == 'i') | |
1739 goto real; | |
1740 goto done; | |
1741 | |
1742 default: | |
1743 goto done; | |
1744 } | |
1745 break; | |
1746 | |
1747 case STATE_decimal: // reading decimal number | |
1748 if (!isdigit(c)) | |
1749 { | |
1750 #if ZEROH | |
1751 if (ishex(c) | |
1752 || c == 'H' || c == 'h' | |
1753 ) | |
1754 goto hexh; | |
1755 #endif | |
1756 if (c == '_') // ignore embedded _ | |
1757 { p++; | |
1758 continue; | |
1759 } | |
1760 if (c == '.' && p[1] != '.') | |
1761 goto real; | |
1762 else if (c == 'i' || c == 'f' || c == 'F' || | |
1763 c == 'e' || c == 'E') | |
1764 { | |
1765 real: // It's a real number. Back up and rescan as a real | |
1766 p = start; | |
1767 return inreal(t); | |
1768 } | |
1769 else if (c == 'L' && p[1] == 'i') | |
1770 goto real; | |
1771 goto done; | |
1772 } | |
1773 break; | |
1774 | |
1775 case STATE_hex0: // reading hex number | |
1776 case STATE_hex: | |
1777 if (!ishex(c)) | |
1778 { | |
1779 if (c == '_') // ignore embedded _ | |
1780 { p++; | |
1781 continue; | |
1782 } | |
1783 if (c == '.' && p[1] != '.') | |
1784 goto real; | |
1785 if (c == 'P' || c == 'p' || c == 'i') | |
1786 goto real; | |
1787 if (state == STATE_hex0) | |
1788 error("Hex digit expected, not '%c'", c); | |
1789 goto done; | |
1790 } | |
1791 state = STATE_hex; | |
1792 break; | |
1793 | |
1794 #if ZEROH | |
1795 hexh: | |
1796 state = STATE_hexh; | |
1797 case STATE_hexh: // parse numbers like 0FFh | |
1798 if (!ishex(c)) | |
1799 { | |
1800 if (c == 'H' || c == 'h') | |
1801 { | |
1802 p++; | |
1803 base = 16; | |
1804 goto done; | |
1805 } | |
1806 else | |
1807 { | |
1808 // Check for something like 1E3 or 0E24 | |
1809 if (memchr((char *)stringbuffer.data, 'E', stringbuffer.offset) || | |
1810 memchr((char *)stringbuffer.data, 'e', stringbuffer.offset)) | |
1811 goto real; | |
1812 error("Hex digit expected, not '%c'", c); | |
1813 goto done; | |
1814 } | |
1815 } | |
1816 break; | |
1817 #endif | |
1818 | |
1819 case STATE_octal: // reading octal number | |
1820 case STATE_octale: // reading octal number with non-octal digits | |
1821 if (!isoctal(c)) | |
1822 { | |
1823 #if ZEROH | |
1824 if (ishex(c) | |
1825 || c == 'H' || c == 'h' | |
1826 ) | |
1827 goto hexh; | |
1828 #endif | |
1829 if (c == '_') // ignore embedded _ | |
1830 { p++; | |
1831 continue; | |
1832 } | |
1833 if (c == '.' && p[1] != '.') | |
1834 goto real; | |
1835 if (c == 'i') | |
1836 goto real; | |
1837 if (isdigit(c)) | |
1838 { | |
1839 state = STATE_octale; | |
1840 } | |
1841 else | |
1842 goto done; | |
1843 } | |
1844 break; | |
1845 | |
1846 case STATE_binary0: // starting binary number | |
1847 case STATE_binary: // reading binary number | |
1848 if (c != '0' && c != '1') | |
1849 { | |
1850 #if ZEROH | |
1851 if (ishex(c) | |
1852 || c == 'H' || c == 'h' | |
1853 ) | |
1854 goto hexh; | |
1855 #endif | |
1856 if (c == '_') // ignore embedded _ | |
1857 { p++; | |
1858 continue; | |
1859 } | |
1860 if (state == STATE_binary0) | |
1861 { error("binary digit expected"); | |
1862 state = STATE_error; | |
1863 break; | |
1864 } | |
1865 else | |
1866 goto done; | |
1867 } | |
1868 state = STATE_binary; | |
1869 break; | |
1870 | |
1871 case STATE_error: // for error recovery | |
1872 if (!isdigit(c)) // scan until non-digit | |
1873 goto done; | |
1874 break; | |
1875 | |
1876 default: | |
1877 assert(0); | |
1878 } | |
1879 stringbuffer.writeByte(c); | |
1880 p++; | |
1881 } | |
1882 done: | |
1883 stringbuffer.writeByte(0); // terminate string | |
1884 if (state == STATE_octale) | |
1885 error("Octal digit expected"); | |
1886 | |
1887 uinteger_t n; // unsigned >=64 bit integer type | |
1888 | |
1889 if (stringbuffer.offset == 2 && (state == STATE_decimal || state == STATE_0)) | |
1890 n = stringbuffer.data[0] - '0'; | |
1891 else | |
1892 { | |
1893 // Convert string to integer | |
1894 #if __DMC__ | |
1895 errno = 0; | |
1896 n = strtoull((char *)stringbuffer.data,NULL,base); | |
1897 if (errno == ERANGE) | |
1898 error("integer overflow"); | |
1899 #else | |
1900 // Not everybody implements strtoull() | |
1901 char *p = (char *)stringbuffer.data; | |
1902 int r = 10, d; | |
1903 | |
1904 if (*p == '0') | |
1905 { | |
1906 if (p[1] == 'x' || p[1] == 'X') | |
1907 p += 2, r = 16; | |
1908 else if (p[1] == 'b' || p[1] == 'B') | |
1909 p += 2, r = 2; | |
1910 else if (isdigit(p[1])) | |
1911 p += 1, r = 8; | |
1912 } | |
1913 | |
1914 n = 0; | |
1915 while (1) | |
1916 { | |
1917 if (*p >= '0' && *p <= '9') | |
1918 d = *p - '0'; | |
1919 else if (*p >= 'a' && *p <= 'z') | |
1920 d = *p - 'a' + 10; | |
1921 else if (*p >= 'A' && *p <= 'Z') | |
1922 d = *p - 'A' + 10; | |
1923 else | |
1924 break; | |
1925 if (d >= r) | |
1926 break; | |
1927 if (n * r + d < n) | |
1928 { | |
1929 error ("integer overflow"); | |
1930 break; | |
1931 } | |
1932 | |
1933 n = n * r + d; | |
1934 p++; | |
1935 } | |
1936 #endif | |
1937 if (sizeof(n) > 8 && | |
1938 n > 0xFFFFFFFFFFFFFFFFULL) // if n needs more than 64 bits | |
1939 error("integer overflow"); | |
1940 } | |
1941 | |
1942 // Parse trailing 'u', 'U', 'l' or 'L' in any combination | |
1943 while (1) | |
1944 { unsigned char f; | |
1945 | |
1946 switch (*p) | |
1947 { case 'U': | |
1948 case 'u': | |
1949 f = FLAGS_unsigned; | |
1950 goto L1; | |
1951 | |
1952 case 'l': | |
1953 if (1 || !global.params.useDeprecated) | |
1954 error("'l' suffix is deprecated, use 'L' instead"); | |
1955 case 'L': | |
1956 f = FLAGS_long; | |
1957 L1: | |
1958 p++; | |
1959 if (flags & f) | |
1960 error("unrecognized token"); | |
1961 flags = (FLAGS) (flags | f); | |
1962 continue; | |
1963 default: | |
1964 break; | |
1965 } | |
1966 break; | |
1967 } | |
1968 | |
1969 switch (flags) | |
1970 { | |
1971 case 0: | |
1972 /* Octal or Hexadecimal constant. | |
1973 * First that fits: int, uint, long, ulong | |
1974 */ | |
1975 if (n & 0x8000000000000000LL) | |
1976 result = TOKuns64v; | |
1977 else if (n & 0xFFFFFFFF00000000LL) | |
1978 result = TOKint64v; | |
1979 else if (n & 0x80000000) | |
1980 result = TOKuns32v; | |
1981 else | |
1982 result = TOKint32v; | |
1983 break; | |
1984 | |
1985 case FLAGS_decimal: | |
1986 /* First that fits: int, long, long long | |
1987 */ | |
1988 if (n & 0x8000000000000000LL) | |
1989 { error("signed integer overflow"); | |
1990 result = TOKuns64v; | |
1991 } | |
1992 else if (n & 0xFFFFFFFF80000000LL) | |
1993 result = TOKint64v; | |
1994 else | |
1995 result = TOKint32v; | |
1996 break; | |
1997 | |
1998 case FLAGS_unsigned: | |
1999 case FLAGS_decimal | FLAGS_unsigned: | |
2000 /* First that fits: uint, ulong | |
2001 */ | |
2002 if (n & 0xFFFFFFFF00000000LL) | |
2003 result = TOKuns64v; | |
2004 else | |
2005 result = TOKuns32v; | |
2006 break; | |
2007 | |
2008 case FLAGS_decimal | FLAGS_long: | |
2009 if (n & 0x8000000000000000LL) | |
2010 { error("signed integer overflow"); | |
2011 result = TOKuns64v; | |
2012 } | |
2013 else | |
2014 result = TOKint64v; | |
2015 break; | |
2016 | |
2017 case FLAGS_long: | |
2018 if (n & 0x8000000000000000LL) | |
2019 result = TOKuns64v; | |
2020 else | |
2021 result = TOKint64v; | |
2022 break; | |
2023 | |
2024 case FLAGS_unsigned | FLAGS_long: | |
2025 case FLAGS_decimal | FLAGS_unsigned | FLAGS_long: | |
2026 result = TOKuns64v; | |
2027 break; | |
2028 | |
2029 default: | |
2030 #ifdef DEBUG | |
2031 printf("%x\n",flags); | |
2032 #endif | |
2033 assert(0); | |
2034 } | |
2035 t->uns64value = n; | |
2036 return result; | |
2037 } | |
2038 | |
2039 /************************************** | |
2040 * Read in characters, converting them to real. | |
2041 * Bugs: | |
2042 * Exponent overflow not detected. | |
2043 * Too much requested precision is not detected. | |
2044 */ | |
2045 | |
2046 TOK Lexer::inreal(Token *t) | |
2047 #ifdef __DMC__ | |
2048 __in | |
2049 { | |
2050 assert(*p == '.' || isdigit(*p)); | |
2051 } | |
2052 __out (result) | |
2053 { | |
2054 switch (result) | |
2055 { | |
2056 case TOKfloat32v: | |
2057 case TOKfloat64v: | |
2058 case TOKfloat80v: | |
2059 case TOKimaginary32v: | |
2060 case TOKimaginary64v: | |
2061 case TOKimaginary80v: | |
2062 break; | |
2063 | |
2064 default: | |
2065 assert(0); | |
2066 } | |
2067 } | |
2068 __body | |
2069 #endif /* __DMC__ */ | |
2070 { int dblstate; | |
2071 unsigned c; | |
2072 char hex; // is this a hexadecimal-floating-constant? | |
2073 TOK result; | |
2074 | |
2075 //printf("Lexer::inreal()\n"); | |
2076 stringbuffer.reset(); | |
2077 dblstate = 0; | |
2078 hex = 0; | |
2079 Lnext: | |
2080 while (1) | |
2081 { | |
2082 // Get next char from input | |
2083 c = *p++; | |
2084 //printf("dblstate = %d, c = '%c'\n", dblstate, c); | |
2085 while (1) | |
2086 { | |
2087 switch (dblstate) | |
2088 { | |
2089 case 0: // opening state | |
2090 if (c == '0') | |
2091 dblstate = 9; | |
2092 else if (c == '.') | |
2093 dblstate = 3; | |
2094 else | |
2095 dblstate = 1; | |
2096 break; | |
2097 | |
2098 case 9: | |
2099 dblstate = 1; | |
2100 if (c == 'X' || c == 'x') | |
2101 { hex++; | |
2102 break; | |
2103 } | |
2104 case 1: // digits to left of . | |
2105 case 3: // digits to right of . | |
2106 case 7: // continuing exponent digits | |
2107 if (!isdigit(c) && !(hex && isxdigit(c))) | |
2108 { | |
2109 if (c == '_') | |
2110 goto Lnext; // ignore embedded '_' | |
2111 dblstate++; | |
2112 continue; | |
2113 } | |
2114 break; | |
2115 | |
2116 case 2: // no more digits to left of . | |
2117 if (c == '.') | |
2118 { dblstate++; | |
2119 break; | |
2120 } | |
2121 case 4: // no more digits to right of . | |
2122 if ((c == 'E' || c == 'e') || | |
2123 hex && (c == 'P' || c == 'p')) | |
2124 { dblstate = 5; | |
2125 hex = 0; // exponent is always decimal | |
2126 break; | |
2127 } | |
2128 if (hex) | |
2129 error("binary-exponent-part required"); | |
2130 goto done; | |
2131 | |
2132 case 5: // looking immediately to right of E | |
2133 dblstate++; | |
2134 if (c == '-' || c == '+') | |
2135 break; | |
2136 case 6: // 1st exponent digit expected | |
2137 if (!isdigit(c)) | |
2138 error("exponent expected"); | |
2139 dblstate++; | |
2140 break; | |
2141 | |
2142 case 8: // past end of exponent digits | |
2143 goto done; | |
2144 } | |
2145 break; | |
2146 } | |
2147 stringbuffer.writeByte(c); | |
2148 } | |
2149 done: | |
2150 p--; | |
2151 | |
2152 stringbuffer.writeByte(0); | |
2153 | |
2154 #if _WIN32 && __DMC__ | |
2155 char *save = __locale_decpoint; | |
2156 __locale_decpoint = "."; | |
2157 #endif | |
2158 #ifdef IN_GCC | |
2159 t->float80value = real_t::parse((char *)stringbuffer.data, real_t::LongDouble); | |
2160 #else | |
2161 t->float80value = strtold((char *)stringbuffer.data, NULL); | |
2162 #endif | |
2163 errno = 0; | |
2164 switch (*p) | |
2165 { | |
2166 case 'F': | |
2167 case 'f': | |
2168 #ifdef IN_GCC | |
2169 real_t::parse((char *)stringbuffer.data, real_t::Float); | |
2170 #else | |
2171 strtof((char *)stringbuffer.data, NULL); | |
2172 #endif | |
2173 result = TOKfloat32v; | |
2174 p++; | |
2175 break; | |
2176 | |
2177 default: | |
2178 #ifdef IN_GCC | |
2179 real_t::parse((char *)stringbuffer.data, real_t::Double); | |
2180 #else | |
2181 strtod((char *)stringbuffer.data, NULL); | |
2182 #endif | |
2183 result = TOKfloat64v; | |
2184 break; | |
2185 | |
2186 case 'l': | |
2187 if (!global.params.useDeprecated) | |
2188 error("'l' suffix is deprecated, use 'L' instead"); | |
2189 case 'L': | |
2190 result = TOKfloat80v; | |
2191 p++; | |
2192 break; | |
2193 } | |
2194 if (*p == 'i' || *p == 'I') | |
2195 { | |
2196 if (!global.params.useDeprecated && *p == 'I') | |
2197 error("'I' suffix is deprecated, use 'i' instead"); | |
2198 p++; | |
2199 switch (result) | |
2200 { | |
2201 case TOKfloat32v: | |
2202 result = TOKimaginary32v; | |
2203 break; | |
2204 case TOKfloat64v: | |
2205 result = TOKimaginary64v; | |
2206 break; | |
2207 case TOKfloat80v: | |
2208 result = TOKimaginary80v; | |
2209 break; | |
2210 } | |
2211 } | |
2212 #if _WIN32 && __DMC__ | |
2213 __locale_decpoint = save; | |
2214 #endif | |
2215 if (errno == ERANGE) | |
2216 error("number is not representable"); | |
2217 return result; | |
2218 } | |
2219 | |
2220 /********************************************* | |
2221 * Do pragma. | |
2222 * Currently, the only pragma supported is: | |
2223 * #line linnum [filespec] | |
2224 */ | |
2225 | |
2226 void Lexer::pragma() | |
2227 { | |
2228 Token tok; | |
2229 int linnum; | |
2230 char *filespec = NULL; | |
2231 Loc loc = this->loc; | |
2232 | |
2233 scan(&tok); | |
2234 if (tok.value != TOKidentifier || tok.ident != Id::line) | |
2235 goto Lerr; | |
2236 | |
2237 scan(&tok); | |
2238 if (tok.value == TOKint32v || tok.value == TOKint64v) | |
2239 linnum = tok.uns64value - 1; | |
2240 else | |
2241 goto Lerr; | |
2242 | |
2243 while (1) | |
2244 { | |
2245 switch (*p) | |
2246 { | |
2247 case 0: | |
2248 case 0x1A: | |
2249 case '\n': | |
2250 Lnewline: | |
2251 this->loc.linnum = linnum; | |
2252 if (filespec) | |
2253 this->loc.filename = filespec; | |
2254 return; | |
2255 | |
2256 case '\r': | |
2257 p++; | |
2258 if (*p != '\n') | |
2259 { p--; | |
2260 goto Lnewline; | |
2261 } | |
2262 continue; | |
2263 | |
2264 case ' ': | |
2265 case '\t': | |
2266 case '\v': | |
2267 case '\f': | |
2268 p++; | |
2269 continue; // skip white space | |
2270 | |
2271 case '_': | |
2272 if (mod && memcmp(p, "__FILE__", 8) == 0) | |
2273 { | |
2274 p += 8; | |
2275 filespec = mem.strdup(loc.filename ? loc.filename : mod->ident->toChars()); | |
2276 } | |
2277 continue; | |
2278 | |
2279 case '"': | |
2280 if (filespec) | |
2281 goto Lerr; | |
2282 stringbuffer.reset(); | |
2283 p++; | |
2284 while (1) | |
2285 { unsigned c; | |
2286 | |
2287 c = *p; | |
2288 switch (c) | |
2289 { | |
2290 case '\n': | |
2291 case '\r': | |
2292 case 0: | |
2293 case 0x1A: | |
2294 goto Lerr; | |
2295 | |
2296 case '"': | |
2297 stringbuffer.writeByte(0); | |
2298 filespec = mem.strdup((char *)stringbuffer.data); | |
2299 p++; | |
2300 break; | |
2301 | |
2302 default: | |
2303 if (c & 0x80) | |
2304 { unsigned u = decodeUTF(); | |
2305 if (u == PS || u == LS) | |
2306 goto Lerr; | |
2307 } | |
2308 stringbuffer.writeByte(c); | |
2309 p++; | |
2310 continue; | |
2311 } | |
2312 break; | |
2313 } | |
2314 continue; | |
2315 | |
2316 default: | |
2317 if (*p & 0x80) | |
2318 { unsigned u = decodeUTF(); | |
2319 if (u == PS || u == LS) | |
2320 goto Lnewline; | |
2321 } | |
2322 goto Lerr; | |
2323 } | |
2324 } | |
2325 | |
2326 Lerr: | |
2327 error(loc, "#line integer [\"filespec\"]\\n expected"); | |
2328 } | |
2329 | |
2330 | |
2331 /******************************************** | |
2332 * Decode UTF character. | |
2333 * Issue error messages for invalid sequences. | |
2334 * Return decoded character, advance p to last character in UTF sequence. | |
2335 */ | |
2336 | |
2337 unsigned Lexer::decodeUTF() | |
2338 { | |
2339 dchar_t u; | |
2340 unsigned char c; | |
2341 unsigned char *s = p; | |
2342 size_t len; | |
2343 size_t idx; | |
2344 char *msg; | |
2345 | |
2346 c = *s; | |
2347 assert(c & 0x80); | |
2348 | |
2349 // Check length of remaining string up to 6 UTF-8 characters | |
2350 for (len = 1; len < 6 && s[len]; len++) | |
2351 ; | |
2352 | |
2353 idx = 0; | |
2354 msg = utf_decodeChar(s, len, &idx, &u); | |
2355 p += idx - 1; | |
2356 if (msg) | |
2357 { | |
2358 error("%s", msg); | |
2359 } | |
2360 return u; | |
2361 } | |
2362 | |
2363 | |
2364 /*************************************************** | |
2365 * Parse doc comment embedded between t->ptr and p. | |
2366 * Remove trailing blanks and tabs from lines. | |
2367 * Replace all newlines with \n. | |
2368 * Remove leading comment character from each line. | |
2369 * Decide if it's a lineComment or a blockComment. | |
2370 * Append to previous one for this token. | |
2371 */ | |
2372 | |
2373 void Lexer::getDocComment(Token *t, unsigned lineComment) | |
2374 { | |
2375 OutBuffer buf; | |
2376 unsigned char ct = t->ptr[2]; | |
2377 unsigned char *q = t->ptr + 3; // start of comment text | |
2378 int linestart = 0; | |
2379 | |
2380 unsigned char *qend = p; | |
2381 if (ct == '*' || ct == '+') | |
2382 qend -= 2; | |
2383 | |
2384 /* Scan over initial row of ****'s or ++++'s or ////'s | |
2385 */ | |
2386 for (; q < qend; q++) | |
2387 { | |
2388 if (*q != ct) | |
2389 break; | |
2390 } | |
2391 | |
2392 /* Remove trailing row of ****'s or ++++'s | |
2393 */ | |
2394 if (ct != '/') | |
2395 { | |
2396 for (; q < qend; qend--) | |
2397 { | |
2398 if (qend[-1] != ct) | |
2399 break; | |
2400 } | |
2401 } | |
2402 | |
2403 for (; q < qend; q++) | |
2404 { | |
2405 unsigned char c = *q; | |
2406 | |
2407 switch (c) | |
2408 { | |
2409 case '*': | |
2410 case '+': | |
2411 if (linestart && c == ct) | |
2412 { linestart = 0; | |
2413 /* Trim preceding whitespace up to preceding \n | |
2414 */ | |
2415 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2416 buf.offset--; | |
2417 continue; | |
2418 } | |
2419 break; | |
2420 | |
2421 case ' ': | |
2422 case '\t': | |
2423 break; | |
2424 | |
2425 case '\r': | |
2426 if (q[1] == '\n') | |
2427 continue; // skip the \r | |
2428 goto Lnewline; | |
2429 | |
2430 default: | |
2431 if (c == 226) | |
2432 { | |
2433 // If LS or PS | |
2434 if (q[1] == 128 && | |
2435 (q[2] == 168 || q[2] == 169)) | |
2436 { | |
2437 q += 2; | |
2438 goto Lnewline; | |
2439 } | |
2440 } | |
2441 linestart = 0; | |
2442 break; | |
2443 | |
2444 Lnewline: | |
2445 c = '\n'; // replace all newlines with \n | |
2446 case '\n': | |
2447 linestart = 1; | |
2448 | |
2449 /* Trim trailing whitespace | |
2450 */ | |
2451 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2452 buf.offset--; | |
2453 | |
2454 break; | |
2455 } | |
2456 buf.writeByte(c); | |
2457 } | |
2458 | |
2459 // Always end with a newline | |
2460 if (!buf.offset || buf.data[buf.offset - 1] != '\n') | |
2461 buf.writeByte('\n'); | |
2462 | |
2463 buf.writeByte(0); | |
2464 | |
2465 // It's a line comment if the start of the doc comment comes | |
2466 // after other non-whitespace on the same line. | |
2467 unsigned char** dc = (lineComment && anyToken) | |
2468 ? &t->lineComment | |
2469 : &t->blockComment; | |
2470 | |
2471 // Combine with previous doc comment, if any | |
2472 if (*dc) | |
2473 *dc = combineComments(*dc, (unsigned char *)buf.data); | |
2474 else | |
2475 *dc = (unsigned char *)buf.extractData(); | |
2476 } | |
2477 | |
2478 /******************************************** | |
2479 * Combine two document comments into one. | |
2480 */ | |
2481 | |
2482 unsigned char *Lexer::combineComments(unsigned char *c1, unsigned char *c2) | |
2483 { | |
2484 unsigned char *c = c2; | |
2485 | |
2486 if (c1) | |
2487 { c = c1; | |
2488 if (c2) | |
2489 { size_t len1 = strlen((char *)c1); | |
2490 size_t len2 = strlen((char *)c2); | |
2491 | |
2492 c = (unsigned char *)mem.malloc(len1 + 1 + len2 + 1); | |
2493 memcpy(c, c1, len1); | |
2494 c[len1] = '\n'; | |
2495 memcpy(c + len1 + 1, c2, len2); | |
2496 c[len1 + 1 + len2] = 0; | |
2497 } | |
2498 } | |
2499 return c; | |
2500 } | |
2501 | |
2502 /******************************************** | |
2503 * Create an identifier in the string table. | |
2504 */ | |
2505 | |
2506 Identifier *Lexer::idPool(const char *s) | |
2507 { unsigned len; | |
2508 Identifier *id; | |
2509 StringValue *sv; | |
2510 | |
2511 len = strlen(s); | |
2512 sv = stringtable.update(s, len); | |
2513 id = (Identifier *) sv->ptrvalue; | |
2514 if (!id) | |
2515 { | |
2516 id = new Identifier(sv->lstring.string, TOKidentifier); | |
2517 sv->ptrvalue = id; | |
2518 } | |
2519 return id; | |
2520 } | |
2521 | |
2522 /**************************************** | |
2523 */ | |
2524 | |
2525 struct Keyword | |
2526 { char *name; | |
2527 enum TOK value; | |
2528 }; | |
2529 | |
2530 static Keyword keywords[] = | |
2531 { | |
2532 // { "", TOK }, | |
2533 | |
2534 { "this", TOKthis }, | |
2535 { "super", TOKsuper }, | |
2536 { "assert", TOKassert }, | |
2537 { "null", TOKnull }, | |
2538 { "true", TOKtrue }, | |
2539 { "false", TOKfalse }, | |
2540 { "cast", TOKcast }, | |
2541 { "new", TOKnew }, | |
2542 { "delete", TOKdelete }, | |
2543 { "throw", TOKthrow }, | |
2544 { "module", TOKmodule }, | |
2545 { "pragma", TOKpragma }, | |
2546 { "typeof", TOKtypeof }, | |
2547 { "typeid", TOKtypeid }, | |
2548 | |
2549 { "template", TOKtemplate }, | |
2550 | |
2551 { "void", TOKvoid }, | |
2552 { "byte", TOKint8 }, | |
2553 { "ubyte", TOKuns8 }, | |
2554 { "short", TOKint16 }, | |
2555 { "ushort", TOKuns16 }, | |
2556 { "int", TOKint32 }, | |
2557 { "uint", TOKuns32 }, | |
2558 { "long", TOKint64 }, | |
2559 { "ulong", TOKuns64 }, | |
2560 { "cent", TOKcent, }, | |
2561 { "ucent", TOKucent, }, | |
2562 { "float", TOKfloat32 }, | |
2563 { "double", TOKfloat64 }, | |
2564 { "real", TOKfloat80 }, | |
2565 | |
2566 { "bool", TOKbool }, | |
2567 { "char", TOKchar }, | |
2568 { "wchar", TOKwchar }, | |
2569 { "dchar", TOKdchar }, | |
2570 | |
2571 { "ifloat", TOKimaginary32 }, | |
2572 { "idouble", TOKimaginary64 }, | |
2573 { "ireal", TOKimaginary80 }, | |
2574 | |
2575 { "cfloat", TOKcomplex32 }, | |
2576 { "cdouble", TOKcomplex64 }, | |
2577 { "creal", TOKcomplex80 }, | |
2578 | |
2579 { "delegate", TOKdelegate }, | |
2580 { "function", TOKfunction }, | |
2581 | |
2582 { "is", TOKis }, | |
2583 { "if", TOKif }, | |
2584 { "else", TOKelse }, | |
2585 { "while", TOKwhile }, | |
2586 { "for", TOKfor }, | |
2587 { "do", TOKdo }, | |
2588 { "switch", TOKswitch }, | |
2589 { "case", TOKcase }, | |
2590 { "default", TOKdefault }, | |
2591 { "break", TOKbreak }, | |
2592 { "continue", TOKcontinue }, | |
2593 { "synchronized", TOKsynchronized }, | |
2594 { "return", TOKreturn }, | |
2595 { "goto", TOKgoto }, | |
2596 { "try", TOKtry }, | |
2597 { "catch", TOKcatch }, | |
2598 { "finally", TOKfinally }, | |
2599 { "with", TOKwith }, | |
2600 { "asm", TOKasm }, | |
2601 { "foreach", TOKforeach }, | |
2602 { "foreach_reverse", TOKforeach_reverse }, | |
2603 { "scope", TOKscope }, | |
2604 | |
2605 { "struct", TOKstruct }, | |
2606 { "class", TOKclass }, | |
2607 { "interface", TOKinterface }, | |
2608 { "union", TOKunion }, | |
2609 { "enum", TOKenum }, | |
2610 { "import", TOKimport }, | |
2611 { "mixin", TOKmixin }, | |
2612 { "static", TOKstatic }, | |
2613 { "final", TOKfinal }, | |
2614 { "const", TOKconst }, | |
2615 { "typedef", TOKtypedef }, | |
2616 { "alias", TOKalias }, | |
2617 { "override", TOKoverride }, | |
2618 { "abstract", TOKabstract }, | |
2619 { "volatile", TOKvolatile }, | |
2620 { "debug", TOKdebug }, | |
2621 { "deprecated", TOKdeprecated }, | |
2622 { "in", TOKin }, | |
2623 { "out", TOKout }, | |
2624 { "inout", TOKinout }, | |
2625 { "lazy", TOKlazy }, | |
2626 { "auto", TOKauto }, | |
2627 | |
2628 { "align", TOKalign }, | |
2629 { "extern", TOKextern }, | |
2630 { "private", TOKprivate }, | |
2631 { "package", TOKpackage }, | |
2632 { "protected", TOKprotected }, | |
2633 { "public", TOKpublic }, | |
2634 { "export", TOKexport }, | |
2635 | |
2636 { "body", TOKbody }, | |
2637 { "invariant", TOKinvariant }, | |
2638 { "unittest", TOKunittest }, | |
2639 { "version", TOKversion }, | |
2640 | |
2641 // Added after 1.0 | |
2642 { "ref", TOKref }, | |
2643 { "macro", TOKmacro }, | |
2644 }; | |
2645 | |
2646 int Token::isKeyword() | |
2647 { | |
2648 for (unsigned u = 0; u < sizeof(keywords) / sizeof(keywords[0]); u++) | |
2649 { | |
2650 if (keywords[u].value == value) | |
2651 return 1; | |
2652 } | |
2653 return 0; | |
2654 } | |
2655 | |
2656 void Lexer::initKeywords() | |
2657 { StringValue *sv; | |
2658 unsigned u; | |
2659 enum TOK v; | |
2660 unsigned nkeywords = sizeof(keywords) / sizeof(keywords[0]); | |
2661 | |
2662 if (global.params.Dversion == 1) | |
2663 nkeywords -= 2; | |
2664 | |
2665 cmtable_init(); | |
2666 | |
2667 for (u = 0; u < nkeywords; u++) | |
2668 { char *s; | |
2669 | |
2670 //printf("keyword[%d] = '%s'\n",u, keywords[u].name); | |
2671 s = keywords[u].name; | |
2672 v = keywords[u].value; | |
2673 sv = stringtable.insert(s, strlen(s)); | |
2674 sv->ptrvalue = (void *) new Identifier(sv->lstring.string,v); | |
2675 | |
2676 //printf("tochars[%d] = '%s'\n",v, s); | |
2677 Token::tochars[v] = s; | |
2678 } | |
2679 | |
2680 Token::tochars[TOKeof] = "EOF"; | |
2681 Token::tochars[TOKlcurly] = "{"; | |
2682 Token::tochars[TOKrcurly] = "}"; | |
2683 Token::tochars[TOKlparen] = "("; | |
2684 Token::tochars[TOKrparen] = ")"; | |
2685 Token::tochars[TOKlbracket] = "["; | |
2686 Token::tochars[TOKrbracket] = "]"; | |
2687 Token::tochars[TOKsemicolon] = ";"; | |
2688 Token::tochars[TOKcolon] = ":"; | |
2689 Token::tochars[TOKcomma] = ","; | |
2690 Token::tochars[TOKdot] = "."; | |
2691 Token::tochars[TOKxor] = "^"; | |
2692 Token::tochars[TOKxorass] = "^="; | |
2693 Token::tochars[TOKassign] = "="; | |
2694 Token::tochars[TOKconstruct] = "="; | |
2695 Token::tochars[TOKlt] = "<"; | |
2696 Token::tochars[TOKgt] = ">"; | |
2697 Token::tochars[TOKle] = "<="; | |
2698 Token::tochars[TOKge] = ">="; | |
2699 Token::tochars[TOKequal] = "=="; | |
2700 Token::tochars[TOKnotequal] = "!="; | |
2701 Token::tochars[TOKnotidentity] = "!is"; | |
2702 Token::tochars[TOKtobool] = "!!"; | |
2703 | |
2704 Token::tochars[TOKunord] = "!<>="; | |
2705 Token::tochars[TOKue] = "!<>"; | |
2706 Token::tochars[TOKlg] = "<>"; | |
2707 Token::tochars[TOKleg] = "<>="; | |
2708 Token::tochars[TOKule] = "!>"; | |
2709 Token::tochars[TOKul] = "!>="; | |
2710 Token::tochars[TOKuge] = "!<"; | |
2711 Token::tochars[TOKug] = "!<="; | |
2712 | |
2713 Token::tochars[TOKnot] = "!"; | |
2714 Token::tochars[TOKtobool] = "!!"; | |
2715 Token::tochars[TOKshl] = "<<"; | |
2716 Token::tochars[TOKshr] = ">>"; | |
2717 Token::tochars[TOKushr] = ">>>"; | |
2718 Token::tochars[TOKadd] = "+"; | |
2719 Token::tochars[TOKmin] = "-"; | |
2720 Token::tochars[TOKmul] = "*"; | |
2721 Token::tochars[TOKdiv] = "/"; | |
2722 Token::tochars[TOKmod] = "%"; | |
2723 Token::tochars[TOKslice] = ".."; | |
2724 Token::tochars[TOKdotdotdot] = "..."; | |
2725 Token::tochars[TOKand] = "&"; | |
2726 Token::tochars[TOKandand] = "&&"; | |
2727 Token::tochars[TOKor] = "|"; | |
2728 Token::tochars[TOKoror] = "||"; | |
2729 Token::tochars[TOKarray] = "[]"; | |
2730 Token::tochars[TOKindex] = "[i]"; | |
2731 Token::tochars[TOKaddress] = "&"; | |
2732 Token::tochars[TOKstar] = "*"; | |
2733 Token::tochars[TOKtilde] = "~"; | |
2734 Token::tochars[TOKdollar] = "$"; | |
2735 Token::tochars[TOKcast] = "cast"; | |
2736 Token::tochars[TOKplusplus] = "++"; | |
2737 Token::tochars[TOKminusminus] = "--"; | |
2738 Token::tochars[TOKtype] = "type"; | |
2739 Token::tochars[TOKquestion] = "?"; | |
2740 Token::tochars[TOKneg] = "-"; | |
2741 Token::tochars[TOKuadd] = "+"; | |
2742 Token::tochars[TOKvar] = "var"; | |
2743 Token::tochars[TOKaddass] = "+="; | |
2744 Token::tochars[TOKminass] = "-="; | |
2745 Token::tochars[TOKmulass] = "*="; | |
2746 Token::tochars[TOKdivass] = "/="; | |
2747 Token::tochars[TOKmodass] = "%="; | |
2748 Token::tochars[TOKshlass] = "<<="; | |
2749 Token::tochars[TOKshrass] = ">>="; | |
2750 Token::tochars[TOKushrass] = ">>>="; | |
2751 Token::tochars[TOKandass] = "&="; | |
2752 Token::tochars[TOKorass] = "|="; | |
2753 Token::tochars[TOKcatass] = "~="; | |
2754 Token::tochars[TOKcat] = "~"; | |
2755 Token::tochars[TOKcall] = "call"; | |
2756 Token::tochars[TOKidentity] = "is"; | |
2757 Token::tochars[TOKnotidentity] = "!is"; | |
2758 | |
2759 Token::tochars[TOKorass] = "|="; | |
2760 Token::tochars[TOKidentifier] = "identifier"; | |
2761 | |
2762 // For debugging | |
2763 Token::tochars[TOKdotexp] = "dotexp"; | |
2764 Token::tochars[TOKdotti] = "dotti"; | |
2765 Token::tochars[TOKdotvar] = "dotvar"; | |
2766 Token::tochars[TOKdottype] = "dottype"; | |
2767 Token::tochars[TOKsymoff] = "symoff"; | |
2768 Token::tochars[TOKtypedot] = "typedot"; | |
2769 Token::tochars[TOKarraylength] = "arraylength"; | |
2770 Token::tochars[TOKarrayliteral] = "arrayliteral"; | |
2771 Token::tochars[TOKassocarrayliteral] = "assocarrayliteral"; | |
2772 Token::tochars[TOKstructliteral] = "structliteral"; | |
2773 Token::tochars[TOKstring] = "string"; | |
2774 Token::tochars[TOKdsymbol] = "symbol"; | |
2775 Token::tochars[TOKtuple] = "tuple"; | |
2776 Token::tochars[TOKdeclaration] = "declaration"; | |
2777 Token::tochars[TOKdottd] = "dottd"; | |
2778 } |