comparison src/basic/LiteralParsing.d @ 207:e0551773a005

Added the correct version.
author Anders Johnsen <skabet@gmail.com>
date Tue, 12 Aug 2008 18:19:34 +0200
parents
children
comparison
equal deleted inserted replaced
206:d3c148ca429b 207:e0551773a005
1 module basic.LiteralParsing;
2
3 import basic.SourceLocation,
4 basic.Message,
5 basic.conv;
6
7 import tango.io.Stdout,
8 tango.core.BitManip,
9 Integer = tango.text.convert.Integer,
10 Utf = tango.text.convert.Utf,
11 tango.text.Util;
12
/// Character width of a parsed string literal.
enum StringType
{
    Char,   /// default; also used for the 'c' postfix
    WChar,  /// selected by the 'w' postfix
    DChar   /// selected by the 'd' postfix
}
19
/// Type assigned to a parsed numeric literal.
enum NumberType
{
    Int,    /// default for integer literals
    UInt,
    Long,
    ULong,
    Float,
    Double,
    Real
}
30
/// Result of parsing a string literal: the decoded bytes plus their width.
struct String
{
    StringType type;    /// character width the bytes in `data` are encoded in
    ubyte[] data;       /// decoded contents, without quotes, prefix or postfix
}
36
/// Result of parsing a numeric literal.
struct Number
{
    NumberType type;    /// which of the two value fields is meaningful
    ulong integer;      /// value of an integer literal
    real floating;      /// value of a floating point literal
}
43
/// What parseEscapeSequence hands back to its caller.
private struct EscapeReturn
{
    ubyte[] data;   /// bytes the escape sequence decodes to
    int length;     /// number of source characters the escape occupied
}
49
/// What the digit scanners hand back to parseNumber.
private struct NumberReturn
{
    char[] data;    /// cleaned-up number text (underscores removed)
    int length;     /// number of source characters consumed
}
55
/**
 * Parses a decimal integer or floating point literal.
 *
 * Params:
 *   str      = literal text; must start with a decimal digit
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receiver for diagnostics
 *
 * Returns: a Number whose `type` says which value field is valid.
 *   Integer literals are typed Int, Long (above int.max) or ULong (above
 *   long.max). Literals containing '.', 'e' or 'E' are typed Double, or
 *   Real when the value exceeds double.max.
 *
 * Reports InvalidStartInteger when the text does not start with a digit,
 * and IntegerToLarge / FloatingToLarge when conversion fails.
 */
Number parseNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    Number num;

    // Empty text is rejected the same way as a non-digit start instead of
    // indexing past the end of the slice.
    if (str.length == 0 || str[0] < '0' || str[0] > '9')
    {
        messages.report(InvalidStartInteger, loc, loc+1);
        return num;
    }

    if (str.contains('.') || str.contains('e') || str.contains('E'))
    {
        auto n = parseRealNumber(str, loc, messages);

        // Set the type up front so that a failed conversion still yields a
        // floating point Number rather than the default Int.
        num.type = NumberType.Double;

        try
        {
            num.floating = toReal(n.data);
        }
        catch(Exception e)
        {
            num.floating = real.init;
            messages.report(FloatingToLarge, loc, loc + n.length - 1);
        }

        if (num.floating > double.max)
            num.type = NumberType.Real;
    }
    else
    {
        auto n = parseDecimalDigits(str, loc, messages);

        try
        {
            num.integer = toUlong(n.data);
        }
        catch(Exception e)
        {
            num.integer = 0;
            messages.report(IntegerToLarge, loc, loc + n.length - 1);
        }

        // A decimal literal that does not fit in int becomes long, and one
        // that does not fit in long becomes ulong. Comparing against
        // uint.max here would leave 2_147_483_648 .. 4_294_967_295 typed
        // as Int although they overflow it.
        if (num.integer > int.max)
            num.type = NumberType.Long;
        if (num.integer > long.max)
            num.type = NumberType.ULong;
    }

    // printNumber(str, num);
    return num;
}
117
/**
 * Scans a run of decimal digits and '_' separators from the start of str.
 *
 * Params:
 *   str      = text to scan
 *   loc      = source location of str[0] (currently unused)
 *   messages = receiver for diagnostics (currently unused)
 *
 * Returns: `data` holds the digits with the underscores removed and
 *   `length` is the number of characters consumed, i.e. the index of the
 *   first character that is neither a digit nor '_' (or str.length).
 */
NumberReturn parseDecimalDigits(char[] str, SourceLocation loc, MessageHandler messages)
{
    char[] digits;
    int i = 0;

    // Stop on the terminating character without counting it. Counting it
    // when it was the last character of str made callers skip it (for
    // example a trailing 'e' in "1e" was swallowed unnoticed).
    for (; i < str.length; i++)
    {
        char c = str[i];
        if (c >= '0' && c <= '9')
            digits ~= c;
        else if (c != '_')
            break;
    }

    NumberReturn res;
    res.length = i;
    res.data = digits;

    return res;
}
160
/**
 * Scans a floating point literal: digit runs, at most one '.', at most one
 * exponent marker and an optional sign directly after the marker.
 *
 * Params:
 *   str      = text to scan
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receiver for diagnostics
 *
 * Returns: `length` is the number of characters scanned. `data` is the
 *   cleaned-up number text, or empty when the literal ends in a sign or
 *   exponent marker (FloatingInvalidEnd is reported) or contains no number
 *   text at all.
 *
 * Misplaced characters are reported (FloatingDotInE, OnlyOneDotFloating,
 * OnlyOneEFloating, FloatingBadLocation) and skipped.
 */
NumberReturn parseRealNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    int i = 0;

    bool dot, e;
    char[] number;

    NumberReturn num;

    bool end;
    // The bound is part of the loop condition so that empty input is not
    // indexed.
    while (!end && i < str.length)
    {
        switch(str[i])
        {
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            case '_':
                auto n = parseDecimalDigits(str[i..$], loc + i, messages);
                number ~= n.data;
                i += n.length;
                break;
            case '.':
                if(e)
                    messages.report(FloatingDotInE, loc + i, loc + i + 1);
                else if(dot)
                    messages.report(OnlyOneDotFloating, loc + i, loc + i + 1);
                else
                {
                    dot = true;
                    number ~= str[i];
                }
                i++;
                break;
            case 'e':
            case 'E':
                if(e)
                    messages.report(OnlyOneEFloating, loc + i, loc + i + 1);
                else
                {
                    e = true;
                    number ~= str[i];
                }
                i++;
                break;
            case '+':
            case '-':
                // A sign is only legal directly after the exponent marker.
                // The length test keeps a leading sign from indexing an
                // empty buffer.
                if (number.length > 0 &&
                        (number[$-1] == 'e' || number[$-1] == 'E'))
                    number ~= str[i];
                else
                    messages.report(FloatingBadLocation, loc + i, loc + i + 1)
                        .arg(str[i]);
                i++;
                break;
            default:
                end = true;
        }
    }

    // Always tell the caller how far the scan got, so that ranges derived
    // from the length stay well-formed on the error paths too.
    num.length = i;

    if (number.length == 0)
        return num;

    char last = number[$-1];
    if (last == '+' || last == '-' || last == 'e' || last == 'E')
    {
        messages.report(FloatingInvalidEnd, loc + i - 1, loc + i);
        return num;
    }

    num.data = number;

    return num;
}
244
245
/**
 * Debug helper: prints the literal text followed by the parsed value and
 * the name of the type it was given.
 */
void printNumber(char[] str, Number num)
{
    Stdout(str)(" have become").newline;

    char[] typeName;
    bool integral;

    switch (num.type)
    {
        case NumberType.Int:    typeName = "int";    integral = true;  break;
        case NumberType.UInt:   typeName = "uint";   integral = true;  break;
        case NumberType.Long:   typeName = "long";   integral = true;  break;
        case NumberType.ULong:  typeName = "ulong";  integral = true;  break;
        case NumberType.Float:  typeName = "float";  integral = false; break;
        case NumberType.Double: typeName = "double"; integral = false; break;
        case NumberType.Real:   typeName = "real";   integral = false; break;
    }

    // Integer types print the integer field, the rest the floating field.
    if (integral)
        Stdout(num.integer)(" of type ")(typeName);
    else
        Stdout(num.floating)(" of type ")(typeName);

    Stdout().newline;
}
275
276
/**
 * Parses a string literal by dispatching on its first character:
 * r"..." and `...` are taken verbatim, "..." is decoded with escape
 * sequences, x"..." is decoded as hex pairs.
 *
 * Params:
 *   str      = complete literal text, including prefix and quotes
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receiver for diagnostics
 *
 * Returns: the decoded string. InvalidStrPrefix is reported, and an empty
 *   String returned, when the first character is none of the above.
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String result;

    // Grow then reset the buffer before filling it.
    // NOTE(review): presumably meant to reserve capacity up front - confirm.
    result.data.length = str.length;
    result.data.length = 0;

    char prefix = str[0];

    if (prefix == 'r')
        result = parseWysiwygString(str[1..$], result);
    else if (prefix == '`')
        result = parseWysiwygString(str, result);
    else if (prefix == '"')
        result = parseDoubleQuotedString(str, result, loc, messages);
    else if (prefix == 'x')
        result = parseHexString(str[1..$], result, loc + 1, messages);
    else
        messages.report(InvalidStrPrefix, loc, loc + 1);

    // printString(str, result);

    return result;
}
306
/**
 * Decodes the body of a hex string literal: every pair of hex digits
 * becomes one byte, whitespace between digits is ignored.
 *
 * Params:
 *   str      = literal text starting at the opening quote (the 'x' prefix
 *              has already been removed by the caller)
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receiver for diagnostics
 *
 * Returns: strBuf with the decoded bytes appended. InvalidHexStrChar is
 *   reported for every character that is neither a hex digit nor
 *   whitespace. A trailing unpaired digit is dropped.
 */
String parseHexString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    char[] hex = "0123456789abcdefABCDEF";
    // Tab, vertical tab and form feed are whitespace too, not just
    // space, CR and LF; without them a tab was reported as invalid.
    char[] whitespace = " \t\v\f\r\n";
    char[] hexBuf;

    // str[0] is the opening quote. The length bound stops the scan at the
    // end of the text when the closing quote is missing.
    for (int i = 1; i < str.length && str[i] != '"'; i++)
    {
        if (hex.contains(str[i]))
        {
            hexBuf ~= str[i];
            if (hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if (!whitespace.contains(str[i]))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
    }

    return strBuf;
}
336 //
337
/**
 * Decodes a double quoted string literal, expanding escape sequences, and
 * applies an optional postfix character.
 *
 * Params:
 *   str      = literal text starting at the opening quote
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receiver for diagnostics
 *
 * Returns: strBuf with the decoded contents. With postfix 'w' or 'd' the
 *   data is converted with Utf.toString16 / Utf.toString32 and the type
 *   set to WChar / DChar; with 'c' or no postfix it is left unchanged.
 */
String parseDoubleQuotedString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    int pos = 1; // step over the opening quote

    while (str[pos] != '"')
    {
        if (str[pos] == '\\')
        {
            // Hand the rest of the text to the escape decoder and skip
            // however many characters it says the escape occupied.
            EscapeReturn esc = parseEscapeSequence(str[pos..$], loc + pos, messages);
            strBuf.data ~= esc.data;
            pos += esc.length;
        }
        else
        {
            strBuf.data ~= str[pos];
            pos++;
        }

        if (pos >= str.length)
            break;
    }

    // Anything after the closing quote is a postfix; the lexer makes sure
    // it is c, w or d.
    if (str.length > pos + 1)
    {
        switch (str[pos+1])
        {
            case 'c':
                break;
            case 'w':
                strBuf.data = cast(ubyte[])Utf.toString16(cast(char[])strBuf.data);
                strBuf.type = StringType.WChar;
                break;
            case 'd':
                strBuf.data = cast(ubyte[])Utf.toString32(cast(char[])strBuf.data);
                strBuf.type = StringType.DChar;
                break;
        }
    }

    return strBuf;
}
378
/**
 * Collects the hex digits of a \x, \u or \U escape.
 *
 * Params:
 *   str      = text starting at the backslash
 *   digits   = number of hex digits the escape requires
 *   consumed = receives the number of source characters the escape occupies
 *   loc      = source location of the backslash
 *   messages = receiver for diagnostics
 *
 * Returns: the valid hex digits found. StringHexInvalid is reported for
 *   each bad digit; StringShortEscape is reported when fewer than
 *   `digits` + 1 characters follow the two-character escape prefix.
 */
private char[] readHexDigits(char[] str, int digits, out int consumed,
        SourceLocation loc, MessageHandler messages)
{
    char[] hex = "0123456789abcdefABCDEF";
    char[] hexBuf;
    int total = digits + 2; // backslash, kind letter, then the digits

    if (str.length > total)
    {
        for (int i = 2; i < total; i++)
            if (hex.contains(str[i]))
                hexBuf ~= str[i];
            else
                // Arguments: 1-based position of the bad digit and the
                // number of digits this escape takes.
                messages.report(StringHexInvalid, loc + i, loc + i + 1)
                    .arg(Integer.toString(i - 1))
                    .arg(Integer.toString(digits));
        consumed = total;
    }
    else
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        consumed = str.length - 1;
    }

    return hexBuf;
}

/**
 * Decodes one escape sequence.
 *
 * Params:
 *   str      = text starting at the backslash and running to the end of
 *              the literal
 *   loc      = source location of the backslash, used for diagnostics
 *   messages = receiver for diagnostics
 *
 * Returns: `data` holds the bytes the escape stands for (empty when it
 *   could not be decoded) and `length` the number of source characters
 *   the caller has to skip.
 *
 * Handles the single character escapes, \xHH, \uHHHH, \UHHHHHHHH, one to
 * three octal digits, and named entities of the form \&name;. Unknown
 * escapes report InvalidStrEscape.
 */
EscapeReturn parseEscapeSequence(char[] str,
        SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;

    // A backslash with nothing after it cannot be decoded.
    if (str.length < 2)
    {
        messages.report(InvalidStrEscape, loc, loc + 2);
        res.length = 1;
        return res;
    }

    // Single character escapes all occupy two source characters and
    // produce one byte.
    bool simple = true;
    ubyte value;
    switch (str[1])
    {
        case '\'': value = '\''; break;
        case '"':  value = '\"'; break;
        case '?':  value = '\?'; break;
        case '\\': value = '\\'; break;
        case 'a':  value = '\a'; break;
        case 'b':  value = '\b'; break;
        case 'f':  value = '\f'; break;
        case 'n':  value = '\n'; break;
        case 'r':  value = '\r'; break;
        case 't':  value = '\t'; break;
        case 'v':  value = '\v'; break;
        default:   simple = false;
    }
    if (simple)
    {
        res.length = 2;
        res.data ~= value;
        return res;
    }

    switch (str[1])
    {
        case 'x':
        {
            char[] hexBuf = readHexDigits(str, 2, res.length, loc, messages);
            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
            break;
        }
        case 'u':
        case 'U':
        {
            int digits = (str[1] == 'u') ? 4 : 8;
            char[] hexBuf = readHexDigits(str, digits, res.length, loc, messages);
            uint code = cast(uint)Integer.toLong(hexBuf, 16);
            if (!isValidUtf8(code))
                messages.report(InvalidUtf8Hex, loc, loc + digits + 2);
            else
                res.data ~= parseToUtf8(code);
            break;
        }
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        {
            char[] oct = "01234567";
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;
            // Up to two more octal digits may follow; never look past the
            // end of the text.
            for (int i = 2; i < 4 && i < str.length; i++)
            {
                if (!oct.contains(str[i]))
                    break;
                octBuf ~= str[i];
                res.length += 1;
            }

            uint code = cast(uint)Integer.toLong(octBuf, 8);
            // Only the low eight bits are kept for values above \377.
            res.data ~= cast(ubyte)code;
            break;
        }
        case '&':
        {
            // Find the ';' that ends the entity name, stopping at the
            // closing quote or the end of the text.
            int i = 2;
            while (i < str.length && str[i] != ';' && str[i] != '"')
                i++;

            if (i >= str.length || str[i] != ';')
            {
                // Unterminated entity: skip only "\&" so the caller
                // resumes right after it and still sees the closing
                // quote. Nothing else is reported for this escape.
                messages.report(NoCharEntityEnd, loc + i, loc + i + 1);
                res.length = 2;
                break;
            }

            char[] name = str[2..i];
            auto code = name in characterEntities;
            if (code)
                res.data ~= parseToUtf8(*code);
            else
                messages.report(InvalidCharEntity, loc + 2, loc + i);

            res.length = i + 1; // remember the ;
            break;
        }
        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length += 2;
    }

    return res;
}
557
/**
 * Copies the contents of a wysiwyg string literal verbatim.
 *
 * Params:
 *   str    = literal text starting at the opening delimiter; the same
 *            character closes the string
 *   strBuf = buffer the contents are appended to
 *
 * Returns: strBuf with everything between the two delimiters appended.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    char delimiter = str[0];

    // Locate the closing delimiter first, then append the body in one go.
    int close = 1;
    while (str[close] != delimiter)
        close++;

    strBuf.data ~= cast(ubyte[])str[1..close];
    return strBuf;
}
571
/**
 * Encodes a code point as UTF-8.
 *
 * Params:
 *   i = code point to encode
 *
 * Returns: one to four bytes, lead byte first, or an empty array when i is
 *   above 0x10FFFF. The surrogate range is not checked here.
 */
ubyte[] parseToUtf8(uint i)
{
    // Plain shifts and masks replace bts() through a ubyte* cast to uint*.
    // That cast made bts read and write four bytes at the address of a
    // one-byte local and only hit the intended bit on little-endian
    // targets.
    if (i <= 0x00007F)
        return [cast(ubyte)i];

    if (i <= 0x0007FF)
        return [cast(ubyte)(0xC0 | (i >> 6)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    if (i <= 0x00FFFF)
        return [cast(ubyte)(0xE0 | (i >> 12)),
                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    if (i <= 0x10FFFF)
        return [cast(ubyte)(0xF0 | (i >> 18)),
                cast(ubyte)(0x80 | ((i >> 12) & 0x3F)),
                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    // Out of range: give every path a defined result.
    return null;
}
613
/**
 * Tells whether a code point can be encoded as UTF-8.
 *
 * Params:
 *   i = code point to check
 *
 * Returns: true for values up to 0x10FFFF outside the surrogate range
 *   0xD800 .. 0xDFFF, false otherwise.
 */
bool isValidUtf8(uint i)
{
    // Surrogates are reserved for UTF-16 and must not appear in UTF-8.
    if (i >= 0xD800 && i <= 0xDFFF)
        return false;

    return i <= 0x10FFFF;
}
620
/**
 * Debug helper: prints the literal text and then the decoded data,
 * viewed as char, wchar or dchar text according to strBuf.type.
 */
void printString(char[] str, String strBuf)
{
    switch (strBuf.type)
    {
        case StringType.Char:
            Stdout(str)(" have become").newline();
            Stdout(cast(char[])strBuf.data).newline;
            break;
        case StringType.WChar:
            Stdout(str)(" have become").newline();
            Stdout(cast(wchar[])strBuf.data).newline;
            break;
        case StringType.DChar:
            Stdout(str)(" have become").newline();
            Stdout(cast(dchar[])strBuf.data).newline;
            break;
    }
}
640
/// Named character entities usable as \&name; mapped to their code points.
static ushort[char[]] characterEntities;

/// Fills characterEntities at module start-up. The names and code points
/// correspond to the HTML 4 named character entities.
static this()
{
    characterEntities =
    [
        // Markup-significant and internationalization characters
        "quot"[]: 34, "amp": 38, "lt": 60, "gt": 62,
        "OElig": 338, "oelig": 339, "Scaron": 352, "scaron": 353,
        "Yuml": 376, "circ": 710, "tilde": 732, "ensp": 8194,
        "emsp": 8195, "thinsp": 8201, "zwnj": 8204, "zwj": 8205,
        "lrm": 8206, "rlm": 8207, "ndash": 8211, "mdash": 8212,
        "lsquo": 8216, "rsquo": 8217, "sbquo": 8218, "ldquo": 8220,
        "rdquo": 8221, "bdquo": 8222, "dagger": 8224, "Dagger": 8225,
        "permil": 8240, "lsaquo": 8249, "rsaquo": 8250, "euro": 8364,

        // Latin-1 characters, 160 .. 255
        "nbsp": 160, "iexcl": 161, "cent": 162, "pound": 163,
        "curren": 164, "yen": 165, "brvbar": 166, "sect": 167,
        "uml": 168, "copy": 169, "ordf": 170, "laquo": 171,
        "not": 172, "shy": 173, "reg": 174, "macr": 175,
        "deg": 176, "plusmn": 177, "sup2": 178, "sup3": 179,
        "acute": 180, "micro": 181, "para": 182, "middot": 183,
        "cedil": 184, "sup1": 185, "ordm": 186, "raquo": 187,
        "frac14": 188, "frac12": 189, "frac34": 190, "iquest": 191,
        "Agrave": 192, "Aacute": 193, "Acirc": 194, "Atilde": 195,
        "Auml": 196, "Aring": 197, "AElig": 198, "Ccedil": 199,
        "Egrave": 200, "Eacute": 201, "Ecirc": 202, "Euml": 203,
        "Igrave": 204, "Iacute": 205, "Icirc": 206, "Iuml": 207,
        "ETH": 208, "Ntilde": 209, "Ograve": 210, "Oacute": 211,
        "Ocirc": 212, "Otilde": 213, "Ouml": 214, "times": 215,
        "Oslash": 216, "Ugrave": 217, "Uacute": 218, "Ucirc": 219,
        "Uuml": 220, "Yacute": 221, "THORN": 222, "szlig": 223,
        "agrave": 224, "aacute": 225, "acirc": 226, "atilde": 227,
        "auml": 228, "aring": 229, "aelig": 230, "ccedil": 231,
        "egrave": 232, "eacute": 233, "ecirc": 234, "euml": 235,
        "igrave": 236, "iacute": 237, "icirc": 238, "iuml": 239,
        "eth": 240, "ntilde": 241, "ograve": 242, "oacute": 243,
        "ocirc": 244, "otilde": 245, "ouml": 246, "divide": 247,
        "oslash": 248, "ugrave": 249, "uacute": 250, "ucirc": 251,
        "uuml": 252, "yacute": 253, "thorn": 254, "yuml": 255,

        // Greek letters, mathematical and technical symbols
        "fnof": 402,
        "Alpha": 913, "Beta": 914, "Gamma": 915, "Delta": 916,
        "Epsilon": 917, "Zeta": 918, "Eta": 919, "Theta": 920,
        "Iota": 921, "Kappa": 922, "Lambda": 923, "Mu": 924,
        "Nu": 925, "Xi": 926, "Omicron": 927, "Pi": 928,
        "Rho": 929, "Sigma": 931, "Tau": 932, "Upsilon": 933,
        "Phi": 934, "Chi": 935, "Psi": 936, "Omega": 937,
        "alpha": 945, "beta": 946, "gamma": 947, "delta": 948,
        "epsilon": 949, "zeta": 950, "eta": 951, "theta": 952,
        "iota": 953, "kappa": 954, "lambda": 955, "mu": 956,
        "nu": 957, "xi": 958, "omicron": 959, "pi": 960,
        "rho": 961, "sigmaf": 962, "sigma": 963, "tau": 964,
        "upsilon": 965, "phi": 966, "chi": 967, "psi": 968,
        "omega": 969, "thetasym": 977, "upsih": 978, "piv": 982,
        "bull": 8226, "hellip": 8230, "prime": 8242, "Prime": 8243,
        "oline": 8254, "frasl": 8260, "weierp": 8472, "image": 8465,
        "real": 8476, "trade": 8482, "alefsym": 8501,
        "larr": 8592, "uarr": 8593, "rarr": 8594, "darr": 8595,
        "harr": 8596, "crarr": 8629,
        "lArr": 8656, "uArr": 8657, "rArr": 8658, "dArr": 8659,
        "hArr": 8660,
        "forall": 8704, "part": 8706, "exist": 8707, "empty": 8709,
        "nabla": 8711, "isin": 8712, "notin": 8713, "ni": 8715,
        "prod": 8719, "sum": 8721, "minus": 8722, "lowast": 8727,
        "radic": 8730, "prop": 8733, "infin": 8734, "ang": 8736,
        "and": 8743, "or": 8744, "cap": 8745, "cup": 8746,
        "int": 8747, "there4": 8756, "sim": 8764, "cong": 8773,
        "asymp": 8776, "ne": 8800, "equiv": 8801, "le": 8804,
        "ge": 8805, "sub": 8834, "sup": 8835, "nsub": 8836,
        "sube": 8838, "supe": 8839, "oplus": 8853, "otimes": 8855,
        "perp": 8869, "sdot": 8901,
        "lceil": 8968, "rceil": 8969, "lfloor": 8970, "rfloor": 8971,
        "lang": 9001, "rang": 9002, "loz": 9674,
        "spades": 9824, "clubs": 9827, "hearts": 9829, "diams": 9830
    ];
}