Mercurial > projects > dang
view basic/LiteralParsing.d @ 192:fda35d57847e
Fixed string parsing so that string literals are created with the right type in the AST.
Also made it possible to pass options to the test program, which forwards them to Dang.
Eg.
./tests/run --semantic-only
will pass --semantic-only to Dang on each run.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Fri, 25 Jul 2008 15:00:54 +0200 |
parents | c0b531362ca6 |
children |
line wrap: on
line source
module basic.LiteralParsing;

import basic.SourceLocation,
       basic.Message,
       basic.conv;

import tango.io.Stdout,
       tango.core.BitManip,
       Integer = tango.text.convert.Integer,
       Utf = tango.text.convert.Utf,
       tango.text.Util;

/// Element width of a parsed string literal.
enum StringType
{
    Char,
    WChar,
    DChar
}

/// Type assigned to a parsed numeric literal.
enum NumberType
{
    Int,
    UInt,
    Long,
    ULong,
    Float,
    Double,
    Real
}

/// A parsed string literal: its element type and the raw bytes of its value.
struct String
{
    StringType type;
    ubyte[] data;
}

/// A parsed numeric literal. `integer` is filled in by the integer branch of
/// parseNumber and `floating` by the floating point branch.
struct Number
{
    NumberType type;
    ulong integer;
    real floating;
}

/// Result of parsing one escape sequence: the bytes it stands for and the
/// number of source characters it occupied (counted from the backslash).
private struct EscapeReturn
{
    ubyte[] data;
    int length;
}

/// Result of scanning a numeric literal: the cleaned-up text (no '_'
/// separators) and the number of source characters consumed.
private struct NumberReturn
{
    char[] data;
    int length;
}

/**
 * Parses a numeric literal.
 *
 * A literal containing '.', 'e' or 'E' is parsed as a floating point value,
 * anything else as a decimal integer. Problems are reported through
 * `messages`; the function itself always returns a Number.
 *
 * Params:
 *   str      = text of the literal; must start with a decimal digit
 *   loc      = source location of str[0], used for diagnostics
 *   messages = sink for diagnostics
 *
 * Returns: the parsed value. If the literal does not start with a digit,
 *          InvalidStartInteger is reported and a default Number is returned.
 */
Number parseNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    Number num;

    // An empty literal is treated like one with an invalid first character
    // instead of indexing past the end of the array.
    if (str.length == 0 || str[0] < '0' || str[0] > '9')
    {
        messages.report(InvalidStartInteger, loc, loc+1);
        return num;
    }

    if (str.contains('.') || str.contains('e') || str.contains('E'))
    {
        auto n = parseRealNumber(str, loc, messages);

        try
        {
            num.floating = toReal(n.data);
            num.type = NumberType.Double;
        }
        catch (Exception e)
        {
            num.floating = real.init;
            messages.report(FloatingToLarge, loc, loc + n.length - 1);
        }

        // Values that do not fit a double are promoted to real.
        if (num.floating > double.max)
            num.type = NumberType.Real;
    }
    else
    {
        auto n = parseDecimalDigits(str, loc, messages);

        try
        {
            num.integer = toUlong(n.data);
        }
        catch (Exception e)
        {
            num.integer = 0;
            messages.report(IntegerToLarge, loc, loc + n.length - 1);
        }

        if (num.integer > uint.max)
            num.type = NumberType.Long;
        if (num.integer > long.max)
            num.type = NumberType.ULong;
    }

    // printNumber(str, num);
    return num;
}

/**
 * Scans a run of decimal digits and '_' separators from the start of `str`.
 *
 * Params:
 *   str      = text to scan
 *   loc      = unused here; kept for a uniform parser signature
 *   messages = unused here; kept for a uniform parser signature
 *
 * Returns: `data` holds the digits with every '_' removed; `length` is the
 *          number of characters consumed, i.e. the index of the first
 *          character that is neither a digit nor '_' (or str.length when
 *          the whole input was consumed).
 */
NumberReturn parseDecimalDigits(char[] str, SourceLocation loc, MessageHandler messages)
{
    NumberReturn res;

    int i = 0;
    for (; i < str.length; i++)
    {
        char c = str[i];
        if (c >= '0' && c <= '9')
            res.data ~= c;
        else if (c != '_')
            break;              // terminator: not part of the digit run
    }

    // The terminator is never counted, even when it is the last character
    // of str. Counting it made callers skip a trailing '.', 'e' or suffix.
    res.length = i;
    return res;
}

/**
 * Scans a floating point literal: digits, at most one '.', at most one
 * exponent marker and an optional sign directly after the exponent marker.
 *
 * Misplaced characters are reported through `messages` and left out of the
 * result. Scanning stops at the first character that cannot belong to a
 * floating point literal.
 *
 * Returns: `data` holds the cleaned-up literal text and `length` the number
 *          of characters consumed. If the literal ends in a sign or an
 *          exponent marker, FloatingInvalidEnd is reported and an empty
 *          NumberReturn is returned.
 */
NumberReturn parseRealNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    int i = 0;
    bool dot, e;
    char[] number;
    NumberReturn num;

    bool end;
    while (!end && i < str.length)
    {
        switch (str[i])
        {
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            case '_':
                // Always consumes at least the current character.
                auto n = parseDecimalDigits(str[i..$], loc, messages);
                number ~= n.data;
                i += n.length;
                break;

            case '.':
                if (e)
                    messages.report(FloatingDotInE, loc + i, loc + i + 1);
                else if (dot)
                    messages.report(OnlyOneDotFloating, loc + i, loc + i + 1);
                else
                {
                    dot = true;
                    number ~= str[i];
                }
                i++;
                break;

            case 'e': case 'E':
                if (e)
                    messages.report(OnlyOneEFloating, loc + i, loc + i + 1);
                else
                {
                    e = true;
                    number ~= str[i];
                }
                i++;
                break;

            case '+': case '-':
                // A sign is only valid directly after the exponent marker.
                if (number.length == 0
                        || (number[$-1] != 'e' && number[$-1] != 'E'))
                    messages.report(FloatingBadLocation, loc + i, loc + i + 1)
                        .arg(str[i]);
                else
                    number ~= str[i];
                i++;
                break;

            default:
                end = true;
        }
    }

    if (number.length > 0
            && (number[$-1] == '+' || number[$-1] == '-'
                || number[$-1] == 'e' || number[$-1] == 'E'))
    {
        messages.report(FloatingInvalidEnd, loc + i - 1, loc + i);
        return num;
    }

    num.data = number;
    num.length = i;
    return num;
}

/// Debug helper: prints the literal text followed by the parsed value and
/// the name of its type to Stdout.
void printNumber(char[] str, Number num)
{
    Stdout(str)(" have become").newline;
    switch (num.type)
    {
        case NumberType.Int:
            Stdout(num.integer)(" of type ")("int");
            break;
        case NumberType.UInt:
            Stdout(num.integer)(" of type ")("uint");
            break;
        case NumberType.Long:
            Stdout(num.integer)(" of type ")("long");
            break;
        case NumberType.ULong:
            Stdout(num.integer)(" of type ")("ulong");
            break;
        case NumberType.Float:
            Stdout(num.floating)(" of type ")("float");
            break;
        case NumberType.Double:
            Stdout(num.floating)(" of type ")("double");
            break;
        case NumberType.Real:
            Stdout(num.floating)(" of type ")("real");
            break;
    }
    Stdout().newline;
}

/**
 * Parses a string literal, dispatching on its first character:
 * 'r' or '`' for wysiwyg strings, '"' for double quoted strings and 'x' for
 * hex strings. Any other first character is reported as InvalidStrPrefix.
 *
 * Params:
 *   str      = text of the literal including prefix and delimiters
 *   loc      = source location of str[0], used for diagnostics
 *   messages = sink for diagnostics
 *
 * Returns: the parsed String; its data is empty if the prefix was invalid.
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String strBuf;

    // Grow, then truncate: presumably meant to pre-reserve room for the
    // result before appending - TODO confirm this has that effect.
    strBuf.data.length = str.length;
    strBuf.data.length = 0;

    // An empty literal has no valid prefix either.
    if (str.length == 0)
    {
        messages.report(InvalidStrPrefix, loc, loc + 1);
        return strBuf;
    }

    switch (str[0])
    {
        case 'r':
            strBuf = parseWysiwygString(str[1..$], strBuf);
            break;
        case '`':
            strBuf = parseWysiwygString(str, strBuf);
            break;
        case '"':
            strBuf = parseDoubleQuotedString(str, strBuf, loc, messages);
            break;
        case 'x':
            strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages);
            break;
        default:
            messages.report(InvalidStrPrefix, loc, loc + 1);
    }

    // printString(str, strBuf);
    return strBuf;
}

/**
 * Parses the body of a hex string. `str` starts at the opening quote (the
 * 'x' prefix has already been stripped by the caller).
 *
 * Every pair of hex digits becomes one byte; '\r', '\n' and ' ' are skipped
 * and any other character is reported as InvalidHexStrChar. A single hex
 * digit left over before the closing quote is not emitted.
 */
String parseHexString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    char[] hex = "0123456789abcdefABCDEF";
    char[] whitespace = "\r\n ";
    char[] hexBuf;

    // str[0] is the opening quote. Also stop at the end of the input so a
    // missing closing quote cannot run the index off the array.
    for (int i = 1; i < str.length && str[i] != '"'; i++)
    {
        if (hex.contains(str[i]))
        {
            hexBuf ~= str[i];
            if (hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if (!whitespace.contains(str[i]))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
    }

    return strBuf;
}

/**
 * Parses a double quoted string. `str` starts at the opening quote and may
 * carry a one-character postfix after the closing quote.
 *
 * Escape sequences are expanded through parseEscapeSequence. A 'w' or 'd'
 * postfix converts the collected bytes to UTF-16 or UTF-32 and sets the
 * matching StringType; 'c' leaves the data and type untouched.
 */
String parseDoubleQuotedString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    int i = 1; // str[0] is the opening quote

    while (i < str.length && str[i] != '"')
    {
        if (str[i] == '\\')
        {
            // The escape parser reports how many characters it consumed.
            EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages);
            strBuf.data ~= res.data;
            i += res.length;
        }
        else
        {
            strBuf.data ~= cast(ubyte)str[i];
            i++;
        }
    }

    if (str.length > i + 1) // Then we have a postfix. Lexer makes sure this is c, w or d.
        switch (str[i+1])
        {
            case 'c':
                break;
            case 'w':
                strBuf.data = cast(ubyte[])Utf.toString16(cast(char[])strBuf.data);
                strBuf.type = StringType.WChar;
                break;
            case 'd':
                strBuf.data = cast(ubyte[])Utf.toString32(cast(char[])strBuf.data);
                strBuf.type = StringType.DChar;
                break;
        }

    return strBuf;
}

/**
 * Collects the hex digits of a \x, \u or \U escape.
 *
 * Params:
 *   str           = escape text starting at the backslash
 *   digits        = number of hex digits expected after the two-character
 *                   introducer
 *   reportedTotal = second argument passed to StringHexInvalid
 *   loc           = source location of the backslash
 *   messages      = sink for diagnostics
 *   consumed      = set to the number of characters the escape occupies
 *
 * Returns: the valid hex digits found; invalid ones are reported and left
 *          out. When the input is too short StringShortEscape is reported
 *          and the result is empty.
 *
 * NOTE(review): the callers pass 2 for \x but 6 and 10 for \u and \U as
 * reportedTotal, exactly as the code did before. That looks inconsistent
 * (digit count versus escape length) - confirm against the message text.
 */
private char[] collectHexDigits(char[] str, int digits, int reportedTotal,
        SourceLocation loc, MessageHandler messages, out int consumed)
{
    char[] hex = "0123456789abcdefABCDEF";
    char[] hexBuf;
    int stop = 2 + digits;

    if (str.length - 1 >= stop)
    {
        for (int i = 2; i < stop; i++)
            if (hex.contains(str[i]))
                hexBuf ~= str[i];
            else
                messages.report(StringHexInvalid, loc + i, loc + i + 1)
                    .arg(Integer.toString(i-1))
                    .arg(Integer.toString(reportedTotal));
        consumed = stop;
    }
    else
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        consumed = str.length - 1;
    }

    return hexBuf;
}

/**
 * Parses one escape sequence. `str` starts at the backslash.
 *
 * Handles the single character escapes, \xHH, \uHHHH, \UHHHHHHHH, octal
 * escapes of one to three digits and named character entities (\&name;).
 *
 * Returns: the bytes the escape stands for and the number of characters of
 *          `str` the caller has to skip. Invalid escapes are reported
 *          through `messages` and contribute no bytes.
 */
EscapeReturn parseEscapeSequence(char[] str, SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;

    // A lone backslash at the very end of the input: nothing to look at.
    if (str.length < 2)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = 1;
        return res;
    }

    switch (str[1])
    {
        case '\'':
            res.length = 2;
            res.data ~= '\'';
            break;
        case '"':
            res.length = 2;
            res.data ~= '\"';
            break;
        case '?':
            res.length = 2;
            res.data ~= '\?';
            break;
        case '\\':
            res.length = 2;
            res.data ~= '\\';
            break;
        case 'a':
            res.length = 2;
            res.data ~= '\a';
            break;
        case 'b':
            res.length = 2;
            res.data ~= '\b';
            break;
        case 'f':
            res.length = 2;
            res.data ~= '\f';
            break;
        case 'n':
            res.length = 2;
            res.data ~= '\n';
            break;
        case 'r':
            res.length = 2;
            res.data ~= '\r';
            break;
        case 't':
            res.length = 2;
            res.data ~= '\t';
            break;
        case 'v':
            res.length = 2;
            res.data ~= '\v';
            break;

        case 'x':
        {
            int consumed;
            char[] hexBuf = collectHexDigits(str, 2, 2, loc, messages, consumed);
            res.length = consumed;
            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
            break;
        }

        case 'u':
        {
            int consumed;
            char[] hexBuf = collectHexDigits(str, 4, 6, loc, messages, consumed);
            res.length = consumed;
            uint code = cast(uint)Integer.toLong(hexBuf, 16);
            if (!isValidUtf8(code))
                messages.report(InvalidUtf8Hex, loc, loc+6);
            else
                res.data ~= parseToUtf8(code);
            break;
        }

        case 'U':
        {
            int consumed;
            char[] hexBuf = collectHexDigits(str, 8, 10, loc, messages, consumed);
            res.length = consumed;
            uint code = cast(uint)Integer.toLong(hexBuf, 16);
            if (!isValidUtf8(code))
                messages.report(InvalidUtf8Hex, loc, loc+10);
            else
                res.data ~= parseToUtf8(code);
            break;
        }

        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        {
            char[] oct = "01234567";
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;
            // Up to two more octal digits, never reading past the input.
            for (int k = 2; k < 4 && k < str.length; k++)
            {
                if (!oct.contains(str[k]))
                    break;
                octBuf ~= str[k];
                res.length += 1;
            }
            // Only the low eight bits of the value are kept.
            res.data ~= cast(ubyte)Integer.toLong(octBuf, 8);
            break;
        }

        case '&':
        {
            int i = 2;
            char[] s;
            while (i < str.length && str[i] != ';' && str[i] != '"')
            {
                s ~= str[i];
                i++;
            }

            if (i >= str.length || str[i] != ';')
            {
                // Unterminated entity. Skip only the "\&" introducer so the
                // caller resumes inside the literal and still sees the
                // closing quote, instead of being sent past it.
                messages.report(NoCharEntityEnd, loc+i, loc+i+1);
                res.length = 2;
                break;
            }

            if (s in characterEntities)
                res.data ~= parseToUtf8(characterEntities[s]);
            else
                messages.report(InvalidCharEntity, loc + 2, loc + i);

            res.length = i + 1; // remember the ;
            break;
        }

        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length += 2;
    }

    return res;
}

/**
 * Parses a wysiwyg string. str[0] is taken as the delimiter and every
 * following character up to the next occurrence of that delimiter (or the
 * end of the input) is appended to strBuf unchanged.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    if (str.length == 0)
        return strBuf;

    char start = str[0];

    for (int i = 1; i < str.length && str[i] != start; i++)
        strBuf.data ~= cast(ubyte)str[i];

    return strBuf;
}

/**
 * Encodes the code point `i` as UTF-8.
 *
 * Params:
 *   i = code point; must not exceed 0x10FFFF (see isValidUtf8)
 *
 * Returns: one to four bytes, lead byte first.
 */
ubyte[] parseToUtf8(uint i)
{
    if (i <= 0x00007F)
        return [cast(ubyte)i];
    else if (i <= 0x0007FF)
    {
        // 110xxxxx 10xxxxxx
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        ubyte b = cast(ubyte)(0xC0 | (i >> 6));
        return [b,a];
    }
    else if (i <= 0x00FFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        ubyte b = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        ubyte c = cast(ubyte)(0xE0 | (i >> 12));
        return [c,b,a];
    }
    else if (i <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        ubyte b = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        ubyte c = cast(ubyte)(0x80 | ((i >> 12) & 0x3F));
        ubyte d = cast(ubyte)(0xF0 | (i >> 18));
        return [d,c,b,a];
    }

    // Values above 0x10FFFF are outside every branch above; callers are
    // expected to check isValidUtf8 first.
    assert(0);
}

/// Returns true when `i` does not exceed 0x10FFFF.
/// NOTE(review): the surrogate range 0xD800-0xDFFF is accepted here -
/// confirm whether that is intended.
bool isValidUtf8(uint i)
{
    return i <= 0x10FFFF;
}

/// Debug helper: prints the literal text and the parsed string data,
/// reinterpreted according to the string's type, to Stdout.
void printString(char[] str, String strBuf)
{
    switch (strBuf.type)
    {
        case StringType.Char:
            Stdout(str)(" have become").newline()
                (cast(char[])strBuf.data).newline;
            break;
        case StringType.WChar:
            Stdout(str)(" have become").newline()
                (cast(wchar[])strBuf.data).newline;
            break;
        case StringType.DChar:
            Stdout(str)(" have become").newline()
                (cast(dchar[])strBuf.data).newline;
            break;
    }
}

/// Named character entities usable as \&name; mapped to their code points.
/// Filled in by the module constructor below.
static ushort[char[]] characterEntities;

static this()
{
    characterEntities =
    [
        "quot"[]: 34, "amp": 38, "lt": 60, "gt": 62,
        "OElig": 338, "oelig": 339, "Scaron": 352, "scaron": 353,
        "Yuml": 376, "circ": 710, "tilde": 732,
        "ensp": 8194, "emsp": 8195, "thinsp": 8201,
        "zwnj": 8204, "zwj": 8205, "lrm": 8206, "rlm": 8207,
        "ndash": 8211, "mdash": 8212,
        "lsquo": 8216, "rsquo": 8217, "sbquo": 8218,
        "ldquo": 8220, "rdquo": 8221, "bdquo": 8222,
        "dagger": 8224, "Dagger": 8225, "permil": 8240,
        "lsaquo": 8249, "rsaquo": 8250, "euro": 8364,
        "nbsp": 160, "iexcl": 161, "cent": 162, "pound": 163,
        "curren": 164, "yen": 165, "brvbar": 166, "sect": 167,
        "uml": 168, "copy": 169, "ordf": 170, "laquo": 171,
        "not": 172, "shy": 173, "reg": 174, "macr": 175,
        "deg": 176, "plusmn": 177, "sup2": 178, "sup3": 179,
        "acute": 180, "micro": 181, "para": 182, "middot": 183,
        "cedil": 184, "sup1": 185, "ordm": 186, "raquo": 187,
        "frac14": 188, "frac12": 189, "frac34": 190, "iquest": 191,
        "Agrave": 192, "Aacute": 193, "Acirc": 194, "Atilde": 195,
        "Auml": 196, "Aring": 197, "AElig": 198, "Ccedil": 199,
        "Egrave": 200, "Eacute": 201, "Ecirc": 202, "Euml": 203,
        "Igrave": 204, "Iacute": 205, "Icirc": 206, "Iuml": 207,
        "ETH": 208, "Ntilde": 209, "Ograve": 210, "Oacute": 211,
        "Ocirc": 212, "Otilde": 213, "Ouml": 214, "times": 215,
        "Oslash": 216, "Ugrave": 217, "Uacute": 218, "Ucirc": 219,
        "Uuml": 220, "Yacute": 221, "THORN": 222, "szlig": 223,
        "agrave": 224, "aacute": 225, "acirc": 226, "atilde": 227,
        "auml": 228, "aring": 229, "aelig": 230, "ccedil": 231,
        "egrave": 232, "eacute": 233, "ecirc": 234, "euml": 235,
        "igrave": 236, "iacute": 237, "icirc": 238, "iuml": 239,
        "eth": 240, "ntilde": 241, "ograve": 242, "oacute": 243,
        "ocirc": 244, "otilde": 245, "ouml": 246, "divide": 247,
        "oslash": 248, "ugrave": 249, "uacute": 250, "ucirc": 251,
        "uuml": 252, "yacute": 253, "thorn": 254, "yuml": 255,
        "fnof": 402,
        "Alpha": 913, "Beta": 914, "Gamma": 915, "Delta": 916,
        "Epsilon": 917, "Zeta": 918, "Eta": 919, "Theta": 920,
        "Iota": 921, "Kappa": 922, "Lambda": 923, "Mu": 924,
        "Nu": 925, "Xi": 926, "Omicron": 927, "Pi": 928,
        "Rho": 929, "Sigma": 931, "Tau": 932, "Upsilon": 933,
        "Phi": 934, "Chi": 935, "Psi": 936, "Omega": 937,
        "alpha": 945, "beta": 946, "gamma": 947, "delta": 948,
        "epsilon": 949, "zeta": 950, "eta": 951, "theta": 952,
        "iota": 953, "kappa": 954, "lambda": 955, "mu": 956,
        "nu": 957, "xi": 958, "omicron": 959, "pi": 960,
        "rho": 961, "sigmaf": 962, "sigma": 963, "tau": 964,
        "upsilon": 965, "phi": 966, "chi": 967, "psi": 968,
        "omega": 969, "thetasym": 977, "upsih": 978, "piv": 982,
        "bull": 8226, "hellip": 8230, "prime": 8242, "Prime": 8243,
        "oline": 8254, "frasl": 8260, "weierp": 8472, "image": 8465,
        "real": 8476, "trade": 8482, "alefsym": 8501,
        "larr": 8592, "uarr": 8593, "rarr": 8594, "darr": 8595,
        "harr": 8596, "crarr": 8629,
        "lArr": 8656, "uArr": 8657, "rArr": 8658, "dArr": 8659,
        "hArr": 8660,
        "forall": 8704, "part": 8706, "exist": 8707, "empty": 8709,
        "nabla": 8711, "isin": 8712, "notin": 8713, "ni": 8715,
        "prod": 8719, "sum": 8721, "minus": 8722, "lowast": 8727,
        "radic": 8730, "prop": 8733, "infin": 8734, "ang": 8736,
        "and": 8743, "or": 8744, "cap": 8745, "cup": 8746,
        "int": 8747, "there4": 8756, "sim": 8764, "cong": 8773,
        "asymp": 8776, "ne": 8800, "equiv": 8801, "le": 8804,
        "ge": 8805, "sub": 8834, "sup": 8835, "nsub": 8836,
        "sube": 8838, "supe": 8839, "oplus": 8853, "otimes": 8855,
        "perp": 8869, "sdot": 8901,
        "lceil": 8968, "rceil": 8969, "lfloor": 8970, "rfloor": 8971,
        "lang": 9001, "rang": 9002, "loz": 9674,
        "spades": 9824, "clubs": 9827, "hearts": 9829, "diams": 9830
    ];
}