Mercurial > projects > dang
view basic/LiteralParsing.d @ 174:20ff3c31f600
Putting symbol on MemberRef -calls.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Thu, 24 Jul 2008 21:06:42 +0200 |
parents | c0b531362ca6 |
children | fda35d57847e |
line wrap: on
line source
/**
 * Conversion of number and string literal lexemes into values.
 *
 * All diagnostics are delivered through the MessageHandler that is passed in;
 * locations are computed relative to the SourceLocation of the lexeme.
 */
module basic.LiteralParsing;

import basic.SourceLocation,
       basic.Message,
       basic.conv;

import tango.io.Stdout,
       tango.core.BitManip,
       Integer = tango.text.convert.Integer,
       Utf = tango.text.convert.Utf,
       tango.text.Util;

/// Code-unit width selected for a parsed string literal.
enum StringType
{
    Char,
    WChar,
    DChar
}

/// Type assigned to a parsed number literal.
enum NumberType
{
    Int,
    UInt,
    Long,
    ULong,
    Float,
    Double,
    Real
}

/// A parsed string literal: raw bytes plus the code-unit type they represent.
struct String
{
    StringType type;
    ubyte[] data;
}

/// A parsed number literal; `integer` or `floating` is filled in depending
/// on which branch of parseNumber handled the lexeme.
struct Number
{
    NumberType type;
    ulong integer;
    real floating;
}

/// Bytes produced by one escape sequence and the number of source
/// characters the sequence occupied.
private struct EscapeReturn
{
    ubyte[] data;
    int length;
}

/// Characters collected by a sub-parser and the number of source
/// characters it consumed.
private struct NumberReturn
{
    char[] data;
    int length;
}

/**
 * Parses a number literal.
 *
 * A lexeme containing '.', 'e' or 'E' is treated as floating point, anything
 * else as a decimal integer.
 *
 * Params:
 *   str      = the lexeme; must start with a decimal digit
 *   loc      = location of the first character of the lexeme
 *   messages = receiver of diagnostics
 * Returns: the parsed Number. On a conversion failure `integer` is 0 or
 *          `floating` is real.init and a message has been reported.
 */
Number parseNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    Number num;

    // An empty lexeme has no first character to dispatch on.
    if (str.length == 0)
    {
        messages.report(InvalidStartInteger, loc, loc + 1);
        return num;
    }

    switch (str[0])
    {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            if (str.contains('.') || str.contains('e') || str.contains('E'))
            {
                auto n = parseRealNumber(str, loc, messages);

                try
                {
                    num.floating = toReal(n.data);
                    num.type = NumberType.Double;
                }
                catch (Exception e)
                {
                    num.floating = real.init;
                    messages.report(FloatingToLarge, loc, loc + n.length - 1);
                }

                if (num.floating > double.max)
                    num.type = NumberType.Real;
            }
            else
            {
                auto n = parseDecimalDigits(str, loc, messages);

                try
                {
                    num.integer = toUlong(n.data);
                }
                catch (Exception e)
                {
                    num.integer = 0;
                    messages.report(IntegerToLarge, loc, loc + n.length - 1);
                }

                // NOTE(review): values in (int.max, uint.max] keep the
                // default type Int here - confirm this is intended.
                if (num.integer > uint.max)
                    num.type = NumberType.Long;
                if (num.integer > long.max)
                    num.type = NumberType.ULong;
            }
            break;

        default:
            messages.report(InvalidStartInteger, loc, loc + 1);
    }

    // printNumber(str, num);
    return num;
}

/**
 * Collects a run of decimal digits, skipping '_' separators.
 *
 * Params:
 *   str      = text to scan from its first character
 *   loc      = unused by this function
 *   messages = unused by this function
 * Returns: `data` holds the digits without underscores; `length` is the
 *          number of characters of `str` that belong to the run. The
 *          character that ends the run is never counted.
 */
NumberReturn parseDecimalDigits(char[] str, SourceLocation loc, MessageHandler messages)
{
    NumberReturn res;
    int i = 0;

    for (; i < str.length; i++)
    {
        char c = str[i];
        if (c >= '0' && c <= '9')
            res.data ~= c;
        else if (c != '_')
            break;
    }

    res.length = i;
    return res;
}

/**
 * Scans a floating point literal and returns its characters with '_'
 * separators removed.
 *
 * Reports a second '.', a '.' after the exponent marker, a second exponent
 * marker and a sign that does not directly follow the exponent marker.
 *
 * Params:
 *   str      = text to scan from its first character
 *   loc      = location of str[0]
 *   messages = receiver of diagnostics
 * Returns: the cleaned-up text and the number of characters consumed. If the
 *          literal ends in a sign or exponent marker, FloatingInvalidEnd is
 *          reported and an empty result is returned.
 */
NumberReturn parseRealNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    int i = 0;
    bool sawDot, sawExponent;
    char[] number;
    NumberReturn num;
    bool end;

    while (!end && i < str.length)
    {
        switch (str[i])
        {
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            case '_':
                // Always consumes at least the current character.
                auto n = parseDecimalDigits(str[i .. $], loc, messages);
                number ~= n.data;
                i += n.length;
                break;

            case '.':
                if (sawExponent)
                    messages.report(FloatingDotInE, loc + i, loc + i + 1);
                else if (sawDot)
                    messages.report(OnlyOneDotFloating, loc + i, loc + i + 1);
                else
                {
                    sawDot = true;
                    number ~= str[i];
                }
                i++;
                break;

            case 'e': case 'E':
                if (sawExponent)
                    messages.report(OnlyOneEFloating, loc + i, loc + i + 1);
                else
                {
                    sawExponent = true;
                    number ~= str[i];
                }
                i++;
                break;

            case '+': case '-':
                // A sign is only accepted directly after the exponent marker.
                if (number.length == 0
                    || (number[$-1] != 'e' && number[$-1] != 'E'))
                    messages.report(FloatingBadLocation, loc + i, loc + i + 1)
                        .arg(str[i]);
                else
                    number ~= str[i];
                i++;
                break;

            default:
                end = true;
        }
    }

    // Nothing collected: there is no last character to inspect.
    if (number.length == 0)
        return num;

    if (number[$-1] == '+' || number[$-1] == '-'
        || number[$-1] == 'e' || number[$-1] == 'E')
    {
        messages.report(FloatingInvalidEnd, loc + i - 1, loc + i);
        return num;
    }

    num.data = number;
    num.length = i;
    return num;
}

/// Debug helper: writes the lexeme, the parsed value and its type to Stdout.
void printNumber(char[] str, Number num)
{
    Stdout(str)(" have become").newline;

    switch (num.type)
    {
        case NumberType.Int:
            Stdout(num.integer)(" of type ")("int");
            break;
        case NumberType.UInt:
            Stdout(num.integer)(" of type ")("uint");
            break;
        case NumberType.Long:
            Stdout(num.integer)(" of type ")("long");
            break;
        case NumberType.ULong:
            Stdout(num.integer)(" of type ")("ulong");
            break;
        case NumberType.Float:
            Stdout(num.floating)(" of type ")("float");
            break;
        case NumberType.Double:
            Stdout(num.floating)(" of type ")("double");
            break;
        case NumberType.Real:
            Stdout(num.floating)(" of type ")("real");
            break;
    }

    Stdout().newline;
}

/**
 * Parses a string literal, dispatching on its first character:
 * 'r' or '`' for wysiwyg strings, '"' for double quoted strings and 'x' for
 * hex strings. Any other first character reports InvalidStrPrefix.
 *
 * Params:
 *   str      = the complete lexeme including prefix and quotes
 *   loc      = location of str[0]
 *   messages = receiver of diagnostics
 * Returns: the decoded string.
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String strBuf;
    // Grow and immediately reset the buffer; presumably this reserves room
    // so that the appends below do not reallocate - TODO confirm.
    strBuf.data.length = str.length;
    strBuf.data.length = 0;

    if (str.length == 0)
    {
        messages.report(InvalidStrPrefix, loc, loc + 1);
        return strBuf;
    }

    switch (str[0])
    {
        case 'r':
            strBuf = parseWysiwygString(str[1 .. $], strBuf);
            break;
        case '`':
            strBuf = parseWysiwygString(str, strBuf);
            break;
        case '"':
            strBuf = parseDoubleQuotedString(str, strBuf, loc, messages);
            break;
        case 'x':
            strBuf = parseHexString(str[1 .. $], strBuf, loc + 1, messages);
            break;
        default:
            messages.report(InvalidStrPrefix, loc, loc + 1);
    }

    // NOTE(review): writes every parsed literal to Stdout; the matching call
    // in parseNumber is commented out - confirm whether this is still wanted.
    printString(str, strBuf);
    return strBuf;
}

/**
 * Decodes the body of a hex string. Every two hex digits produce one byte;
 * '\r', '\n' and ' ' are skipped, any other character reports
 * InvalidHexStrChar.
 *
 * Params:
 *   str      = text starting at the opening '"'
 *   strBuf   = buffer the bytes are appended to
 *   loc      = location of str[0]
 *   messages = receiver of diagnostics
 * Returns: strBuf with the decoded bytes appended. A single left-over hex
 *          digit at the end is dropped without a message.
 */
String parseHexString(char[] str, String strBuf, SourceLocation loc, MessageHandler messages)
{
    int i = 1; // first char is "
    char[] hex = "0123456789abcdefABCDEF";
    char[] whitespace = "\r\n ";
    char[] hexBuf;

    // Also stop at the end of the text so that a missing closing quote
    // cannot run past the array.
    while (i < str.length && str[i] != '"')
    {
        if (hex.contains(str[i]))
        {
            hexBuf ~= str[i];
            if (hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if (!whitespace.contains(str[i]))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);

        i++;
    }

    return strBuf;
}

/**
 * Decodes a double quoted string, expanding escape sequences, and applies
 * the optional postfix: 'w' re-encodes as UTF-16, 'd' as UTF-32, 'c' (or no
 * postfix) leaves the bytes as they are.
 *
 * Params:
 *   str      = text starting at the opening '"'
 *   strBuf   = buffer the bytes are appended to
 *   loc      = location of str[0]
 *   messages = receiver of diagnostics
 * Returns: strBuf with data and type filled in.
 */
String parseDoubleQuotedString(char[] str, String strBuf, SourceLocation loc, MessageHandler messages)
{
    int i = 1; // first char is "

    while (i < str.length && str[i] != '"')
    {
        if (str[i] == '\\')
        {
            // EscapeSequence
            EscapeReturn res = parseEscapeSequence(str[i .. $], loc + i, messages);
            strBuf.data ~= res.data;
            i += res.length;
        }
        else
        {
            strBuf.data ~= cast(ubyte)str[i];
            i++;
        }
    }

    if (str.length > i + 1) // Then we have a postfix. Lexer makes sure this is c, w or d.
        switch (str[i + 1])
        {
            case 'c':
                break;
            case 'w':
                strBuf.data = cast(ubyte[])Utf.toString16(cast(char[])strBuf.data);
                strBuf.type = StringType.WChar;
                break;
            case 'd':
                strBuf.data = cast(ubyte[])Utf.toString32(cast(char[])strBuf.data);
                strBuf.type = StringType.DChar;
                break;
            default:
                break;
        }

    return strBuf;
}

/**
 * Collects the hex digits of a \x, \u or \U escape.
 *
 * Params:
 *   str      = text starting at the backslash
 *   digits   = number of hex digits the escape requires
 *   loc      = location of str[0]
 *   messages = receiver of diagnostics
 * Returns: `data` holds the valid hex digits found, `length` the number of
 *          source characters the escape occupies.
 */
private NumberReturn parseHexEscapeDigits(char[] str, int digits, SourceLocation loc, MessageHandler messages)
{
    char[] hex = "0123456789abcdefABCDEF";
    NumberReturn res;
    int total = digits + 2; // backslash + kind letter + digits

    // One character is left out of the count, as in the original check;
    // presumably that is the closing quote - TODO confirm.
    if (str.length - 1 >= total)
    {
        for (int i = 2; i < total; i++)
        {
            if (hex.contains(str[i]))
                res.data ~= str[i];
            else
                messages.report(StringHexInvalid, loc + i, loc + i + 1)
                    .arg(Integer.toString(i - 1))
                    .arg(Integer.toString(digits));
        }
        res.length = total;
    }
    else
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = str.length - 1;
    }

    return res;
}

/**
 * Decodes one escape sequence.
 *
 * Handles the single character escapes, \xHH, \uHHHH, \UHHHHHHHH, octal
 * escapes of one to three digits and named character entities (\&name;).
 *
 * Params:
 *   str      = text starting at the backslash
 *   loc      = location of the backslash
 *   messages = receiver of diagnostics
 * Returns: the bytes the escape stands for and the number of source
 *          characters it occupies.
 */
EscapeReturn parseEscapeSequence(char[] str, SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;

    // A lone backslash has no escape character to dispatch on.
    if (str.length < 2)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = 1; // always make progress in the caller's loop
        return res;
    }

    switch (str[1])
    {
        case '\'': res.length = 2; res.data ~= '\''; break;
        case '"':  res.length = 2; res.data ~= '\"'; break;
        case '?':  res.length = 2; res.data ~= '\?'; break;
        case '\\': res.length = 2; res.data ~= '\\'; break;
        case 'a':  res.length = 2; res.data ~= '\a'; break;
        case 'b':  res.length = 2; res.data ~= '\b'; break;
        case 'f':  res.length = 2; res.data ~= '\f'; break;
        case 'n':  res.length = 2; res.data ~= '\n'; break;
        case 'r':  res.length = 2; res.data ~= '\r'; break;
        case 't':  res.length = 2; res.data ~= '\t'; break;
        case 'v':  res.length = 2; res.data ~= '\v'; break;

        case 'x':
        {
            auto hexDigits = parseHexEscapeDigits(str, 2, loc, messages);
            res.length = hexDigits.length;
            res.data ~= cast(ubyte)Integer.toInt(hexDigits.data, 16);
            break;
        }

        case 'u':
        {
            auto hexDigits = parseHexEscapeDigits(str, 4, loc, messages);
            res.length = hexDigits.length;
            uint code = cast(uint)Integer.toLong(hexDigits.data, 16);
            if (!isValidUtf8(code))
                messages.report(InvalidUtf8Hex, loc, loc + 6);
            else
                res.data ~= parseToUtf8(code);
            break;
        }

        case 'U':
        {
            auto hexDigits = parseHexEscapeDigits(str, 8, loc, messages);
            res.length = hexDigits.length;
            uint code = cast(uint)Integer.toLong(hexDigits.data, 16);
            if (!isValidUtf8(code))
                messages.report(InvalidUtf8Hex, loc, loc + 10);
            else
                res.data ~= parseToUtf8(code);
            break;
        }

        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        {
            char[] oct = "01234567";
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;

            // Up to two more octal digits may follow.
            for (int i = 2; i < 4 && i < str.length; i++)
            {
                if (!oct.contains(str[i]))
                    break;
                octBuf ~= str[i];
                res.length += 1;
            }

            // Only the low eight bits of the value are kept.
            res.data ~= cast(ubyte)Integer.toLong(octBuf, 8);
            break;
        }

        case '&':
        {
            int i = 2;
            char[] name;
            bool closed = false;

            while (i < str.length && str[i] != '"')
            {
                if (str[i] == ';')
                {
                    closed = true;
                    break;
                }
                name ~= str[i];
                i++;
            }

            if (!closed)
            {
                // Consume only "\&" so the caller still sees the closing
                // quote; the entity is not looked up.
                messages.report(NoCharEntityEnd, loc + i, loc + i + 1);
                res.length = 2;
                break;
            }

            auto entity = name in characterEntities;
            if (entity !is null)
                res.data ~= parseToUtf8(*entity);
            else
                messages.report(InvalidCharEntity, loc + 2, loc + i);

            res.length = i + 1; // remember the ;
            break;
        }

        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length += 2;
    }

    return res;
}

/**
 * Copies the characters between the delimiter str[0] and its next
 * occurrence into strBuf without interpreting escapes.
 *
 * Params:
 *   str    = text starting at the opening delimiter
 *   strBuf = buffer the bytes are appended to
 * Returns: strBuf with the bytes appended. Copying also stops at the end of
 *          str if the closing delimiter is missing.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    if (str.length == 0)
        return strBuf;

    char start = str[0];
    for (int i = 1; i < str.length && str[i] != start; i++)
        strBuf.data ~= cast(ubyte)str[i];

    return strBuf;
}

/**
 * Encodes a code point as UTF-8.
 *
 * Params:
 *   i = code point, expected to be at most 0x10FFFF
 * Returns: one to four bytes, or null if `i` is above 0x10FFFF.
 */
ubyte[] parseToUtf8(uint i)
{
    if (i <= 0x00007F)
        return [cast(ubyte)i];
    else if (i <= 0x0007FF)
    {
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        ubyte b = cast(ubyte)(0xC0 | (i >> 6));
        return [b, a];
    }
    else if (i <= 0x00FFFF)
    {
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        ubyte b = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        ubyte c = cast(ubyte)(0xE0 | (i >> 12));
        return [c, b, a];
    }
    else if (i <= 0x10FFFF)
    {
        ubyte a = cast(ubyte)(0x80 | (i & 0x3F));
        ubyte b = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        ubyte c = cast(ubyte)(0x80 | ((i >> 12) & 0x3F));
        ubyte d = cast(ubyte)(0xF0 | (i >> 18));
        return [d, c, b, a];
    }

    return null;
}

/**
 * Tells whether `i` can be encoded by parseToUtf8: it must not exceed
 * 0x10FFFF and must not be a surrogate (0xD800 - 0xDFFF).
 */
bool isValidUtf8(uint i)
{
    if (i >= 0xD800 && i <= 0xDFFF)
        return false;
    return i <= 0x10FFFF;
}

/// Debug helper: writes the lexeme and the decoded string to Stdout.
void printString(char[] str, String strBuf)
{
    switch (strBuf.type)
    {
        case StringType.Char:
            Stdout(str)(" have become").newline()
                (cast(char[])strBuf.data).newline;
            break;
        case StringType.WChar:
            Stdout(str)(" have become").newline()
                (cast(wchar[])strBuf.data).newline;
            break;
        case StringType.DChar:
            Stdout(str)(" have become").newline()
                (cast(dchar[])strBuf.data).newline;
            break;
    }
}

/// Named character entities usable as \&name; mapped to their code points.
static ushort[char[]] characterEntities;

static this()
{
    characterEntities = [
        "quot"[]: 34, "amp": 38, "lt": 60, "gt": 62,
        "OElig": 338, "oelig": 339, "Scaron": 352, "scaron": 353,
        "Yuml": 376, "circ": 710, "tilde": 732,
        "ensp": 8194, "emsp": 8195, "thinsp": 8201,
        "zwnj": 8204, "zwj": 8205, "lrm": 8206, "rlm": 8207,
        "ndash": 8211, "mdash": 8212,
        "lsquo": 8216, "rsquo": 8217, "sbquo": 8218,
        "ldquo": 8220, "rdquo": 8221, "bdquo": 8222,
        "dagger": 8224, "Dagger": 8225, "permil": 8240,
        "lsaquo": 8249, "rsaquo": 8250, "euro": 8364,

        "nbsp": 160, "iexcl": 161, "cent": 162, "pound": 163,
        "curren": 164, "yen": 165, "brvbar": 166, "sect": 167,
        "uml": 168, "copy": 169, "ordf": 170, "laquo": 171,
        "not": 172, "shy": 173, "reg": 174, "macr": 175,
        "deg": 176, "plusmn": 177, "sup2": 178, "sup3": 179,
        "acute": 180, "micro": 181, "para": 182, "middot": 183,
        "cedil": 184, "sup1": 185, "ordm": 186, "raquo": 187,
        "frac14": 188, "frac12": 189, "frac34": 190, "iquest": 191,
        "Agrave": 192, "Aacute": 193, "Acirc": 194, "Atilde": 195,
        "Auml": 196, "Aring": 197, "AElig": 198, "Ccedil": 199,
        "Egrave": 200, "Eacute": 201, "Ecirc": 202, "Euml": 203,
        "Igrave": 204, "Iacute": 205, "Icirc": 206, "Iuml": 207,
        "ETH": 208, "Ntilde": 209, "Ograve": 210, "Oacute": 211,
        "Ocirc": 212, "Otilde": 213, "Ouml": 214, "times": 215,
        "Oslash": 216, "Ugrave": 217, "Uacute": 218, "Ucirc": 219,
        "Uuml": 220, "Yacute": 221, "THORN": 222, "szlig": 223,
        "agrave": 224, "aacute": 225, "acirc": 226, "atilde": 227,
        "auml": 228, "aring": 229, "aelig": 230, "ccedil": 231,
        "egrave": 232, "eacute": 233, "ecirc": 234, "euml": 235,
        "igrave": 236, "iacute": 237, "icirc": 238, "iuml": 239,
        "eth": 240, "ntilde": 241, "ograve": 242, "oacute": 243,
        "ocirc": 244, "otilde": 245, "ouml": 246, "divide": 247,
        "oslash": 248, "ugrave": 249, "uacute": 250, "ucirc": 251,
        "uuml": 252, "yacute": 253, "thorn": 254, "yuml": 255,

        "fnof": 402,
        "Alpha": 913, "Beta": 914, "Gamma": 915, "Delta": 916,
        "Epsilon": 917, "Zeta": 918, "Eta": 919, "Theta": 920,
        "Iota": 921, "Kappa": 922, "Lambda": 923, "Mu": 924,
        "Nu": 925, "Xi": 926, "Omicron": 927, "Pi": 928,
        "Rho": 929, "Sigma": 931, "Tau": 932, "Upsilon": 933,
        "Phi": 934, "Chi": 935, "Psi": 936, "Omega": 937,
        "alpha": 945, "beta": 946, "gamma": 947, "delta": 948,
        "epsilon": 949, "zeta": 950, "eta": 951, "theta": 952,
        "iota": 953, "kappa": 954, "lambda": 955, "mu": 956,
        "nu": 957, "xi": 958, "omicron": 959, "pi": 960,
        "rho": 961, "sigmaf": 962, "sigma": 963, "tau": 964,
        "upsilon": 965, "phi": 966, "chi": 967, "psi": 968,
        "omega": 969, "thetasym": 977, "upsih": 978, "piv": 982,

        "bull": 8226, "hellip": 8230, "prime": 8242, "Prime": 8243,
        "oline": 8254, "frasl": 8260, "weierp": 8472, "image": 8465,
        "real": 8476, "trade": 8482, "alefsym": 8501,
        "larr": 8592, "uarr": 8593, "rarr": 8594, "darr": 8595,
        "harr": 8596, "crarr": 8629,
        "lArr": 8656, "uArr": 8657, "rArr": 8658, "dArr": 8659,
        "hArr": 8660,
        "forall": 8704, "part": 8706, "exist": 8707, "empty": 8709,
        "nabla": 8711, "isin": 8712, "notin": 8713, "ni": 8715,
        "prod": 8719, "sum": 8721, "minus": 8722, "lowast": 8727,
        "radic": 8730, "prop": 8733, "infin": 8734, "ang": 8736,
        "and": 8743, "or": 8744, "cap": 8745, "cup": 8746,
        "int": 8747, "there4": 8756, "sim": 8764, "cong": 8773,
        "asymp": 8776, "ne": 8800, "equiv": 8801, "le": 8804,
        "ge": 8805, "sub": 8834, "sup": 8835, "nsub": 8836,
        "sube": 8838, "supe": 8839, "oplus": 8853, "otimes": 8855,
        "perp": 8869, "sdot": 8901,
        "lceil": 8968, "rceil": 8969, "lfloor": 8970, "rfloor": 8971,
        "lang": 9001, "rang": 9002, "loz": 9674,
        "spades": 9824, "clubs": 9827, "hearts": 9829, "diams": 9830
    ];
}