Mercurial > projects > dang
diff src/basic/LiteralParsing.d @ 207:e0551773a005
Added the correct version.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Tue, 12 Aug 2008 18:19:34 +0200 |
parents | |
children |
line wrap: on
line diff
/**
 * Parsing of numeric and string literals into their in-memory form.
 *
 * The entry points are parseNumber and parseString. Both take the literal
 * text, the source location of its first character and a message handler
 * that receives diagnostics.
 */
module basic.LiteralParsing;

import basic.SourceLocation,
       basic.Message,
       basic.conv;

import tango.io.Stdout,
       tango.core.BitManip,
       Integer = tango.text.convert.Integer,
       Utf = tango.text.convert.Utf,
       tango.text.Util;

/// Code-unit width of a parsed string literal.
enum StringType
{
    Char,
    WChar,
    DChar
}

/// Type assigned to a parsed numeric literal.
enum NumberType
{
    Int,
    UInt,
    Long,
    ULong,
    Float,
    Double,
    Real
}

/// A parsed string literal: raw bytes plus the code-unit type they hold.
struct String
{
    StringType type;
    ubyte[] data;
}

/// A parsed numeric literal; `integer` or `floating` is filled in
/// depending on `type`.
struct Number
{
    NumberType type;
    ulong integer;
    real floating;
}

/// Result of decoding one escape sequence: the produced bytes and the
/// number of source characters consumed.
private struct EscapeReturn
{
    ubyte[] data;
    int length;
}

/// Result of scanning a number: the cleaned digit text and the number of
/// source characters consumed.
private struct NumberReturn
{
    char[] data;
    int length;
}

/// True for '0'..'9'.
private bool isDecimalDigit(char c)
{
    return c >= '0' && c <= '9';
}

/// True for '0'..'7'.
private bool isOctalDigit(char c)
{
    return c >= '0' && c <= '7';
}

/// True for '0'..'9', 'a'..'f' and 'A'..'F'.
private bool isHexDigit(char c)
{
    return isDecimalDigit(c)
        || (c >= 'a' && c <= 'f')
        || (c >= 'A' && c <= 'F');
}

/// True for the whitespace characters tolerated inside a hex string.
private bool isHexStringSpace(char c)
{
    return c == ' ' || c == '\t' || c == '\r' || c == '\n'
        || c == '\v' || c == '\f';
}

/**
 * Parse a numeric literal.
 *
 * A literal containing '.', 'e' or 'E' is handled as a floating point
 * number, anything else as a decimal integer.
 *
 * Params:
 *   str      = literal text; must start with a decimal digit
 *   loc      = location of str[0], used for diagnostics
 *   messages = receiver of diagnostics
 *
 * Returns: the parsed Number. On error a diagnostic is reported and the
 * value fields keep their fallback values (0 / real.init).
 */
Number parseNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    Number num;

    if (str.length == 0 || !isDecimalDigit(str[0]))
    {
        messages.report(InvalidStartInteger, loc, loc+1);
        return num;
    }

    bool looksReal = str.contains('.') || str.contains('e') || str.contains('E');
    if (looksReal)
    {
        auto n = parseRealNumber(str, loc, messages);

        try
        {
            num.floating = toReal(n.data);
            num.type = NumberType.Double;
        }
        catch (Exception e)
        {
            num.floating = real.init;
            messages.report(FloatingToLarge, loc, loc + n.length - 1);
        }

        if (num.floating > double.max)
            num.type = NumberType.Real;
    }
    else
    {
        auto n = parseDecimalDigits(str, loc, messages);

        try
        {
            num.integer = toUlong(n.data);
        }
        catch (Exception e)
        {
            num.integer = 0;
            messages.report(IntegerToLarge, loc, loc + n.length - 1);
        }

        // NOTE(review): values in (int.max, uint.max] keep the default type
        // Int here; confirm whether the threshold should be int.max.
        if (num.integer > uint.max)
            num.type = NumberType.Long;
        if (num.integer > long.max)
            num.type = NumberType.ULong;
    }

    // Debug aid: printNumber(str, num);
    return num;
}

/**
 * Scan a run of decimal digits, skipping '_' separators.
 *
 * Scanning stops at the first character that is neither a digit nor '_',
 * or at the end of str.
 *
 * Returns: the digits without separators and the number of characters
 * consumed. The terminating character is never counted, regardless of
 * whether it is the last character of str.
 */
NumberReturn parseDecimalDigits(char[] str, SourceLocation loc, MessageHandler messages)
{
    NumberReturn res;
    int i = 0;

    for (; i < str.length; i++)
    {
        char c = str[i];
        if (isDecimalDigit(c))
            res.data ~= c;
        else if (c != '_')
            break;
    }

    res.length = i;
    return res;
}

/**
 * Scan a floating point literal (digits, one '.', one exponent marker and
 * an optional sign directly after the exponent marker).
 *
 * Misplaced dots, exponent markers and signs are reported and left out of
 * the returned text. Scanning stops at the first character that cannot be
 * part of a floating point literal.
 *
 * Returns: the cleaned literal text and the number of characters consumed.
 * If the literal ends in a sign or exponent marker, FloatingInvalidEnd is
 * reported and the returned text is empty.
 */
NumberReturn parseRealNumber(char[] str, SourceLocation loc, MessageHandler messages)
{
    NumberReturn num;
    char[] number;
    bool seenDot, seenExp;
    int i = 0;

    while (i < str.length)
    {
        char c = str[i];

        if (isDecimalDigit(c) || c == '_')
        {
            // Consumes at least one character, so the loop always advances.
            auto n = parseDecimalDigits(str[i..$], loc, messages);
            number ~= n.data;
            i += n.length;
        }
        else if (c == '.')
        {
            if (seenExp)
                messages.report(FloatingDotInE, loc + i, loc + i + 1);
            else if (seenDot)
                messages.report(OnlyOneDotFloating, loc + i, loc + i + 1);
            else
            {
                seenDot = true;
                number ~= c;
            }
            i++;
        }
        else if (c == 'e' || c == 'E')
        {
            if (seenExp)
                messages.report(OnlyOneEFloating, loc + i, loc + i + 1);
            else
            {
                seenExp = true;
                number ~= c;
            }
            i++;
        }
        else if (c == '+' || c == '-')
        {
            // A sign is only legal directly after the exponent marker.
            bool afterExp = number.length > 0
                && (number[$-1] == 'e' || number[$-1] == 'E');
            if (afterExp)
                number ~= c;
            else
                messages.report(FloatingBadLocation, loc + i, loc + i + 1)
                    .arg(c);
            i++;
        }
        else
            break;
    }

    // Report the consumed length even on failure so callers can build a
    // sensible diagnostic range.
    num.length = i;

    if (number.length > 0)
    {
        char last = number[$-1];
        if (last == '+' || last == '-' || last == 'e' || last == 'E')
        {
            messages.report(FloatingInvalidEnd, loc + i - 1, loc + i);
            return num;
        }
    }

    num.data = number;
    return num;
}

/// Debug helper: print the literal text followed by its parsed value and type.
void printNumber(char[] str, Number num)
{
    Stdout(str)(" have become").newline;

    switch (num.type)
    {
        case NumberType.Int:
            Stdout(num.integer)(" of type ")("int");
            break;
        case NumberType.UInt:
            Stdout(num.integer)(" of type ")("uint");
            break;
        case NumberType.Long:
            Stdout(num.integer)(" of type ")("long");
            break;
        case NumberType.ULong:
            Stdout(num.integer)(" of type ")("ulong");
            break;
        case NumberType.Float:
            Stdout(num.floating)(" of type ")("float");
            break;
        case NumberType.Double:
            Stdout(num.floating)(" of type ")("double");
            break;
        case NumberType.Real:
            Stdout(num.floating)(" of type ")("real");
            break;
        default:
            break;
    }

    Stdout().newline;
}

/**
 * Parse a string literal, dispatching on its first character:
 * 'r' or '`' for wysiwyg strings, '"' for double quoted strings and 'x'
 * for hex strings. Any other prefix is reported as InvalidStrPrefix.
 *
 * Params:
 *   str      = literal text including prefix and delimiters
 *   loc      = location of str[0], used for diagnostics
 *   messages = receiver of diagnostics
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String strBuf;
    // Grow then truncate the buffer before appending to it.
    strBuf.data.length = str.length;
    strBuf.data.length = 0;

    if (str.length == 0)
    {
        messages.report(InvalidStrPrefix, loc, loc + 1);
        return strBuf;
    }

    switch (str[0])
    {
        case 'r':
            strBuf = parseWysiwygString(str[1..$], strBuf);
            break;
        case '`':
            strBuf = parseWysiwygString(str, strBuf);
            break;
        case '"':
            strBuf = parseDoubleQuotedString(str, strBuf, loc, messages);
            break;
        case 'x':
            strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages);
            break;
        default:
            messages.report(InvalidStrPrefix, loc, loc + 1);
    }

    // Debug aid: printString(str, strBuf);
    return strBuf;
}

/**
 * Decode the body of a hex string. str starts at the opening '"'.
 *
 * Every pair of hex digits becomes one byte; whitespace between digits is
 * skipped; any other character is reported as InvalidHexStrChar. Scanning
 * stops at the closing '"' or at the end of str.
 */
String parseHexString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    char[] hexBuf;

    // Index 0 is the opening quote.
    for (int i = 1; i < str.length && str[i] != '"'; i++)
    {
        char c = str[i];

        if (isHexDigit(c))
        {
            hexBuf ~= c;
            if (hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if (!isHexStringSpace(c))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
    }

    // NOTE(review): a dangling single hex digit is dropped silently here;
    // consider reporting it once a suitable message exists.
    return strBuf;
}

/**
 * Convert strBuf according to the postfix character following the closing
 * delimiter at index `closing` of str, if there is one.
 *
 * 'w' converts the data to UTF-16, 'd' to UTF-32; anything else leaves the
 * buffer untouched.
 */
private String applyPostfix(String strBuf, char[] str, int closing)
{
    if (str.length > closing + 1)
    {
        switch (str[closing + 1])
        {
            case 'w':
                strBuf.data = cast(ubyte[])Utf.toString16(cast(char[])strBuf.data);
                strBuf.type = StringType.WChar;
                break;
            case 'd':
                strBuf.data = cast(ubyte[])Utf.toString32(cast(char[])strBuf.data);
                strBuf.type = StringType.DChar;
                break;
            default: // 'c' (or anything else): data stays as it is
                break;
        }
    }
    return strBuf;
}

/**
 * Decode a double quoted string. str starts at the opening '"'.
 *
 * Escape sequences are expanded through parseEscapeSequence, all other
 * characters are copied. A postfix after the closing quote selects the
 * resulting string type.
 */
String parseDoubleQuotedString(char[] str, String strBuf,
        SourceLocation loc, MessageHandler messages)
{
    int i = 1; // first char is "

    while (i < str.length && str[i] != '"')
    {
        if (str[i] == '\\')
        {
            EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages);
            strBuf.data ~= res.data;
            i += res.length;
        }
        else
        {
            strBuf.data ~= cast(ubyte)str[i];
            i++;
        }
    }

    // Then we have a postfix. Lexer makes sure this is c, w or d.
    return applyPostfix(strBuf, str, i);
}

/**
 * Collect the `digits` hex digits of a \x, \u or \U escape.
 *
 * str starts at the backslash, so the digits sit at index 2 onward. The
 * escape is only read when at least one more character follows it;
 * otherwise StringShortEscape is reported.
 *
 * Params:
 *   consumed = receives the number of source characters used
 *
 * Returns: the valid hex digits found. Fewer than `digits` characters
 * means a diagnostic has been reported.
 */
private char[] scanHexDigits(char[] str, int digits, SourceLocation loc,
        MessageHandler messages, out int consumed)
{
    char[] hexBuf;
    int total = digits + 2; // backslash + kind letter + digits

    if (str.length >= total + 1)
    {
        for (int i = 2; i < total; i++)
        {
            if (isHexDigit(str[i]))
                hexBuf ~= str[i];
            else
                messages.report(StringHexInvalid, loc + i, loc + i + 1)
                    .arg(Integer.toString(i-1))
                    .arg(Integer.toString(digits));
        }
        consumed = total;
    }
    else
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        consumed = str.length - 1;
    }

    return hexBuf;
}

/**
 * Decode a \u or \U escape with `digits` hex digits into UTF-8 bytes.
 * Values rejected by isValidUtf8 are reported as InvalidUtf8Hex.
 */
private EscapeReturn parseUnicodeEscape(char[] str, int digits,
        SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;
    char[] hexBuf = scanHexDigits(str, digits, loc, messages, res.length);

    // Incomplete digits were already reported; produce no data for them.
    if (hexBuf.length == digits)
    {
        uint codePoint = cast(uint)Integer.toLong(hexBuf, 16);
        if (!isValidUtf8(codePoint))
            messages.report(InvalidUtf8Hex, loc, loc + digits + 2);
        else
            res.data ~= parseToUtf8(codePoint);
    }

    return res;
}

/**
 * Decode one escape sequence. str starts at the backslash and runs to the
 * end of the literal.
 *
 * Returns: the bytes produced by the escape and the number of source
 * characters it occupies (always at least 1, so callers make progress).
 */
EscapeReturn parseEscapeSequence(char[] str,
        SourceLocation loc, MessageHandler messages)
{
    EscapeReturn res;

    if (str.length < 2)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = 1;
        return res;
    }

    switch (str[1])
    {
        case '\'':
            res.data ~= cast(ubyte)'\'';
            res.length = 2;
            break;
        case '"':
            res.data ~= cast(ubyte)'\"';
            res.length = 2;
            break;
        case '?':
            res.data ~= cast(ubyte)'\?';
            res.length = 2;
            break;
        case '\\':
            res.data ~= cast(ubyte)'\\';
            res.length = 2;
            break;
        case 'a':
            res.data ~= cast(ubyte)'\a';
            res.length = 2;
            break;
        case 'b':
            res.data ~= cast(ubyte)'\b';
            res.length = 2;
            break;
        case 'f':
            res.data ~= cast(ubyte)'\f';
            res.length = 2;
            break;
        case 'n':
            res.data ~= cast(ubyte)'\n';
            res.length = 2;
            break;
        case 'r':
            res.data ~= cast(ubyte)'\r';
            res.length = 2;
            break;
        case 't':
            res.data ~= cast(ubyte)'\t';
            res.length = 2;
            break;
        case 'v':
            res.data ~= cast(ubyte)'\v';
            res.length = 2;
            break;
        case 'x':
        {
            char[] hexBuf = scanHexDigits(str, 2, loc, messages, res.length);
            if (hexBuf.length == 2)
                res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
            break;
        }
        case 'u':
            res = parseUnicodeEscape(str, 4, loc, messages);
            break;
        case 'U':
            res = parseUnicodeEscape(str, 8, loc, messages);
            break;
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        {
            // One to three octal digits.
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;
            for (int i = 2; i < 4 && i < str.length && isOctalDigit(str[i]); i++)
            {
                octBuf ~= str[i];
                res.length += 1;
            }

            // NOTE(review): values above \377 are truncated to one byte.
            uint value = cast(uint)Integer.toLong(octBuf, 8);
            res.data ~= cast(ubyte)value;
            break;
        }
        case '&':
        {
            // Named character entity: \&name;
            int i = 2;
            char[] name;
            while (i < str.length && str[i] != ';' && str[i] != '"')
            {
                name ~= str[i];
                i++;
            }

            if (i >= str.length || str[i] != ';')
            {
                // No terminating ';' before the end of the literal: consume
                // only "\&" so the closing quote is left for the caller.
                messages.report(NoCharEntityEnd, loc+i, loc+i+1);
                res.length = 2;
                break;
            }

            ushort* code = name in characterEntities;
            if (code !is null)
                res.data ~= parseToUtf8(*code);
            else
                messages.report(InvalidCharEntity, loc + 2, loc + i);

            res.length = i + 1; // remember the ;
            break;
        }
        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length = 2;
    }

    return res;
}

/**
 * Decode a wysiwyg string. str[0] is the opening delimiter; characters are
 * copied verbatim up to the matching delimiter or the end of str. A
 * postfix after the closing delimiter selects the resulting string type.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    if (str.length == 0)
        return strBuf;

    char delimiter = str[0];
    int i = 1;

    while (i < str.length && str[i] != delimiter)
    {
        strBuf.data ~= cast(ubyte)str[i];
        i++;
    }

    return applyPostfix(strBuf, str, i);
}

/**
 * Encode a code point as UTF-8.
 *
 * Returns: 1 to 4 bytes for values up to 0x10FFFF, null for anything
 * larger.
 */
ubyte[] parseToUtf8(uint i)
{
    ubyte[] bytes;

    if (i <= 0x00007F)
    {
        bytes = new ubyte[1];
        bytes[0] = cast(ubyte)i;
    }
    else if (i <= 0x0007FF)
    {
        bytes = new ubyte[2];
        bytes[0] = cast(ubyte)(0xC0 | (i >> 6));
        bytes[1] = cast(ubyte)(0x80 | (i & 0x3F));
    }
    else if (i <= 0x00FFFF)
    {
        bytes = new ubyte[3];
        bytes[0] = cast(ubyte)(0xE0 | (i >> 12));
        bytes[1] = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        bytes[2] = cast(ubyte)(0x80 | (i & 0x3F));
    }
    else if (i <= 0x10FFFF)
    {
        bytes = new ubyte[4];
        bytes[0] = cast(ubyte)(0xF0 | (i >> 18));
        bytes[1] = cast(ubyte)(0x80 | ((i >> 12) & 0x3F));
        bytes[2] = cast(ubyte)(0x80 | ((i >> 6) & 0x3F));
        bytes[3] = cast(ubyte)(0x80 | (i & 0x3F));
    }

    return bytes;
}

/**
 * Check whether a value can be encoded as UTF-8: it must not exceed
 * 0x10FFFF and must not lie in the surrogate range 0xD800..0xDFFF.
 */
bool isValidUtf8(uint i)
{
    if (i > 0x10FFFF)
        return false;
    if (i >= 0xD800 && i <= 0xDFFF)
        return false;
    return true;
}

/// Debug helper: print the literal text followed by its decoded contents.
void printString(char[] str, String strBuf)
{
    switch (strBuf.type)
    {
        case StringType.Char:
            Stdout(str)(" have become").newline()
                (cast(char[])strBuf.data).newline;
            break;
        case StringType.WChar:
            Stdout(str)(" have become").newline()
                (cast(wchar[])strBuf.data).newline;
            break;
        case StringType.DChar:
            Stdout(str)(" have become").newline()
                (cast(dchar[])strBuf.data).newline;
            break;
        default:
            break;
    }
}

/// Named character entities usable as \&name; mapped to their code points.
static ushort[char[]] characterEntities;

static this()
{
    characterEntities =
    [
        "quot"[]: 34, "amp": 38, "lt": 60, "gt": 62,
        "OElig": 338, "oelig": 339, "Scaron": 352, "scaron": 353,
        "Yuml": 376, "circ": 710, "tilde": 732,
        "ensp": 8194, "emsp": 8195, "thinsp": 8201,
        "zwnj": 8204, "zwj": 8205, "lrm": 8206, "rlm": 8207,
        "ndash": 8211, "mdash": 8212,
        "lsquo": 8216, "rsquo": 8217, "sbquo": 8218,
        "ldquo": 8220, "rdquo": 8221, "bdquo": 8222,
        "dagger": 8224, "Dagger": 8225, "permil": 8240,
        "lsaquo": 8249, "rsaquo": 8250, "euro": 8364,
        "nbsp": 160, "iexcl": 161, "cent": 162, "pound": 163,
        "curren": 164, "yen": 165, "brvbar": 166, "sect": 167,
        "uml": 168, "copy": 169, "ordf": 170, "laquo": 171,
        "not": 172, "shy": 173, "reg": 174, "macr": 175,
        "deg": 176, "plusmn": 177, "sup2": 178, "sup3": 179,
        "acute": 180, "micro": 181, "para": 182, "middot": 183,
        "cedil": 184, "sup1": 185, "ordm": 186, "raquo": 187,
        "frac14": 188, "frac12": 189, "frac34": 190, "iquest": 191,
        "Agrave": 192, "Aacute": 193, "Acirc": 194, "Atilde": 195,
        "Auml": 196, "Aring": 197, "AElig": 198, "Ccedil": 199,
        "Egrave": 200, "Eacute": 201, "Ecirc": 202, "Euml": 203,
        "Igrave": 204, "Iacute": 205, "Icirc": 206, "Iuml": 207,
        "ETH": 208, "Ntilde": 209, "Ograve": 210, "Oacute": 211,
        "Ocirc": 212, "Otilde": 213, "Ouml": 214, "times": 215,
        "Oslash": 216, "Ugrave": 217, "Uacute": 218, "Ucirc": 219,
        "Uuml": 220, "Yacute": 221, "THORN": 222, "szlig": 223,
        "agrave": 224, "aacute": 225, "acirc": 226, "atilde": 227,
        "auml": 228, "aring": 229, "aelig": 230, "ccedil": 231,
        "egrave": 232, "eacute": 233, "ecirc": 234, "euml": 235,
        "igrave": 236, "iacute": 237, "icirc": 238, "iuml": 239,
        "eth": 240, "ntilde": 241, "ograve": 242, "oacute": 243,
        "ocirc": 244, "otilde": 245, "ouml": 246, "divide": 247,
        "oslash": 248, "ugrave": 249, "uacute": 250, "ucirc": 251,
        "uuml": 252, "yacute": 253, "thorn": 254, "yuml": 255,
        "fnof": 402,
        "Alpha": 913, "Beta": 914, "Gamma": 915, "Delta": 916,
        "Epsilon": 917, "Zeta": 918, "Eta": 919, "Theta": 920,
        "Iota": 921, "Kappa": 922, "Lambda": 923, "Mu": 924,
        "Nu": 925, "Xi": 926, "Omicron": 927, "Pi": 928,
        "Rho": 929, "Sigma": 931, "Tau": 932, "Upsilon": 933,
        "Phi": 934, "Chi": 935, "Psi": 936, "Omega": 937,
        "alpha": 945, "beta": 946, "gamma": 947, "delta": 948,
        "epsilon": 949, "zeta": 950, "eta": 951, "theta": 952,
        "iota": 953, "kappa": 954, "lambda": 955, "mu": 956,
        "nu": 957, "xi": 958, "omicron": 959, "pi": 960,
        "rho": 961, "sigmaf": 962, "sigma": 963, "tau": 964,
        "upsilon": 965, "phi": 966, "chi": 967, "psi": 968,
        "omega": 969, "thetasym": 977, "upsih": 978, "piv": 982,
        "bull": 8226, "hellip": 8230, "prime": 8242, "Prime": 8243,
        "oline": 8254, "frasl": 8260, "weierp": 8472, "image": 8465,
        "real": 8476, "trade": 8482, "alefsym": 8501,
        "larr": 8592, "uarr": 8593, "rarr": 8594, "darr": 8595,
        "harr": 8596, "crarr": 8629,
        "lArr": 8656, "uArr": 8657, "rArr": 8658, "dArr": 8659,
        "hArr": 8660,
        "forall": 8704, "part": 8706, "exist": 8707, "empty": 8709,
        "nabla": 8711, "isin": 8712, "notin": 8713, "ni": 8715,
        "prod": 8719, "sum": 8721, "minus": 8722, "lowast": 8727,
        "radic": 8730, "prop": 8733, "infin": 8734, "ang": 8736,
        "and": 8743, "or": 8744, "cap": 8745, "cup": 8746,
        "int": 8747, "there4": 8756, "sim": 8764, "cong": 8773,
        "asymp": 8776, "ne": 8800, "equiv": 8801, "le": 8804,
        "ge": 8805, "sub": 8834, "sup": 8835, "nsub": 8836,
        "sube": 8838, "supe": 8839, "oplus": 8853, "otimes": 8855,
        "perp": 8869, "sdot": 8901,
        "lceil": 8968, "rceil": 8969, "lfloor": 8970, "rfloor": 8971,
        "lang": 9001, "rang": 9002, "loz": 9674,
        "spades": 9824, "clubs": 9827, "hearts": 9829, "diams": 9830
    ];
}