diff src/basic/LiteralParsing.d @ 207:e0551773a005

Added the correct version.
author Anders Johnsen <skabet@gmail.com>
date Tue, 12 Aug 2008 18:19:34 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/basic/LiteralParsing.d	Tue Aug 12 18:19:34 2008 +0200
@@ -0,0 +1,900 @@
+module basic.LiteralParsing;
+
+import basic.SourceLocation,
+       basic.Message,
+       basic.conv;
+
+import tango.io.Stdout,
+       tango.core.BitManip,
+       Integer = tango.text.convert.Integer,
+       Utf = tango.text.convert.Utf,
+       tango.text.Util;
+
+enum StringType // Code-unit width recorded in String.type for a parsed string literal.
+{
+    Char,  // First member, so the default; also used for the 'c' postfix or no postfix.
+    WChar, // Set by parseDoubleQuotedString for the 'w' postfix (data converted with Utf.toString16).
+    DChar  // Set by parseDoubleQuotedString for the 'd' postfix (data converted with Utf.toString32).
+}
+
+enum NumberType // Classification recorded in Number.type by parseNumber.
+{
+    Int,    // First member, so the default for integer literals.
+    UInt,   // Not assigned anywhere in this file; only printed by printNumber.
+    Long,   // Chosen by parseNumber for integer literals too large for the default type.
+    ULong,  // Chosen by parseNumber when the value exceeds long.max.
+    Float,  // Not assigned anywhere in this file; only printed by printNumber.
+    Double, // Chosen by parseNumber for literals containing '.', 'e' or 'E'.
+    Real    // Chosen by parseNumber when the floating value exceeds double.max.
+}
+
+struct String // Result of parseString: the decoded contents of a string literal.
+{
+    StringType type; // Code-unit width of `data`; stays Char unless a 'w'/'d' postfix was handled.
+    ubyte[] data;    // Raw bytes of the decoded string; reinterpreted as wchar[]/dchar[] for WChar/DChar.
+}
+
+struct Number // Result of parseNumber: the value and classification of a numeric literal.
+{
+    NumberType type; // Which of the two value fields below is meaningful, and at what width.
+    ulong integer;   // Value written by parseNumber for integer literals (0 after a conversion error).
+    real  floating;  // Value written by parseNumber for floating literals (real.init after a conversion error).
+}
+
+private struct EscapeReturn // Result of parseEscapeSequence.
+{
+    ubyte[] data; // Bytes the escape sequence decodes to; appended to the string buffer by the caller.
+    int length;   // Number of source characters the escape occupied; the caller advances its index by this.
+}
+
+private struct NumberReturn // Result of parseDecimalDigits / parseRealNumber.
+{
+    char[] data; // Characters kept from the literal ('_' separators are dropped), ready for numeric conversion.
+    int length;  // Number of source characters scanned; used to advance indices and to build error ranges.
+}
+
+/**
+ * Parse a decimal integer or floating point literal.
+ *
+ * A literal containing '.', 'e' or 'E' is handed to parseRealNumber and
+ * converted with toReal; everything else goes through parseDecimalDigits and
+ * toUlong. Conversion failures are reported through `messages` and leave the
+ * value at real.init / 0.
+ *
+ * Params:
+ *   str      = text of the literal; must start with a decimal digit
+ *   loc      = source location of str[0], used for diagnostics
+ *   messages = sink for diagnostics
+ *
+ * Returns: the parsed Number. For integers the type is Int, Long (value does
+ * not fit in an int) or ULong (value does not fit in a long); for floating
+ * literals it is Double, or Real when the value exceeds double.max.
+ */
+Number parseNumber(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    Number num;
+
+    // An empty literal has no first character to dispatch on.
+    if(str.length == 0)
+    {
+        messages.report(InvalidStartInteger, loc, loc+1);
+        return num;
+    }
+
+    switch(str[0])
+    {
+        case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+            if(str.contains('.') || str.contains('e') || str.contains('E'))
+            {
+                auto n = parseRealNumber(str, loc, messages);
+
+                try
+                {
+                    num.floating = toReal(n.data);
+                    num.type = NumberType.Double;
+                }
+                catch(Exception e)
+                {
+                    num.floating = real.init;
+                    messages.report(FloatingToLarge, loc, loc + n.length - 1);
+                }
+
+                // real.init is NaN, so this comparison is false after a failure.
+                if(num.floating > double.max)
+                    num.type = NumberType.Real;
+            }
+            else
+            {
+                auto n = parseDecimalDigits(str, loc, messages);
+
+                try
+                {
+                    num.integer = toUlong(n.data);
+                }
+                catch(Exception e)
+                {
+                    num.integer = 0;
+                    messages.report(IntegerToLarge, loc, loc + n.length - 1);
+                }
+
+                // A value above int.max does not fit the default Int type, so
+                // the promotion threshold is int.max (not uint.max): otherwise
+                // e.g. 3_000_000_000 would be typed Int.
+                if(num.integer > int.max)
+                    num.type = NumberType.Long;
+                if(num.integer > long.max)
+                    num.type = NumberType.ULong;
+            }
+            break;
+        default:
+            messages.report(InvalidStartInteger, loc, loc+1);
+    }
+
+//    printNumber(str, num);
+    return num;
+}
+
+/**
+ * Scan a run of decimal digits and '_' separators from the start of `str`.
+ *
+ * Params:
+ *   str      = text to scan; scanning stops at the first character that is
+ *              neither a decimal digit nor '_', or at the end of the text
+ *   loc      = unused here; kept for interface compatibility
+ *   messages = unused here; kept for interface compatibility
+ *
+ * Returns: `data` holds the digits with the '_' separators removed, `length`
+ * is the number of characters scanned. The terminating character is never
+ * counted, even when it is the last character of `str` (previously it was,
+ * which made callers skip a trailing '.', 'e' or sign). An empty `str` yields
+ * an empty result.
+ */
+NumberReturn parseDecimalDigits(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    char[] number;
+
+    int i = 0;
+    for(; i < str.length; i++)
+    {
+        char c = str[i];
+        if(c >= '0' && c <= '9')
+            number ~= c;
+        else if(c != '_')
+            break; // first character that is not part of the digit run
+    }
+
+    NumberReturn res;
+    res.length = i;
+    res.data = number;
+
+    return res;
+}
+
+/**
+ * Scan a floating point literal and normalise it for numeric conversion.
+ *
+ * Digits are collected through parseDecimalDigits (dropping '_'), at most one
+ * '.' and one exponent marker are kept, and a sign is only kept directly
+ * after the exponent marker. Misplaced characters are reported through
+ * `messages` and skipped.
+ *
+ * Params:
+ *   str      = text of the literal
+ *   loc      = source location of str[0], used for diagnostics
+ *   messages = sink for diagnostics
+ *
+ * Returns: `data` holds the cleaned literal and `length` the number of
+ * characters scanned. If the literal ends in an exponent marker or a sign,
+ * FloatingInvalidEnd is reported and an empty result is returned. An empty
+ * result is also returned when no character of the literal could be kept.
+ */
+NumberReturn parseRealNumber(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    int i = 0;
+
+    bool dot, e;
+    char[] number;
+
+    NumberReturn num;
+
+    bool end;
+    // The bounds test is part of the loop condition so an empty `str` is
+    // never indexed.
+    while(!end && i < str.length)
+    {
+        switch(str[i])
+        {
+            case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_':
+                auto n = parseDecimalDigits(str[i..$], loc, messages);
+                number ~= n.data;
+                i += n.length;
+                break;
+            case '.':
+                if(e)
+                    messages.report(FloatingDotInE, loc + i, loc + i + 1);
+                else if(dot)
+                    messages.report(OnlyOneDotFloating, loc + i, loc + i + 1);
+                else
+                {
+                    dot = true;
+                    number ~= str[i];
+                }
+                i++;
+                break;
+            case 'e':
+            case 'E':
+                if(e)
+                    messages.report(OnlyOneEFloating, loc + i, loc + i + 1);
+                else
+                {
+                    e = true;
+                    number ~= str[i];
+                }
+                i++;
+                break;
+            case '+':
+            case '-':
+                // A sign is only legal right after the exponent marker. The
+                // length test keeps number[$-1] from being read while nothing
+                // has been collected yet.
+                if (number.length == 0 ||
+                    (number[$-1] != 'e' && number[$-1] != 'E'))
+                    messages.report(FloatingBadLocation, loc + i, loc + i + 1)
+                        .arg(str[i]);
+                else
+                    number ~= str[i];
+                i++;
+                break;
+            default:
+                end = true;
+        }
+    }
+
+    // Nothing usable was collected: there is no last character to inspect.
+    if (number.length == 0)
+        return num;
+
+    if (number[$-1] == '+' ||
+        number[$-1] == '-' ||
+        number[$-1] == 'e' ||
+        number[$-1] == 'E')
+    {
+        messages.report(FloatingInvalidEnd, loc + i - 1, loc + i);
+        return num;
+    }
+
+    num.data = number;
+    num.length = i;
+
+    return num;
+}
+
+
+// Debug helper: writes the literal text on one line, then the parsed value
+// together with the name of the type it was classified as.
+void printNumber(char[] str, Number num)
+{
+    Stdout(str)(" have become").newline;
+
+    char[] typeName;
+    bool isFloating;
+    switch(num.type)
+    {
+        case NumberType.Int:
+            typeName = "int";
+            break;
+        case NumberType.UInt:
+            typeName = "uint";
+            break;
+        case NumberType.Long:
+            typeName = "long";
+            break;
+        case NumberType.ULong:
+            typeName = "ulong";
+            break;
+        case NumberType.Float:
+            typeName = "float";
+            isFloating = true;
+            break;
+        case NumberType.Double:
+            typeName = "double";
+            isFloating = true;
+            break;
+        case NumberType.Real:
+            typeName = "real";
+            isFloating = true;
+            break;
+    }
+
+    // Integer kinds print Number.integer, floating kinds Number.floating.
+    if(isFloating)
+        Stdout(num.floating)(" of type ")(typeName);
+    else
+        Stdout(num.integer)(" of type ")(typeName);
+
+    Stdout().newline;
+}
+
+
+/**
+ * Parse a string literal, dispatching on its first character.
+ *
+ *   r"..."  and  `...`  are handled by parseWysiwygString,
+ *   "..."               by parseDoubleQuotedString,
+ *   x"..."              by parseHexString.
+ *
+ * Params:
+ *   str      = complete text of the literal, including prefix and quotes
+ *   loc      = source location of str[0], used for diagnostics
+ *   messages = sink for diagnostics
+ *
+ * Returns: the decoded string. For an empty `str` or an unknown first
+ * character InvalidStrPrefix is reported and an empty String is returned.
+ */
+String parseString(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    String strBuf;
+    // Grow to the literal's size and truncate again - presumably to reserve
+    // capacity for the appends done by the sub-parsers.
+    strBuf.data.length = str.length;
+    strBuf.data.length = 0;
+
+    // An empty literal has no prefix character to dispatch on.
+    if(str.length == 0)
+    {
+        messages.report(InvalidStrPrefix, loc, loc + 1);
+        return strBuf;
+    }
+
+    switch(str[0])
+    {
+        case 'r':
+            strBuf = parseWysiwygString(str[1..$], strBuf);
+            break;
+        case '`':
+            strBuf = parseWysiwygString(str, strBuf);
+            break;
+        case '"':
+            strBuf = parseDoubleQuotedString(str, strBuf, loc, messages);
+            break;
+        case 'x':
+            strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages);
+            break;
+        default:
+            messages.report(InvalidStrPrefix, loc, loc + 1);
+
+    }
+
+//    printString(str, strBuf);
+
+    return strBuf;
+}
+
+/**
+ * Decode the body of a hex string literal (the part after the 'x' prefix).
+ *
+ * Every pair of hex digits becomes one byte appended to strBuf.data.
+ * Whitespace between digits (space, tab, vertical tab, form feed, CR, LF) is
+ * ignored; any other character is reported as InvalidHexStrChar and skipped.
+ * A trailing unpaired digit is dropped.
+ *
+ * Params:
+ *   str      = literal text starting at the opening '"'
+ *   strBuf   = buffer to append to
+ *   loc      = source location of str[0], used for diagnostics
+ *   messages = sink for diagnostics
+ *
+ * Returns: strBuf with the decoded bytes appended. Scanning stops at the
+ * closing '"' or at the end of `str`, whichever comes first.
+ */
+String parseHexString(char[] str, String strBuf, 
+        SourceLocation loc, MessageHandler messages)
+{
+    int i = 1; // first char is "
+    char[] hex = "0123456789abcdefABCDEF";
+    // Tab, vertical tab and form feed are whitespace too and must not be
+    // reported as invalid hex characters.
+    char[] whitespace = " \t\v\f\r\n";
+    char[] hexBuf;
+
+    // Bounded scan: an unterminated literal must not run past the end.
+    while(i < str.length && str[i] != '"')
+    {
+        if(hex.contains(str[i]))
+        {
+            hexBuf ~= str[i];
+            if(hexBuf.length == 2)
+            {
+                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
+                hexBuf.length = 0;
+            }
+        }
+        else if(!whitespace.contains(str[i]))
+            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
+
+        i++;
+    }
+
+    return strBuf;
+}
+//
+
+/**
+ * Decode a double quoted string literal, including escape sequences and an
+ * optional 'c' / 'w' / 'd' postfix.
+ *
+ * Params:
+ *   str      = literal text starting at the opening '"'
+ *   strBuf   = buffer to append to
+ *   loc      = source location of str[0], used for diagnostics
+ *   messages = sink for diagnostics
+ *
+ * Returns: strBuf with the decoded bytes appended. With a 'w' or 'd' postfix
+ * the data is converted to UTF-16 / UTF-32 and strBuf.type is set to match.
+ */
+String parseDoubleQuotedString(char[] str, String strBuf, 
+        SourceLocation loc, MessageHandler messages)
+{
+    int i = 1; // first char is "
+
+    // The bounds test comes first so a literal of length 0 or 1, or one
+    // without a closing quote, is never indexed out of range.
+    while(i < str.length && str[i] != '"')
+    {
+        if(str[i] == '\\') // EscapeSequence
+        {
+            EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages);
+            strBuf.data ~= res.data;
+            // Always make progress, even if the escape parser reports that
+            // it consumed nothing.
+            i += res.length > 0 ? res.length : 1;
+        }
+        else
+        {
+            strBuf.data ~= str[i];
+            i++;
+        }
+    }
+
+    if(str.length > i + 1) // Then we have a postfix. Lexer makes sure this is c, w or d.
+        switch(str[i+1])
+        {
+            case 'c':
+                break;
+            case 'w':
+                strBuf.data = cast(ubyte[])Utf.toString16(cast(char[])strBuf.data);
+                strBuf.type = StringType.WChar;
+                break;
+            case 'd':
+                strBuf.data = cast(ubyte[])Utf.toString32(cast(char[])strBuf.data);
+                strBuf.type = StringType.DChar;
+                break;
+        }
+
+
+    return strBuf;
+}
+
+// Collects the `digits` hex digits that follow the two-character introducer
+// of a \x, \u or \U escape. `consumed` receives the number of source
+// characters the escape occupies. Non-hex characters are reported as
+// StringHexInvalid (1-based digit position, expected digit count) and left
+// out of the result. If fewer than `digits` + 1 characters follow the
+// introducer, StringShortEscape is reported and everything but the last
+// character of `str` is consumed.
+private char[] escapeHexDigits(char[] str, int digits, out int consumed,
+        SourceLocation loc, MessageHandler messages)
+{
+    char[] hex = "0123456789abcdefABCDEF";
+    char[] hexBuf;
+
+    if(str.length >= digits + 3)
+    {
+        for(int i = 2; i < digits + 2; i++)
+        {
+            if(hex.contains(str[i]))
+                hexBuf ~= str[i];
+            else
+                messages.report(StringHexInvalid, loc + i, loc + i + 1)
+                    .arg(Integer.toString(i-1))
+                    .arg(Integer.toString(digits));
+        }
+        consumed = digits + 2;
+    }
+    else
+    {
+        messages.report(StringShortEscape, loc, loc + str.length);
+        consumed = str.length - 1;
+    }
+
+    return hexBuf;
+}
+
+/**
+ * Decode one escape sequence.
+ *
+ * Params:
+ *   str      = text starting at the backslash and running to the end of the
+ *              enclosing literal
+ *   loc      = source location of the backslash, used for diagnostics
+ *   messages = sink for diagnostics
+ *
+ * Returns: `data` holds the bytes the escape stands for (empty when it could
+ * not be decoded) and `length` the number of source characters it occupies,
+ * so the caller can continue right after it.
+ *
+ * Supported forms: the single character escapes \' \" \? \\ \a \b \f \n \r
+ * \t \v, \xHH, \uHHHH, \UHHHHHHHH, one to three octal digits, and named
+ * character entities \&name;.
+ */
+EscapeReturn parseEscapeSequence(char[] str,
+        SourceLocation loc, MessageHandler messages)
+{
+    EscapeReturn res;
+
+    // A lone backslash at the very end has no escape character to inspect.
+    if(str.length < 2)
+    {
+        messages.report(StringShortEscape, loc, loc + str.length);
+        res.length = str.length;
+        return res;
+    }
+
+    switch(str[1])
+    {
+        case '\'':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\'';
+            break;
+        case '"':
+            res.length = 2;
+            res.data ~= cast(ubyte)'"';
+            break;
+        case '?':
+            res.length = 2;
+            res.data ~= cast(ubyte)'?';
+            break;
+        case '\\':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\\';
+            break;
+        case 'a':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\a';
+            break;
+        case 'b':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\b';
+            break;
+        case 'f':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\f';
+            break;
+        case 'n':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\n';
+            break;
+        case 'r':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\r';
+            break;
+        case 't':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\t';
+            break;
+        case 'v':
+            res.length = 2;
+            res.data ~= cast(ubyte)'\v';
+            break;
+        case 'x':
+        {
+            // \xHH inserts the byte as is.
+            char[] hexBuf = escapeHexDigits(str, 2, res.length, loc, messages);
+            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
+            break;
+        }
+        case 'u':
+        case 'U':
+        {
+            // \uHHHH and \UHHHHHHHH name a code point that is stored as UTF-8.
+            int digits = (str[1] == 'u') ? 4 : 8;
+            char[] hexBuf = escapeHexDigits(str, digits, res.length, loc, messages);
+            uint code = cast(uint)Integer.toLong(hexBuf, 16);
+            if(!isValidUtf8(code))
+                messages.report(InvalidUtf8Hex, loc, loc + digits + 2);
+            else
+                res.data ~= parseToUtf8(code);
+            break;
+        }
+        case '0', '1', '2', '3', '4', '5', '6', '7':
+        {
+            // Up to three octal digits; stop at the first non-octal
+            // character or at the end of the text.
+            char[] oct = "01234567";
+            char[] octBuf;
+            octBuf ~= str[1];
+            res.length = 2;
+            for(int i = 2; i < 4 && i < str.length && oct.contains(str[i]); i++)
+            {
+                octBuf ~= str[i];
+                res.length += 1;
+            }
+
+            // Values above 0xFF are truncated to a single byte.
+            res.data ~= cast(ubyte)Integer.toLong(octBuf, 8);
+            break;
+        }
+        case '&':
+        {
+            int i = 2;
+            char[] s;
+            while(i < str.length && str[i] != ';' && str[i] != '"')
+            {
+                s ~= str[i];
+                i++;
+            }
+
+            if(i >= str.length || str[i] != ';')
+            {
+                // No terminating ';' before the end of the string: consume
+                // only the "\&" so the caller carries on with the following
+                // characters and still sees the closing quote.
+                messages.report(NoCharEntityEnd, loc+i, loc+i+1);
+                res.length = 2;
+                break;
+            }
+
+            auto code = s in characterEntities;
+            if(code !is null)
+                res.data ~= parseToUtf8(*code);
+            else
+                messages.report(InvalidCharEntity, loc + 2, loc + i);
+
+            res.length = i + 1; // remember the ;
+            break;
+        }
+        default:
+            messages.report(InvalidStrEscape, loc, loc + 2);
+            res.length = 2;
+    }
+
+    return res;
+}
+
+/**
+ * Copy the contents of a wysiwyg string literal verbatim.
+ *
+ * Params:
+ *   str    = literal text starting at the opening delimiter (" or `)
+ *   strBuf = buffer to append to
+ *
+ * Returns: strBuf with every character between the opening delimiter and its
+ * next occurrence appended unchanged. An empty `str` leaves strBuf untouched,
+ * and a missing closing delimiter copies up to the end of `str`.
+ */
+String parseWysiwygString(char[] str, String strBuf)
+{
+    if(str.length == 0)
+        return strBuf;
+
+    char start = str[0];
+
+    // Bounded scan: an unterminated literal must not run past the end.
+    for(int i = 1; i < str.length && str[i] != start; i++)
+        strBuf.data ~= cast(ubyte)str[i];
+
+    return strBuf;
+}
+
+/**
+ * Encode a code point as UTF-8.
+ *
+ * Params:
+ *   i = code point to encode
+ *
+ * Returns: the one to four byte UTF-8 encoding of `i`, most significant byte
+ * first, or an empty array when `i` is above 0x10FFFF.
+ *
+ * The bytes are built with plain shifts and masks. Setting the marker bits
+ * with bts() through a uint* cast of a ubyte local would access four bytes of
+ * a one byte variable and depend on the byte order of the machine.
+ */
+ubyte[] parseToUtf8(uint i)
+{
+    if(i <= 0x00007F)
+        return [cast(ubyte)i];
+    else if(i <= 0x0007FF)
+    {
+        // 110xxxxx 10xxxxxx
+        return [cast(ubyte)(0xC0 | (i >> 6)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+    }
+    else if(i <= 0x00FFFF)
+    {
+        // 1110xxxx 10xxxxxx 10xxxxxx
+        return [cast(ubyte)(0xE0 | (i >> 12)),
+                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+    }
+    else if(i <= 0x10FFFF)
+    {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        return [cast(ubyte)(0xF0 | (i >> 18)),
+                cast(ubyte)(0x80 | ((i >> 12) & 0x3F)),
+                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+    }
+
+    // Not encodable; previously control fell off the end of the function.
+    return null;
+}
+
+/**
+ * Tell whether a code point can be encoded as UTF-8.
+ *
+ * Params:
+ *   i = code point to check
+ *
+ * Returns: true for 0 .. 0x10FFFF except the surrogate range
+ * 0xD800 .. 0xDFFF, which is reserved for UTF-16 and has no valid UTF-8
+ * encoding; false otherwise.
+ */
+bool isValidUtf8(uint i)
+{
+    if(i >= 0xD800 && i <= 0xDFFF)
+        return false;
+    return i <= 0x10FFFF;
+}
+
+// Debug helper: writes the literal text on one line and the decoded contents
+// on the next, interpreting the bytes according to the string's type.
+void printString(char[] str, String strBuf)
+{
+    switch(strBuf.type)
+    {
+        case StringType.Char:
+            auto narrow = cast(char[])strBuf.data;
+            Stdout(str)(" have become").newline;
+            Stdout(narrow).newline;
+            break;
+        case StringType.WChar:
+            auto wide = cast(wchar[])strBuf.data;
+            Stdout(str)(" have become").newline;
+            Stdout(wide).newline;
+            break;
+        case StringType.DChar:
+            auto full = cast(dchar[])strBuf.data;
+            Stdout(str)(" have become").newline;
+            Stdout(full).newline;
+            break;
+    }
+}
+
+static ushort[char[]] characterEntities; // Entity name -> code point; looked up by parseEscapeSequence for \&name; escapes.
+
+static this() // Module constructor: fills characterEntities before the parsers are used.
+{
+    characterEntities = 
+    [
+        "quot"[]: 34, // NOTE(review): the [] slice presumably makes the literal's key type char[] rather than a fixed-size array - confirm
+        "amp": 38,
+        "lt": 60,
+        "gt": 62,
+        "OElig": 338,
+        "oelig": 339,
+        "Scaron": 352,
+        "scaron": 353,
+        "Yuml": 376,
+        "circ": 710,
+        "tilde": 732,
+        "ensp": 8194,
+        "emsp": 8195,
+        "thinsp": 8201,
+        "zwnj": 8204,
+        "zwj": 8205,
+        "lrm": 8206,
+        "rlm": 8207,
+        "ndash": 8211,
+        "mdash": 8212,
+        "lsquo": 8216,
+        "rsquo": 8217,
+        "sbquo": 8218,
+        "ldquo": 8220,
+        "rdquo": 8221,
+        "bdquo": 8222,
+        "dagger": 8224,
+        "Dagger": 8225,
+        "permil": 8240,
+        "lsaquo": 8249,
+        "rsaquo": 8250,
+        "euro": 8364,
+        "nbsp": 160, // From here to "yuml" the values run consecutively from 160 to 255.
+        "iexcl": 161,
+        "cent": 162,
+        "pound": 163,
+        "curren": 164,
+        "yen": 165,
+        "brvbar": 166,
+        "sect": 167,
+        "uml": 168,
+        "copy": 169,
+        "ordf": 170,
+        "laquo": 171,
+        "not": 172,
+        "shy": 173,
+        "reg": 174,
+        "macr": 175,
+        "deg": 176,
+        "plusmn": 177,
+        "sup2": 178,
+        "sup3": 179,
+        "acute": 180,
+        "micro": 181,
+        "para": 182,
+        "middot": 183,
+        "cedil": 184,
+        "sup1": 185,
+        "ordm": 186,
+        "raquo": 187,
+        "frac14": 188,
+        "frac12": 189,
+        "frac34": 190,
+        "iquest": 191,
+        "Agrave": 192,
+        "Aacute": 193,
+        "Acirc": 194,
+        "Atilde": 195,
+        "Auml": 196,
+        "Aring": 197,
+        "AElig": 198,
+        "Ccedil": 199,
+        "Egrave": 200,
+        "Eacute": 201,
+        "Ecirc": 202,
+        "Euml": 203,
+        "Igrave": 204,
+        "Iacute": 205,
+        "Icirc": 206,
+        "Iuml": 207,
+        "ETH": 208,
+        "Ntilde": 209,
+        "Ograve": 210,
+        "Oacute": 211,
+        "Ocirc": 212,
+        "Otilde": 213,
+        "Ouml": 214,
+        "times": 215,
+        "Oslash": 216,
+        "Ugrave": 217,
+        "Uacute": 218,
+        "Ucirc": 219,
+        "Uuml": 220,
+        "Yacute": 221,
+        "THORN": 222,
+        "szlig": 223,
+        "agrave": 224,
+        "aacute": 225,
+        "acirc": 226,
+        "atilde": 227,
+        "auml": 228,
+        "aring": 229,
+        "aelig": 230,
+        "ccedil": 231,
+        "egrave": 232,
+        "eacute": 233,
+        "ecirc": 234,
+        "euml": 235,
+        "igrave": 236,
+        "iacute": 237,
+        "icirc": 238,
+        "iuml": 239,
+        "eth": 240,
+        "ntilde": 241,
+        "ograve": 242,
+        "oacute": 243,
+        "ocirc": 244,
+        "otilde": 245,
+        "ouml": 246,
+        "divide": 247,
+        "oslash": 248,
+        "ugrave": 249,
+        "uacute": 250,
+        "ucirc": 251,
+        "uuml": 252,
+        "yacute": 253,
+        "thorn": 254,
+        "yuml": 255,
+        "fnof": 402,
+        "Alpha": 913,
+        "Beta": 914,
+        "Gamma": 915,
+        "Delta": 916,
+        "Epsilon": 917,
+        "Zeta": 918,
+        "Eta": 919,
+        "Theta": 920,
+        "Iota": 921,
+        "Kappa": 922,
+        "Lambda": 923,
+        "Mu": 924,
+        "Nu": 925,
+        "Xi": 926,
+        "Omicron": 927,
+        "Pi": 928,
+        "Rho": 929,
+        "Sigma": 931,
+        "Tau": 932,
+        "Upsilon": 933,
+        "Phi": 934,
+        "Chi": 935,
+        "Psi": 936,
+        "Omega": 937,
+        "alpha": 945,
+        "beta": 946,
+        "gamma": 947,
+        "delta": 948,
+        "epsilon": 949,
+        "zeta": 950,
+        "eta": 951,
+        "theta": 952,
+        "iota": 953,
+        "kappa": 954,
+        "lambda": 955,
+        "mu": 956,
+        "nu": 957,
+        "xi": 958,
+        "omicron": 959,
+        "pi": 960,
+        "rho": 961,
+        "sigmaf": 962,
+        "sigma": 963,
+        "tau": 964,
+        "upsilon": 965,
+        "phi": 966,
+        "chi": 967,
+        "psi": 968,
+        "omega": 969,
+        "thetasym": 977,
+        "upsih": 978,
+        "piv": 982,
+        "bull": 8226,
+        "hellip": 8230,
+        "prime": 8242,
+        "Prime": 8243,
+        "oline": 8254,
+        "frasl": 8260,
+        "weierp": 8472,
+        "image": 8465,
+        "real": 8476,
+        "trade": 8482,
+        "alefsym": 8501,
+        "larr": 8592,
+        "uarr": 8593,
+        "rarr": 8594,
+        "darr": 8595,
+        "harr": 8596,
+        "crarr": 8629,
+        "lArr": 8656,
+        "uArr": 8657,
+        "rArr": 8658,
+        "dArr": 8659,
+        "hArr": 8660,
+        "forall": 8704,
+        "part": 8706,
+        "exist": 8707,
+        "empty": 8709,
+        "nabla": 8711,
+        "isin": 8712,
+        "notin": 8713,
+        "ni": 8715,
+        "prod": 8719,
+        "sum": 8721,
+        "minus": 8722,
+        "lowast": 8727,
+        "radic": 8730,
+        "prop": 8733,
+        "infin": 8734,
+        "ang": 8736,
+        "and": 8743,
+        "or": 8744,
+        "cap": 8745,
+        "cup": 8746,
+        "int": 8747,
+        "there4": 8756,
+        "sim": 8764,
+        "cong": 8773,
+        "asymp": 8776,
+        "ne": 8800,
+        "equiv": 8801,
+        "le": 8804,
+        "ge": 8805,
+        "sub": 8834,
+        "sup": 8835,
+        "nsub": 8836,
+        "sube": 8838,
+        "supe": 8839,
+        "oplus": 8853,
+        "otimes": 8855,
+        "perp": 8869,
+        "sdot": 8901,
+        "lceil": 8968,
+        "rceil": 8969,
+        "lfloor": 8970,
+        "rfloor": 8971,
+        "lang": 9001,
+        "rang": 9002,
+        "loz": 9674,
+        "spades": 9824,
+        "clubs": 9827,
+        "hearts": 9829,
+        "diams": 9830
+    ];
+}