changeset 111:c658172ca8a0

Parsing basic integers and floats.
author Anders Johnsen <skabet@gmail.com>
date Sun, 25 May 2008 15:42:44 +0200
parents 2deb4c1f0d93
children d03b011c50e9
files ast/Exp.d basic/LiteralParsing.d basic/Messages.d lexer/Lexer.d sema/LiteralInterpreter.d tests/parser/float_1.d tests/parser/int_1.d tests/parser/string_1.d
diffstat 8 files changed, 640 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/ast/Exp.d	Sun May 25 14:56:05 2008 +0200
+++ b/ast/Exp.d	Sun May 25 15:42:44 2008 +0200
@@ -301,7 +301,7 @@
     {
         super(ExpType.IntegerLit, loc);
         range = SourceRange(loc, loc + t.length);
-        this.name = substitute(t, "_", "");
+        this.name = t;
     }
 
     char[] get()
--- a/basic/LiteralParsing.d	Sun May 25 14:56:05 2008 +0200
+++ b/basic/LiteralParsing.d	Sun May 25 15:42:44 2008 +0200
@@ -1,32 +1,279 @@
 module basic.LiteralParsing.d;
 
 import basic.SourceLocation,
-       basic.Message;
+       basic.Message,
+       basic.conv;
 
 import tango.io.Stdout,
        tango.core.BitManip,
        Integer = tango.text.convert.Integer,
+       Utf = tango.text.convert.Utf,
        tango.text.Util;
 
-enum StructType
+enum StringType
 {
     Char,
     WChar,
     DChar
 }
 
+enum NumberType // kind of value a numeric literal was parsed as; stored in Number.type
+{
+    Int,    // first member, hence the value a freshly declared Number starts with
+    UInt,
+    Long,
+    ULong,
+    Float,
+    Double,
+    Real
+}
+
 struct String
 {
-    StructType type;
+    StringType type;
     ubyte[] data;
 }
 
+struct Number // result of parseNumber
+{
+    NumberType type;    // tells which of the two value fields below applies
+    ulong integer;      // value of an integer literal
+    real  floating;     // value of a floating point literal
+}
+
 private struct EscapeReturn
 {
     ubyte[] data;
     int length;
 }
 
+private struct NumberReturn // what parseDecimalDigits and parseRealNumber hand back
+{
+    char[] data;    // characters kept from the literal ('_' separators are not copied)
+    int length;     // how far the scanner advanced in its input
+}
+
+/**
+ * Converts the text of a numeric literal into a Number.
+ *
+ * A literal containing '.', 'e' or 'E' is scanned with parseRealNumber and
+ * converted with toReal; anything else is scanned with parseDecimalDigits
+ * and converted with toUlong. A conversion that throws is reported as
+ * FloatingToLarge / IntegerToLarge. Suffixes are not interpreted here, so
+ * the UInt and Float types are never selected.
+ * Params: str = the literal text, loc = location of str[0] for diagnostics,
+ *         messages = receiver of the diagnostics.
+ * Returns: the value and the type chosen for it.
+ */
+Number parseNumber(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    Number num;
+    // Checking the length first keeps an empty literal from being indexed.
+    if(str.length == 0 || str[0] < '0' || str[0] > '9')
+    {
+        messages.report(InvalidStartInteger, loc, loc+1);
+        return num;
+    }
+
+    if(str.contains('.') || str.contains('e') || str.contains('E'))
+    {
+        auto n = parseRealNumber(str, loc, messages);
+        try
+        {
+            num.floating = toReal(n.data);
+            num.type = NumberType.Double;
+        }
+        catch(Exception e)
+        {
+            num.floating = real.init;
+            messages.report(FloatingToLarge, loc, loc + n.length - 1);
+        }
+        if(num.floating > double.max)
+            num.type = NumberType.Real;
+    }
+    else
+    {
+        auto n = parseDecimalDigits(str, loc, messages);
+        try
+        {
+            num.integer = toUlong(n.data);
+        }
+        catch(Exception e)
+        {
+            num.integer = 0;
+            messages.report(IntegerToLarge, loc, loc + n.length - 1);
+        }
+        // Int is kept only while the value fits in int; values above
+        // int.max used to stay Int because the bound tested was uint.max.
+        if(num.integer > int.max)
+            num.type = NumberType.Long;
+        if(num.integer > long.max)
+            num.type = NumberType.ULong;
+    }
+
+//    printNumber(str, num);
+    return num;
+}
+
+/**
+ * Reads the run of decimal digits at the start of str.
+ *
+ * '_' separators are skipped and are not copied to the result. Scanning
+ * stops at the first character that is neither a digit nor '_', or at the
+ * end of str. An empty str gives an empty result.
+ *
+ * Params:
+ *  str      = text to scan, starting at the first candidate digit
+ *  loc      = not used by this function
+ *  messages = not used by this function; nothing is reported
+ *
+ * Returns: data holds the digits found and length the number of characters
+ * of str that were consumed. The terminating character is never counted.
+ */
+NumberReturn parseDecimalDigits(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    NumberReturn res;
+
+    // i < str.length is tested before every read, so neither an empty str
+    // nor a literal that ends in a digit can index past the end.
+    int i = 0;
+    while(i < str.length)
+    {
+        char c = str[i];
+
+        if(c >= '0' && c <= '9')
+            res.data ~= c;
+        else if(c != '_')
+            // leave the terminator for the caller
+            break;
+
+        i++;
+    }
+
+    // i now rests on the terminator (or on str.length). The previous version
+    // also counted a terminator in the last position of str, e.g. "12L" gave
+    // length 3, so callers that advance by length stepped over it.
+    res.length = i;
+
+    return res;
+}
+
+/**
+ * Scans a floating point literal and returns its text in cleaned-up form.
+ * Digits, the first '.', the first exponent marker and a sign directly after
+ * that marker are copied; '_' separators are dropped. Anything out of place
+ * is reported through messages and left out of the result.
+ * Returns: data holds the cleaned literal and length the number of characters
+ * scanned. Both stay empty/0 when no part of a number was found or when the
+ * literal ends in an exponent marker or a sign.
+ */
+NumberReturn parseRealNumber(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    int i = 0;
+    bool dot, e;
+    char[] number;
+    NumberReturn num;
+
+    // Start out finished on an empty str so that str[i] is never evaluated.
+    bool end = str.length == 0;
+    while(!end)
+    {
+        switch(str[i])
+        {
+            case '0': case '1': case '2': case '3': case '4':
+            case '5': case '6': case '7': case '8': case '9': case '_':
+                auto n = parseDecimalDigits(str[i..$], loc, messages);
+                number ~= n.data;
+                i += n.length;
+                break;
+            case '.':
+                if(e)
+                    messages.report(FloatingDotInE, loc + i, loc + i + 1);
+                else if(dot)
+                    messages.report(OnlyOneDotFloating, loc + i, loc + i + 1);
+                else
+                {
+                    dot = true;
+                    number ~= str[i];
+                }
+                i++;
+                break;
+            case 'e': case 'E':
+                if(e)
+                    messages.report(OnlyOneEFloating, loc + i, loc + i + 1);
+                else
+                {
+                    e = true;
+                    number ~= str[i];
+                }
+                i++;
+                break;
+            case '+': case '-':
+                // Valid only right after the exponent marker; number may be empty here.
+                if (number.length == 0 ||
+                    (number[$-1] != 'e' && number[$-1] != 'E'))
+                    messages.report(FloatingBadLocation, loc + i, loc + i + 1)
+                        .arg(str[i]);
+                else
+                    number ~= str[i];
+                i++;
+                break;
+            default:
+                end = true;
+        }
+        if(i >= str.length)
+            end = true;
+    }
+
+    // Nothing was collected (e.g. "" or "_"): there is no last character.
+    if (number.length == 0)
+    {
+        messages.report(InvalidStartInteger, loc, loc + 1);
+        return num;
+    }
+    if (number[$-1] == '+' || number[$-1] == '-' ||
+        number[$-1] == 'e' || number[$-1] == 'E')
+    {
+        messages.report(FloatingInvalidEnd, loc + i - 1, loc + i);
+        return num;
+    }
+
+    num.data = number;
+    num.length = i;
+    return num;
+}
+
+
+/**
+ * Debug aid: prints str, then the value and type name recorded in num.
+ * Integer kinds print num.integer, floating kinds print num.floating.
+ */
+void printNumber(char[] str, Number num)
+{
+    Stdout(str)(" have become").newline;
+
+    // Pick the type name and the value field; there is no default case.
+    char[] typeName;
+    bool isFloating;
+    switch(num.type)
+    {
+        case NumberType.Int:    typeName = "int";    break;
+        case NumberType.UInt:   typeName = "uint";   break;
+        case NumberType.Long:   typeName = "long";   break;
+        case NumberType.ULong:  typeName = "ulong";  break;
+        case NumberType.Float:  typeName = "float";  isFloating = true; break;
+        case NumberType.Double: typeName = "double"; isFloating = true; break;
+        case NumberType.Real:   typeName = "real";   isFloating = true; break;
+    }
+
+    if(isFloating)
+        Stdout(num.floating)(" of type ")(typeName);
+    else
+        Stdout(num.integer)(" of type ")(typeName);
+
+    Stdout().newline;
+}
+
+
 String parseString(char[] str, SourceLocation loc, MessageHandler messages)
 {
     String strBuf;
@@ -76,9 +323,7 @@
                 hexBuf.length = 0;
             }
         }
-        else if(whitespace.contains(str[i]))
-        {}
-        else
+        else if(!whitespace.contains(str[i]))
             messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
 
         i++;
@@ -88,7 +333,7 @@
 
     return strBuf;
 }
-
+//
 
 String parseDoubleQuotedString(char[] str, String strBuf, 
         SourceLocation loc, MessageHandler messages)
@@ -112,6 +357,22 @@
             break;
     }
 
+    if(str.length > i + 1) // Then we have a postfix. Lexer makes sure this is c, w or d.
+        switch(str[i+1])
+        {
+            case 'c':
+                break;
+            case 'w':
+                strBuf.data = cast(ubyte[])Utf.toString16(cast(char[])strBuf.data);
+                strBuf.type = StringType.WChar;
+                break;
+            case 'd':
+                strBuf.data = cast(ubyte[])Utf.toString32(cast(char[])strBuf.data);
+                strBuf.type = StringType.DChar;
+                break;
+        }
+
+
     return strBuf;
 }
 
@@ -261,6 +522,31 @@
             uint i = Integer.toLong(octBuf, 8);
             res.data ~= i;
             break;
+        case '&':
+            int i = 2;
+            char[] s;
+            while(str[i] != ';')
+            {
+                if(str[i] == '"')
+                {
+                    messages.report(NoCharEntityEnd, loc+i, loc+i+1);
+                    res.length = 2;
+                    break;
+                }
+                s ~= str[i];
+                i++;
+            }
+
+            if ( s in characterEntities )
+            {
+                res.data ~= parseToUtf8(characterEntities[s]);
+            }
+            else
+                messages.report(InvalidCharEntity, loc + 2, loc + i);
+
+            res.length = i + 1; // remember the ;
+
+            break;
         default:
             messages.report(InvalidStrEscape, loc, loc + 2);
             res.length += 2;
@@ -337,17 +623,278 @@
     char[] s;
     switch(strBuf.type)
     {
-        case StructType.Char:
+        case StringType.Char:
             Stdout(str)(" have become").newline()
                 (cast(char[])strBuf.data).newline;
             break;
-        case StructType.WChar:
+        case StringType.WChar:
             Stdout(str)(" have become").newline()
                 (cast(wchar[])strBuf.data).newline;
             break;
-        case StructType.DChar:
+        case StringType.DChar:
             Stdout(str)(" have become").newline()
                 (cast(dchar[])strBuf.data).newline;
             break;
     }
 }
+
+static ushort[char[]] characterEntities;
+
+static this()
+{
+    characterEntities = 
+    [
+        "quot"[]: 34,
+        "amp": 38,
+        "lt": 60,
+        "gt": 62,
+        "OElig": 338,
+        "oelig": 339,
+        "Scaron": 352,
+        "scaron": 353,
+        "Yuml": 376,
+        "circ": 710,
+        "tilde": 732,
+        "ensp": 8194,
+        "emsp": 8195,
+        "thinsp": 8201,
+        "zwnj": 8204,
+        "zwj": 8205,
+        "lrm": 8206,
+        "rlm": 8207,
+        "ndash": 8211,
+        "mdash": 8212,
+        "lsquo": 8216,
+        "rsquo": 8217,
+        "sbquo": 8218,
+        "ldquo": 8220,
+        "rdquo": 8221,
+        "bdquo": 8222,
+        "dagger": 8224,
+        "Dagger": 8225,
+        "permil": 8240,
+        "lsaquo": 8249,
+        "rsaquo": 8250,
+        "euro": 8364,
+        "nbsp": 160,
+        "iexcl": 161,
+        "cent": 162,
+        "pound": 163,
+        "curren": 164,
+        "yen": 165,
+        "brvbar": 166,
+        "sect": 167,
+        "uml": 168,
+        "copy": 169,
+        "ordf": 170,
+        "laquo": 171,
+        "not": 172,
+        "shy": 173,
+        "reg": 174,
+        "macr": 175,
+        "deg": 176,
+        "plusmn": 177,
+        "sup2": 178,
+        "sup3": 179,
+        "acute": 180,
+        "micro": 181,
+        "para": 182,
+        "middot": 183,
+        "cedil": 184,
+        "sup1": 185,
+        "ordm": 186,
+        "raquo": 187,
+        "frac14": 188,
+        "frac12": 189,
+        "frac34": 190,
+        "iquest": 191,
+        "Agrave": 192,
+        "Aacute": 193,
+        "Acirc": 194,
+        "Atilde": 195,
+        "Auml": 196,
+        "Aring": 197,
+        "AElig": 198,
+        "Ccedil": 199,
+        "Egrave": 200,
+        "Eacute": 201,
+        "Ecirc": 202,
+        "Euml": 203,
+        "Igrave": 204,
+        "Iacute": 205,
+        "Icirc": 206,
+        "Iuml": 207,
+        "ETH": 208,
+        "Ntilde": 209,
+        "Ograve": 210,
+        "Oacute": 211,
+        "Ocirc": 212,
+        "Otilde": 213,
+        "Ouml": 214,
+        "times": 215,
+        "Oslash": 216,
+        "Ugrave": 217,
+        "Uacute": 218,
+        "Ucirc": 219,
+        "Uuml": 220,
+        "Yacute": 221,
+        "THORN": 222,
+        "szlig": 223,
+        "agrave": 224,
+        "aacute": 225,
+        "acirc": 226,
+        "atilde": 227,
+        "auml": 228,
+        "aring": 229,
+        "aelig": 230,
+        "ccedil": 231,
+        "egrave": 232,
+        "eacute": 233,
+        "ecirc": 234,
+        "euml": 235,
+        "igrave": 236,
+        "iacute": 237,
+        "icirc": 238,
+        "iuml": 239,
+        "eth": 240,
+        "ntilde": 241,
+        "ograve": 242,
+        "oacute": 243,
+        "ocirc": 244,
+        "otilde": 245,
+        "ouml": 246,
+        "divide": 247,
+        "oslash": 248,
+        "ugrave": 249,
+        "uacute": 250,
+        "ucirc": 251,
+        "uuml": 252,
+        "yacute": 253,
+        "thorn": 254,
+        "yuml": 255,
+        "fnof": 402,
+        "Alpha": 913,
+        "Beta": 914,
+        "Gamma": 915,
+        "Delta": 916,
+        "Epsilon": 917,
+        "Zeta": 918,
+        "Eta": 919,
+        "Theta": 920,
+        "Iota": 921,
+        "Kappa": 922,
+        "Lambda": 923,
+        "Mu": 924,
+        "Nu": 925,
+        "Xi": 926,
+        "Omicron": 927,
+        "Pi": 928,
+        "Rho": 929,
+        "Sigma": 931,
+        "Tau": 932,
+        "Upsilon": 933,
+        "Phi": 934,
+        "Chi": 935,
+        "Psi": 936,
+        "Omega": 937,
+        "alpha": 945,
+        "beta": 946,
+        "gamma": 947,
+        "delta": 948,
+        "epsilon": 949,
+        "zeta": 950,
+        "eta": 951,
+        "theta": 952,
+        "iota": 953,
+        "kappa": 954,
+        "lambda": 955,
+        "mu": 956,
+        "nu": 957,
+        "xi": 958,
+        "omicron": 959,
+        "pi": 960,
+        "rho": 961,
+        "sigmaf": 962,
+        "sigma": 963,
+        "tau": 964,
+        "upsilon": 965,
+        "phi": 966,
+        "chi": 967,
+        "psi": 968,
+        "omega": 969,
+        "thetasym": 977,
+        "upsih": 978,
+        "piv": 982,
+        "bull": 8226,
+        "hellip": 8230,
+        "prime": 8242,
+        "Prime": 8243,
+        "oline": 8254,
+        "frasl": 8260,
+        "weierp": 8472,
+        "image": 8465,
+        "real": 8476,
+        "trade": 8482,
+        "alefsym": 8501,
+        "larr": 8592,
+        "uarr": 8593,
+        "rarr": 8594,
+        "darr": 8595,
+        "harr": 8596,
+        "crarr": 8629,
+        "lArr": 8656,
+        "uArr": 8657,
+        "rArr": 8658,
+        "dArr": 8659,
+        "hArr": 8660,
+        "forall": 8704,
+        "part": 8706,
+        "exist": 8707,
+        "empty": 8709,
+        "nabla": 8711,
+        "isin": 8712,
+        "notin": 8713,
+        "ni": 8715,
+        "prod": 8719,
+        "sum": 8721,
+        "minus": 8722,
+        "lowast": 8727,
+        "radic": 8730,
+        "prop": 8733,
+        "infin": 8734,
+        "ang": 8736,
+        "and": 8743,
+        "or": 8744,
+        "cap": 8745,
+        "cup": 8746,
+        "int": 8747,
+        "there4": 8756,
+        "sim": 8764,
+        "cong": 8773,
+        "asymp": 8776,
+        "ne": 8800,
+        "equiv": 8801,
+        "le": 8804,
+        "ge": 8805,
+        "sub": 8834,
+        "sup": 8835,
+        "nsub": 8836,
+        "sube": 8838,
+        "supe": 8839,
+        "oplus": 8853,
+        "otimes": 8855,
+        "perp": 8869,
+        "sdot": 8901,
+        "lceil": 8968,
+        "rceil": 8969,
+        "lfloor": 8970,
+        "rfloor": 8971,
+        "lang": 9001,
+        "rang": 9002,
+        "loz": 9674,
+        "spades": 9824,
+        "clubs": 9827,
+        "hearts": 9829,
+        "diams": 9830
+    ];
+}
--- a/basic/Messages.d	Sun May 25 14:56:05 2008 +0200
+++ b/basic/Messages.d	Sun May 25 15:42:44 2008 +0200
@@ -34,8 +34,16 @@
     InvalidStrEscape,
     InvalidUtf8Hex,
     InvalidHexStrChar,
+    InvalidCharEntity,
+    NoCharEntityEnd,
     StringShortEscape,
     StringHexInvalid,
+    InvalidStartInteger,
+    IntegerToLarge,
+    FloatingToLarge,
+    FloatingInvalidEnd,
+    FloatingBadLocation,
+    FloatingDotInE,
 }
 
 enum MessageType
@@ -85,9 +93,17 @@
         InvalidStrPrefix    : E(Err, "Invalid string literal prefix"),
         InvalidStrEscape    : E(Err, "Invalid escape sequence"),
         InvalidUtf8Hex      : E(Err, "Invalid Utf8 hex char"),
+        NoCharEntityEnd     : E(Err, "Character entity have no end, insert ';'"),
+        InvalidCharEntity   : E(Err, "Invalid character entity"),
         InvalidHexStrChar   : E(Err, "Invalid character in hex string"),
         StringShortEscape   : E(Err, "String literal is to short for escape sequence"),
-        StringHexInvalid    : E(Err, "Hex escape sequence have invalid digit at position %0 of %1")
+        StringHexInvalid    : E(Err, "Hex escape sequence have invalid digit at position %0 of %1"),
+        InvalidStartInteger : E(Err, "Invalid begining of number"),
+        IntegerToLarge      : E(Err, "Integer is to large. Max size is 18446744073709551615"),
+        FloatingToLarge     : E(Err, "Floating literal is to large"),
+        FloatingInvalidEnd  : E(Err, "Floating literal have wrong ending"),
+        FloatingBadLocation : E(Err, "Bad location for '%0' in floting literal"),
+        FloatingDotInE      : E(Err, "There cannot be a dot in the exponent of a floating literal")
     ];
 }
 
--- a/lexer/Lexer.d	Sun May 25 14:56:05 2008 +0200
+++ b/lexer/Lexer.d	Sun May 25 15:42:44 2008 +0200
@@ -289,7 +289,15 @@
                 {
                     ++position;
                     if (source[position-1] == '"' )
+                    {
+                        if(getNextChar != CharType.EOF)
+                            if (source[position] == 'c' ||
+                                source[position] == 'w' ||
+                                source[position] == 'd')
+                                position++;
+
                         return Token(Tok.String, Loc(start), position - start);
+                    }
                     else if (source[position-1] == '\\')
                         position++;
                 }
@@ -310,12 +318,11 @@
     
     Token lexNumber ()
     {
-        bool sign = false;
-        bool dot = false;
-        bool e = false;
+        bool sign;
 
         int i = 0;
 
+
         bool end = false;
         while(!end)
         {
@@ -326,11 +333,15 @@
                 case CharType.Symbol:
                     if(this.source[position+i] == '.')
                     {
-                        if(dot)
-                            messages.report(OnlyOneDotFloating, Loc(position + i));
-                        dot = true;
                         break;
                     }
+                    if (this.source[position+i] == '+' ||
+                        this.source[position+i] == '-')
+                    {
+                        if (source[position+i-1] == 'e' ||
+                            source[position+i-1] == 'E')
+                            break;
+                    }
                     end = true;
                     continue;
                 case CharType.Letter:
@@ -339,9 +350,6 @@
                     if (this.source[position+i] == 'e' || 
                         this.source[position+i] == 'E')
                     {
-                        if (e)
-                            messages.report(OnlyOneEFloating, Loc(position + i));
-                        e = true;
                         break;
                     }
                     end = true;
@@ -354,6 +362,13 @@
             i++;
         }
 
+        while(source[position+i] == 'u' ||
+              source[position+i] == 'U' ||
+              source[position+i] == 'L')
+            i += 1;
+
+        
+
         position += i;
 
         return Token(Tok.Integer, Loc(position - i), i);
--- a/sema/LiteralInterpreter.d	Sun May 25 14:56:05 2008 +0200
+++ b/sema/LiteralInterpreter.d	Sun May 25 15:42:44 2008 +0200
@@ -23,5 +23,10 @@
         auto type = parseString(exp.str, exp.loc, messages);
     }
 
+    void visitIntegerLit(IntegerLit exp) // runs parseNumber over the literal's text
+    {
+        auto type = parseNumber(exp.name, exp.loc, messages); // result is not used here; problems go to messages
+    }
+
     MessageHandler messages;
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/parser/float_1.d	Sun May 25 15:42:44 2008 +0200
@@ -0,0 +1,8 @@
+
+
+void main()
+{
+    float f1  = 4_.5_e5+4;     // '_' before '.' and before 'e'; the '+' does not follow 'e', so presumably "+4" is lexed separately - confirm
+    float f2  = 4._5_e+344;    // '_' right after '.', explicitly signed exponent
+    float f3  = 4.__5_e-_2;    // repeated '_', and '_' between the exponent sign and its digits
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/parser/int_1.d	Sun May 25 15:42:44 2008 +0200
@@ -0,0 +1,21 @@
+
+int main()
+{
+    int   i1  = 123_456;
+    int   i2  = 1_2_3_4_5_6_;       // '_' after every digit, including a trailing one
+
+    int   i3  = 43_422_253;
+    long  i4  = 34_322_523_123;
+
+    long  i5  = 43_422_253L;
+    long  i6  = 34_322_523_123L;
+
+    uint  i7  = 43_422_253u;
+    ulong i8  = 18_446_744_073_709_551_615U;
+
+    ulong i9  = 0UL;
+    ulong i10 = 18_446_744_073_709_551_615LU;
+
+    ulong i11 = 18_446_744_073_709_551_615_23LU;   // was a second i10; exceeds the 18446744073709551615 limit named by IntegerToLarge
+
+    return 0;
+}
--- a/tests/parser/string_1.d	Sun May 25 14:56:05 2008 +0200
+++ b/tests/parser/string_1.d	Sun May 25 15:42:44 2008 +0200
@@ -29,6 +29,13 @@
     char[6]     s15 = x"61 62 63 64
                         65 66 67 68";
 
+    char[4]     s16 = "\&reg;\&amp;";
+
+    char[4]     s16 = "\&reg;\&amp;"c;
+    wchar[2]    s16 = "\&reg;\&amp;"w;
+    dchar[2]    s16 = "\&reg;\&amp;"d;
+    
+
 
     return 0;
 }