changeset 106:89db676fbacb

Now able to understand strings.
author Anders Johnsen <skabet@gmail.com>
date Thu, 22 May 2008 12:09:11 +0200
parents f1282c5fe8e3
children d1f68bfb58ae
files basic/LiteralParsing.d basic/Message.d basic/Messages.d basic/SourceLocation.d dang/compiler.d sema/LiteralInterpreter.d sema/Visitor.d tests/parser/string_1.d
diffstat 8 files changed, 453 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/basic/LiteralParsing.d	Thu May 22 12:09:11 2008 +0200
@@ -0,0 +1,353 @@
+/// Decoding of string literals (wysiwyg, double-quoted and hex strings)
+/// into raw bytes.
+// The module name must not carry the ".d" file extension: importers refer
+// to this module as basic.LiteralParsing.
+module basic.LiteralParsing;
+
+import basic.SourceLocation,
+       basic.Message;
+
+import tango.io.Stdout,
+       tango.core.BitManip,
+       Integer = tango.text.convert.Integer,
+       tango.text.Util;
+
+enum StructType // character width tag stored in String.type
+{
+    Char,   // printString casts the data to char[]
+    WChar,  // printString casts the data to wchar[]
+    DChar   // printString casts the data to dchar[]
+}
+
+struct String // result of decoding one string literal
+{
+    StructType type; // not assigned by any parse function in this module — NOTE(review): confirm who is meant to set it
+    ubyte[] data;    // decoded contents of the literal as raw bytes
+}
+
+private struct EscapeReturn // what parseEscapeSequence returns to its caller
+{
+    ubyte[] data; // bytes the escape sequence decodes to
+    int length;   // source chars consumed, counted from the backslash; the caller advances by this much
+}
+
+/**
+ * Decode the string literal `str` into raw bytes.
+ *
+ * The first character selects the kind of literal: `r` or a backtick
+ * starts a wysiwyg string, `"` a double-quoted string and `x` a hex
+ * string. Any other first character is reported as InvalidStrPrefix.
+ *
+ * Params:
+ *   str      = the literal text, beginning with its prefix or opening quote
+ *   loc      = location used for diagnostics (presumably that of str[0])
+ *   messages = handler that receives the diagnostics
+ * Returns: the decoded literal; its data is empty when the prefix is invalid.
+ */
+String parseString(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    String result;
+    // Grow, then truncate: presumably meant to keep room for str.length
+    // bytes while starting from an empty buffer.
+    result.data.length = str.length;
+    result.data.length = 0;
+
+    char prefix = str[0];
+    if(prefix == 'r')
+        result = parseWysiwygString(str[1..$], result);
+    else if(prefix == '`')
+        result = parseWysiwygString(str, result);
+    else if(prefix == '"')
+        result = parseDoubleQuotedString(str, result, loc, messages);
+    else if(prefix == 'x')
+        result = parseHexString(str[1..$], result, loc + 1, messages);
+    else
+        messages.report(InvalidStrPrefix, loc, loc + 1);
+
+    // Echo the literal and its decoded form on stdout.
+    printString(str, result);
+
+    return result;
+}
+
+/**
+ * Decode the body of a hex string literal.
+ *
+ * Every pair of hex digits yields one byte. Whitespace between digits is
+ * skipped; any other character is reported as InvalidHexStrChar.
+ *
+ * Params:
+ *   str      = the literal starting at its opening quote (parseString
+ *              strips the `x` prefix before calling)
+ *   strBuf   = buffer the decoded bytes are appended to
+ *   loc      = location that offsets into str are added to for diagnostics
+ *   messages = handler that receives the diagnostics
+ * Returns: strBuf with the decoded bytes appended.
+ */
+String parseHexString(char[] str, String strBuf,
+        SourceLocation loc, MessageHandler messages)
+{
+    char[] hex = "0123456789abcdefABCDEF";
+    // Tab, vertical tab and form feed are whitespace as well; without them
+    // a tab-indented continuation line was reported as an invalid character.
+    char[] whitespace = " \t\v\f\r\n";
+    char[] hexBuf;
+
+    // Start at 1 because str[0] is the opening quote. The length test stops
+    // the scan at the end of the input if the closing quote is missing.
+    for(int i = 1; i < str.length && str[i] != '"'; i++)
+    {
+        if(hex.contains(str[i]))
+        {
+            hexBuf ~= str[i];
+            if(hexBuf.length == 2)
+            {
+                // Two hex digits never exceed 0xFF, so the cast is lossless.
+                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
+                hexBuf.length = 0;
+            }
+        }
+        else if(!whitespace.contains(str[i]))
+            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
+    }
+
+    // NOTE(review): an unpaired trailing digit left in hexBuf is dropped
+    // without a diagnostic; none of the string messages describes that case.
+    return strBuf;
+}
+
+
+/**
+ * Decode the body of a double-quoted string literal.
+ *
+ * Characters are copied as-is until the closing quote, except that each
+ * backslash hands control to parseEscapeSequence, whose decoded bytes are
+ * appended and whose reported length is skipped in the source.
+ *
+ * Params:
+ *   str      = the literal starting at its opening quote
+ *   strBuf   = buffer the decoded bytes are appended to
+ *   loc      = location that offsets into str are added to for diagnostics
+ *   messages = handler that receives the diagnostics
+ * Returns: strBuf with the decoded bytes appended.
+ */
+String parseDoubleQuotedString(char[] str, String strBuf,
+        SourceLocation loc, MessageHandler messages)
+{
+    int i = 1; // str[0] is the opening quote
+
+    // The bound is tested before str[i] is read. The previous check ran
+    // only after the loop body, so it never protected the first read.
+    while(i < str.length && str[i] != '"')
+    {
+        if(str[i] == '\\')
+        {
+            EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages);
+            strBuf.data ~= res.data;
+            i += res.length;
+        }
+        else
+        {
+            strBuf.data ~= cast(ubyte)str[i];
+            i++;
+        }
+    }
+
+    return strBuf;
+}
+
+/**
+ * Decode a single escape sequence.
+ *
+ * Handles the single-character escapes, \xHH (one raw byte), \uHHHH and
+ * \UHHHHHHHH (a code point, appended UTF-8 encoded) and octal escapes of
+ * one to three digits. An unknown selector is reported as InvalidStrEscape.
+ *
+ * Params:
+ *   str      = the input starting at the backslash and running to the end
+ *              of the literal
+ *   loc      = location of the backslash, used for diagnostics
+ *   messages = handler that receives the diagnostics
+ * Returns: the decoded bytes and the number of source characters consumed.
+ */
+EscapeReturn parseEscapeSequence(char[] str,
+        SourceLocation loc, MessageHandler messages)
+{
+    EscapeReturn res;
+
+    // A backslash that is the last character has no selector to inspect.
+    if(str.length < 2)
+    {
+        messages.report(StringShortEscape, loc, loc + str.length);
+        res.length = 1;
+        return res;
+    }
+
+    // Backslash plus selector; the hex and octal forms adjust this below.
+    res.length = 2;
+
+    switch(str[1])
+    {
+        case '\'': res.data ~= '\''; break;
+        case '"':  res.data ~= '\"'; break;
+        case '?':  res.data ~= '\?'; break;
+        case '\\': res.data ~= '\\'; break;
+        case 'a':  res.data ~= '\a'; break;
+        case 'b':  res.data ~= '\b'; break;
+        case 'f':  res.data ~= '\f'; break;
+        case 'n':  res.data ~= '\n'; break;
+        case 'r':  res.data ~= '\r'; break;
+        case 't':  res.data ~= '\t'; break;
+        case 'v':  res.data ~= '\v'; break;
+        case 'x':
+            // \xHH is a raw byte, so it is appended without UTF-8 encoding.
+            int consumed;
+            char[] hexBuf = readHexDigits(str, 2, consumed, loc, messages);
+            res.length = consumed;
+            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
+            break;
+        case 'u':
+        case 'U':
+            int digits = (str[1] == 'u') ? 4 : 8;
+            int consumed;
+            char[] hexBuf = readHexDigits(str, digits, consumed, loc, messages);
+            res.length = consumed;
+            uint code = cast(uint)Integer.toLong(hexBuf, 16);
+            if(!isValidUtf8(code))
+                messages.report(InvalidUtf8Hex, loc, loc + (digits + 2));
+            else
+                res.data ~= parseToUtf8(code);
+            break;
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+            char[] oct = "01234567";
+            char[] octBuf;
+            octBuf ~= str[1];
+            // Up to two further octal digits; stop at the first non-octal
+            // character or at the end of the input.
+            for(int i = 2; i < 4 && i < str.length; i++)
+            {
+                if(!oct.contains(str[i]))
+                    break;
+                octBuf ~= str[i];
+                res.length += 1;
+            }
+            // Values above 0377 are cut down to their low byte, as before.
+            // NOTE(review): that probably deserves a diagnostic of its own.
+            res.data ~= cast(ubyte)Integer.toLong(octBuf, 8);
+            break;
+        default:
+            messages.report(InvalidStrEscape, loc, loc + 2);
+    }
+
+    return res;
+}
+
+/// Collect the `digits` hex digits following the two-character prefix of a
+/// \x, \u or \U escape. Sets `consumed` to the number of source characters
+/// the escape takes up and returns the digits that were valid.
+private char[] readHexDigits(char[] str, int digits, out int consumed,
+        SourceLocation loc, MessageHandler messages)
+{
+    char[] hex = "0123456789abcdefABCDEF";
+    char[] hexBuf;
+    int end = digits + 2; // index just past the last expected digit
+
+    // At least one character must remain after the digits.
+    if(str.length - 1 >= end)
+    {
+        for(int i = 2; i < end; i++)
+            if(hex.contains(str[i]))
+                hexBuf ~= str[i];
+            else
+                // "position %0 of %1": %1 is the digit count of this escape
+                // form (2, 4 or 8), not the length of the whole sequence.
+                messages.report(StringHexInvalid, loc + i, loc + i + 1)
+                    .arg(Integer.toString(i-1))
+                    .arg(Integer.toString(digits));
+        consumed = end;
+    }
+    else
+    {
+        messages.report(StringShortEscape, loc, loc + str.length);
+        consumed = cast(int)(str.length - 1);
+    }
+
+    return hexBuf;
+}
+
+/**
+ * Copy the contents of a wysiwyg string verbatim.
+ *
+ * str[0] is taken as the delimiter. Every character after it, up to but
+ * not including the next occurrence of that delimiter, is appended to
+ * strBuf unchanged.
+ *
+ * Returns: strBuf with the copied bytes appended.
+ */
+String parseWysiwygString(char[] str, String strBuf)
+{
+    char delimiter = str[0];
+
+    for(int pos = 1; str[pos] != delimiter; pos++)
+        strBuf.data ~= cast(ubyte)str[pos];
+
+    return strBuf;
+}
+
+/**
+ * Encode the code point `i` as UTF-8.
+ *
+ * Params:
+ *   i = code point to encode
+ * Returns: one to four bytes, lead byte first; an empty array when `i` is
+ *          above 0x10FFFF. Surrogates are not rejected here; isValidUtf8
+ *          is the gate for that.
+ */
+ubyte[] parseToUtf8(uint i)
+{
+    // Plain shifts and masks replace bts() on a uint* aimed at a ubyte
+    // local, which read and wrote three bytes beyond the variable.
+    if(i <= 0x00007F)
+        return [cast(ubyte)i];
+    else if(i <= 0x0007FF)
+        return [cast(ubyte)(0xC0 | (i >> 6)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+    else if(i <= 0x00FFFF)
+        return [cast(ubyte)(0xE0 | (i >> 12)),
+                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+    else if(i <= 0x10FFFF)
+        return [cast(ubyte)(0xF0 | (i >> 18)),
+                cast(ubyte)(0x80 | ((i >> 12) & 0x3F)),
+                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+
+    // Out of range: previously control fell off the end of the function.
+    return null;
+}
+
+/**
+ * Tell whether `i` is a code point that may be encoded as UTF-8.
+ *
+ * Returns: true for values up to 0x10FFFF that are not surrogates.
+ */
+bool isValidUtf8(uint i)
+{
+    // U+D800..U+DFFF are reserved for UTF-16 surrogate pairs and must not
+    // appear as code points of their own.
+    if(i >= 0xD800 && i <= 0xDFFF)
+        return false;
+    return i <= 0x10FFFF;
+}
+
+/**
+ * Write the literal `str` and its decoded form to Stdout.
+ *
+ * The decoded bytes are reinterpreted as char[], wchar[] or dchar[]
+ * according to strBuf.type before being printed.
+ */
+void printString(char[] str, String strBuf)
+{
+    // The unused local `s` has been dropped.
+    switch(strBuf.type)
+    {
+        case StructType.Char:
+            Stdout(str)(" have become").newline()
+                (cast(char[])strBuf.data).newline;
+            break;
+        case StructType.WChar:
+            Stdout(str)(" have become").newline()
+                (cast(wchar[])strBuf.data).newline;
+            break;
+        case StructType.DChar:
+            Stdout(str)(" have become").newline()
+                (cast(dchar[])strBuf.data).newline;
+            break;
+    }
+}
--- a/basic/Message.d	Wed May 21 21:11:55 2008 +0200
+++ b/basic/Message.d	Thu May 22 12:09:11 2008 +0200
@@ -42,6 +42,13 @@
         return m;
     }
 
+    Message report(uint opcode, SLoc location1, SLoc location2) // report a message covering the range location1 .. location2
+    {
+        Message m = new Message(opcode, location1, location2, src_mgr, this);
+        messages ~= m; // queue it on this handler
+        return m; // returned so callers can chain .arg(...) onto it
+    }
+
     void checkErrors(ExitLevel exitlevel = ExitLevel.Normal)
     {
         if(messages.length == 0)
@@ -90,14 +97,32 @@
         this.msg_handler = msg_handler;
     }
 
+    this(int opcode, SLoc location, SLoc end, SourceManager src_mgr, MessageHandler msg_handler) // message with an explicit end location
+    {
+        this.src_mgr = src_mgr;
+        this.location = location;
+        this.end = end;
+        args ~= Messages[opcode].message; // the message text for this opcode is appended to args
+        this.type = Messages[opcode].type;
+        this.msg_handler = msg_handler;
+        haveEnd = true; // toString then marks end - location columns instead of lexing a token for the length
+    }
+
     char[] toString()
     {
         char[256] tmp = void;
         char[] msg = layout(tmp, args);
 
-        Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr));
+        int len = 0;
+        if(!haveEnd)
+        {
+            Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr));
 
-        Token t = l.next;
+            Token t = l.next;
+            len = t.length;
+        }
+        else
+            len = end - location;
         
         if (src_mgr.getRawData(location).length > 0)
             msg = src_mgr.getLocationAsString(location) ~ ": " ~ msg;
@@ -109,7 +134,7 @@
         char[] marks = line.dup;
         marks[] = ' ';
         size_t p = src_mgr.getColumn(location);
-        marks[p .. p + t.length] = '^';
+        marks[p .. p + len] = '^';
 
         msg ~= "\n    ";
         msg ~= line;
@@ -166,7 +191,9 @@
     MessageType type;
 private:
     char[][] args;
-    SLoc location;
+    SLoc location, end;
+    bool haveEnd;
     SourceManager src_mgr;
     MessageHandler msg_handler;
+    Token t;
 }
--- a/basic/Messages.d	Wed May 21 21:11:55 2008 +0200
+++ b/basic/Messages.d	Thu May 22 12:09:11 2008 +0200
@@ -28,6 +28,14 @@
 
     // Imports
     CannotFindModule,
+
+    // Strings
+    InvalidStrPrefix,
+    InvalidStrEscape,
+    InvalidUtf8Hex,
+    InvalidHexStrChar,
+    StringShortEscape,
+    StringHexInvalid,
 }
 
 enum MessageType
@@ -67,7 +75,13 @@
         InvalidDeclType     : E(Err, "Invalid declaration type"),
         InvalidType         : E(Err, "Invalid type"),
         ExpectedIdAfterPackage : E(Err, "Identifier expected following package"),
-        CannotFindModule    : E(Err, "Cannot find module '%0'")
+        CannotFindModule    : E(Err, "Cannot find module '%0'"),
+        InvalidStrPrefix    : E(Err, "Invalid string literal prefix"),
+        InvalidStrEscape    : E(Err, "Invalid escape sequence"),
+        InvalidUtf8Hex      : E(Err, "Invalid Utf8 hex char"),
+        InvalidHexStrChar   : E(Err, "Invalid character in hex string"),
+        StringShortEscape   : E(Err, "String literal is to short for escape sequence"),
+        StringHexInvalid    : E(Err, "Hex escape sequence have invalid digit at position %0 of %1")
     ];
 }
 
--- a/basic/SourceLocation.d	Wed May 21 21:11:55 2008 +0200
+++ b/basic/SourceLocation.d	Thu May 22 12:09:11 2008 +0200
@@ -62,6 +62,12 @@
         return res;
     }
 
+    /// Distance from `loc` to this location: the difference of their `val` fields
+    int opSub(SourceLocation loc)
+    {
+        return val - loc.val;
+    }
+
     /// Creates a SourceLocation from a File ID
     static SourceLocation fromFileID(uint fileID)
     {
--- a/dang/compiler.d	Wed May 21 21:11:55 2008 +0200
+++ b/dang/compiler.d	Thu May 22 12:09:11 2008 +0200
@@ -25,6 +25,7 @@
 import sema.Visitor,
        sema.AstAction,
        sema.ScopeBuilder,
+       sema.LiteralInterpreter,
        sema.ScopeCheck,
        sema.TypeCheck;
 
@@ -240,6 +241,8 @@
         postParse(m, src_mgr);*/
     }
 
+    (new LiteralInterpreter(messages)).visit(modules);
+
     (new ScopeBuilder).visit(modules);
     StopWatch watch2;
     watch.start;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sema/LiteralInterpreter.d	Thu May 22 12:09:11 2008 +0200
@@ -0,0 +1,27 @@
+module sema.LiteralInterpreter;
+
+import sema.Visitor;
+
+import basic.LiteralParsing,
+       basic.Message;
+
+/**
+ * Visitor pass that runs every string expression through parseString, so
+ * that malformed literals are reported to `messages`.
+ */
+class LiteralInterpreter : Visitor!(void)
+{
+    /// Handler that receives the diagnostics produced while parsing literals.
+    MessageHandler messages;
+
+    this(MessageHandler messages)
+    {
+        this.messages = messages;
+    }
+
+    /// Visit all modules, then let the handler check for reported errors.
+    void visit(Module[] modules)
+    {
+        super.visit(modules);
+        messages.checkErrors();
+    }
+
+    /// Parse the literal's source text; the decoded result is not used yet.
+    void visitStringExp(StringExp exp)
+    {
+        auto parsed = parseString(exp.str, exp.loc, messages);
+    }
+}
--- a/sema/Visitor.d	Wed May 21 21:11:55 2008 +0200
+++ b/sema/Visitor.d	Thu May 22 12:09:11 2008 +0200
@@ -97,6 +97,8 @@
                 return visitPointerIdentifier(cast(PointerIdentifier)exp);
             case ExpType.ArrayIdentifier:
                 return visitArrayIdentifier(cast(ArrayIdentifier)exp);
+            case ExpType.StringExp:
+                return visitStringExp(cast(StringExp)exp);
             case ExpType.Index:
                 return visitIndexExp(cast(IndexExp)exp);
             case ExpType.MemberReference:
@@ -314,6 +316,14 @@
             return ExpT.init;
     }
 
+    ExpT visitStringExp(StringExp exp) // default handler for string expressions: does nothing with exp
+    {
+        static if (is(ExpT == void))
+            return;
+        else
+            return ExpT.init; // non-void visitors get ExpT's default value
+    }
+
     ExpT visitIdentifier(Identifier exp)
     {
         static if (is(ExpT == void))
--- a/tests/parser/string_1.d	Wed May 21 21:11:55 2008 +0200
+++ b/tests/parser/string_1.d	Thu May 22 12:09:11 2008 +0200
@@ -12,7 +12,6 @@
     char[5]     s5  = `hello`;
     char[15]    s6  = `c:\root\foo.exe`;
     char[4]     s7  = `ab\n`;
-    char[4]     s9  = `abn\`;
 
     char[5]     s10 = "hello";
     char[15]    s11 = "c:\\root\\foo.exe";
@@ -23,5 +22,13 @@
     char[1]     s14 = x"0A";
     char[6]     s15 = x"00 FBCD 32FD 0A";
 
+    /* And some custom ones */
+
+    char[8]     s16 = "\x61\u05D0\U000201A4";
+    char[2]     s17 = "\122\522";
+    char[6]     s15 = x"61 62 63 64
+                        65 66 67 68";
+
+
     return 0;
 }