changeset 109:d1f68bfb58ae

merge
author Anders Halager <halager@gmail.com>
date Sun, 25 May 2008 14:46:01 +0200
parents 5e383b3755d6 (current diff) 89db676fbacb (diff)
children 2deb4c1f0d93
files ast/Exp.d basic/Messages.d
diffstat 14 files changed, 573 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/ast/Exp.d	Sun May 25 14:43:16 2008 +0200
+++ b/ast/Exp.d	Sun May 25 14:46:01 2008 +0200
@@ -25,6 +25,7 @@
     AssignExp,
     CallExp,
     CastExp,
+    StringExp,
 }
 
 abstract class Exp
@@ -434,6 +435,17 @@
     Exp exp;
 }
 
+/**
+  Expression node for a string literal.
+
+  The node only stores the text it is constructed with; it does not
+  interpret prefixes or escape sequences itself.
+ */
+class StringExp : Exp
+{
+    /// Text of the literal exactly as passed to the constructor.
+    char[] str;
+
+    /// Creates the node at `loc`, tagged as ExpType.StringExp.
+    this(SLoc loc, char[] str)
+    {
+        super(ExpType.StringExp, loc);
+        this.str = str;
+    }
+}
+
 class PointerIdentifier : Identifier
 {
     this(Identifier pointerOf)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/basic/LiteralParsing.d	Sun May 25 14:46:01 2008 +0200
@@ -0,0 +1,353 @@
+// The module name must match the path basic/LiteralParsing.d; a trailing
+// ".d" would declare a module called `d` inside package basic.LiteralParsing.
+module basic.LiteralParsing;
+
+import basic.SourceLocation,
+       basic.Message;
+
+import tango.io.Stdout,
+       tango.core.BitManip,
+       Integer = tango.text.convert.Integer,
+       tango.text.Util;
+
+/**
+  Tag describing how the bytes of a parsed string are to be viewed.
+
+  printString switches on it to cast String.data to char[], wchar[] or
+  dchar[].
+ */
+enum StructType
+{
+    Char,  // view data as char[]
+    WChar, // view data as wchar[]
+    DChar  // view data as dchar[]
+}
+
+/**
+  Result of interpreting a string literal: the decoded bytes plus a tag
+  saying how they should be viewed.
+ */
+struct String
+{
+    StructType type; // none of the parse functions in this module assign it
+    ubyte[] data;    // decoded contents, delimiters not included
+}
+
+/**
+  Outcome of decoding a single escape sequence.
+ */
+private struct EscapeReturn
+{
+    ubyte[] data; // bytes the escape sequence expands to
+    int length;   // how many source characters the caller advances by
+}
+
+/**
+  Interpret the source text of a string literal.
+
+  The first character of `str` selects the flavour:
+  `r` and a backquote are wysiwyg strings, a double quote is an escaped
+  string and `x` is a hex string. Anything else is reported as
+  InvalidStrPrefix and an empty result is returned.
+
+  Params:
+    str      = literal text including prefix and delimiters
+    loc      = location of the first character of `str`
+    messages = receiver for diagnostics
+
+  Returns: the decoded bytes. The result is also printed via printString.
+ */
+String parseString(char[] str, SourceLocation loc, MessageHandler messages)
+{
+    String result;
+    // Grow, then truncate to zero. Presumably this pre-allocates room for
+    // the appends done by the parse functions -- TODO confirm.
+    result.data.length = str.length;
+    result.data.length = 0;
+
+    char prefix = str[0];
+    if(prefix == 'r')
+        result = parseWysiwygString(str[1..$], result);
+    else if(prefix == '`')
+        result = parseWysiwygString(str, result);
+    else if(prefix == '"')
+        result = parseDoubleQuotedString(str, result, loc, messages);
+    else if(prefix == 'x')
+        result = parseHexString(str[1..$], result, loc + 1, messages);
+    else
+        messages.report(InvalidStrPrefix, loc, loc + 1);
+
+    printString(str, result);
+
+    return result;
+}
+
+/**
+  Decode the body of a hex string (the part after the `x` prefix).
+
+  Every pair of hex digits becomes one byte appended to `strBuf.data`.
+  Whitespace between digits is ignored; any other character is reported
+  as InvalidHexStrChar and skipped.
+
+  Params:
+    str      = text starting at the opening double quote
+    strBuf   = buffer the decoded bytes are appended to
+    loc      = location of str[0]
+    messages = receiver for diagnostics
+
+  Returns: `strBuf` with the decoded bytes appended.
+ */
+String parseHexString(char[] str, String strBuf,
+        SourceLocation loc, MessageHandler messages)
+{
+    char[] hex = "0123456789abcdefABCDEF";
+    // Tab, vertical tab and form feed are whitespace too, not only
+    // space and line breaks.
+    char[] whitespace = " \t\v\f\r\n";
+    char[] hexBuf;
+
+    // Index 0 is the opening quote. The length guard keeps the scan inside
+    // `str` when the closing quote is missing.
+    for(int i = 1; i < str.length && str[i] != '"'; i++)
+    {
+        if(hex.contains(str[i]))
+        {
+            hexBuf ~= str[i];
+            if(hexBuf.length == 2)
+            {
+                // two digits never exceed 0xFF, so the narrowing is lossless
+                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
+                hexBuf.length = 0;
+            }
+        }
+        else if(!whitespace.contains(str[i]))
+            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
+    }
+
+    // NOTE(review): a single leftover digit in hexBuf is dropped without a
+    // diagnostic; there is no message for an odd digit count.
+    return strBuf;
+}
+
+
+/**
+  Decode a double-quoted string, expanding escape sequences.
+
+  Scanning starts after the opening quote and stops at the first
+  unescaped double quote, or when the index runs past the end of `str`.
+
+  Params:
+    str      = text starting at the opening double quote
+    strBuf   = buffer the decoded bytes are appended to
+    loc      = location of str[0]
+    messages = receiver for diagnostics from escape decoding
+
+  Returns: `strBuf` with the decoded bytes appended.
+ */
+String parseDoubleQuotedString(char[] str, String strBuf, 
+        SourceLocation loc, MessageHandler messages)
+{
+    int pos = 1; // str[0] is the opening quote
+
+    while(str[pos] != '"')
+    {
+        if(str[pos] == '\\')
+        {
+            // hand the rest of the text to the escape decoder and skip
+            // as many characters as it reports
+            EscapeReturn esc = parseEscapeSequence(str[pos..$], loc + pos, messages);
+            strBuf.data ~= esc.data;
+            pos += esc.length;
+        }
+        else
+        {
+            strBuf.data ~= str[pos];
+            pos++;
+        }
+
+        if(pos >= str.length)
+            break;
+    }
+
+    return strBuf;
+}
+
+/**
+  Collect the hex digits of a \x, \u or \U escape.
+
+  `str` starts at the backslash, so the digits occupy str[2 .. 2 + digits].
+  One extra character must follow them (the check is on str.length - 1);
+  otherwise StringShortEscape is reported and no digits are read.
+  A non-hex character is reported as StringHexInvalid with its 1-based
+  position and the number of digits the escape takes.
+
+  Params:
+    str      = text starting at the backslash
+    digits   = number of hex digits this escape takes
+    loc      = location of the backslash
+    messages = receiver for diagnostics
+    consumed = set to the number of characters the caller should skip
+
+  Returns: the valid hex digits that were found.
+ */
+private char[] readEscapeHex(char[] str, int digits,
+        SourceLocation loc, MessageHandler messages, out int consumed)
+{
+    char[] hex = "0123456789abcdefABCDEF";
+    char[] hexBuf;
+    int total = digits + 2; // backslash + selector + digits
+
+    if(str.length - 1 >= total)
+    {
+        for(int i = 2; i < total; i++)
+        {
+            if(hex.contains(str[i]))
+                hexBuf ~= str[i];
+            else
+                messages.report(StringHexInvalid, loc + i, loc + i + 1)
+                    .arg(Integer.toString(i-1))
+                    .arg(Integer.toString(digits));
+        }
+        consumed = total;
+    }
+    else
+    {
+        messages.report(StringShortEscape, loc, loc + str.length);
+        consumed = str.length - 1;
+    }
+
+    return hexBuf;
+}
+
+/**
+  Decode the escape sequence at the start of `str`.
+
+  Handles the single-character escapes, \xHH, \uHHHH, \UHHHHHHHH and
+  octal escapes of one to three digits. \u and \U values are checked with
+  isValidUtf8 and encoded with parseToUtf8. An unknown escape is reported
+  as InvalidStrEscape and skipped.
+
+  Params:
+    str      = text starting at the backslash
+    loc      = location of the backslash
+    messages = receiver for diagnostics
+
+  Returns: the bytes the escape expands to and the number of source
+  characters it covers.
+ */
+EscapeReturn parseEscapeSequence(char[] str,
+        SourceLocation loc, MessageHandler messages)
+{
+    EscapeReturn res;
+
+    switch(str[1])
+    {
+        // single-character escapes
+        case '\'': res.length = 2; res.data ~= '\''; break;
+        case '"':  res.length = 2; res.data ~= '\"'; break;
+        case '?':  res.length = 2; res.data ~= '\?'; break;
+        case '\\': res.length = 2; res.data ~= '\\'; break;
+        case 'a':  res.length = 2; res.data ~= '\a'; break;
+        case 'b':  res.length = 2; res.data ~= '\b'; break;
+        case 'f':  res.length = 2; res.data ~= '\f'; break;
+        case 'n':  res.length = 2; res.data ~= '\n'; break;
+        case 'r':  res.length = 2; res.data ~= '\r'; break;
+        case 't':  res.length = 2; res.data ~= '\t'; break;
+        case 'v':  res.length = 2; res.data ~= '\v'; break;
+
+        case 'x':
+            // \xHH: one raw byte
+            char[] hexBuf = readEscapeHex(str, 2, loc, messages, res.length);
+            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
+            break;
+        case 'u':
+            // \uHHHH: code point, stored as UTF-8
+            char[] hexBuf = readEscapeHex(str, 4, loc, messages, res.length);
+            uint code = cast(uint)Integer.toLong(hexBuf, 16);
+            if(!isValidUtf8(code))
+                messages.report(InvalidUtf8Hex, loc, loc+6);
+            else
+                res.data ~= parseToUtf8(code);
+            break;
+        case 'U':
+            // \UHHHHHHHH: code point, stored as UTF-8
+            char[] hexBuf = readEscapeHex(str, 8, loc, messages, res.length);
+            uint code = cast(uint)Integer.toLong(hexBuf, 16);
+            if(!isValidUtf8(code))
+                messages.report(InvalidUtf8Hex, loc, loc+10);
+            else
+                res.data ~= parseToUtf8(code);
+            break;
+
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+            char[] oct = "01234567";
+            char[] octBuf;
+            octBuf ~= str[1];
+            res.length = 2;
+            // up to two more octal digits; never read past the end of str
+            for(int i = 2; i < 4 && i < str.length; i++)
+            {
+                if(!oct.contains(str[i]))
+                    break;
+                octBuf ~= str[i];
+                res.length += 1;
+            }
+
+            // NOTE(review): values above \377 do not fit in one byte and are
+            // truncated here; there is no message to report them with.
+            res.data ~= cast(ubyte)Integer.toLong(octBuf, 8);
+            break;
+
+        default:
+            messages.report(InvalidStrEscape, loc, loc + 2);
+            res.length = 2;
+    }
+
+    return res;
+}
+
+/**
+  Copy the body of a wysiwyg string verbatim.
+
+  str[0] is taken as the delimiter; every following character up to the
+  next occurrence of that delimiter is appended to `strBuf.data` as is.
+
+  Returns: `strBuf` with the copied bytes appended.
+ */
+String parseWysiwygString(char[] str, String strBuf)
+{
+    char delimiter = str[0];
+
+    for(int pos = 1; str[pos] != delimiter; pos++)
+        strBuf.data ~= cast(ubyte)str[pos];
+
+    return strBuf;
+}
+
+/**
+  Encode a code point as a UTF-8 byte sequence.
+
+  Params:
+    i = code point to encode
+
+  Returns: one to four bytes, lead byte first, for values up to 0x10FFFF;
+  null for larger values, which UTF-8 cannot represent.
+ */
+ubyte[] parseToUtf8(uint i)
+{
+    // Each continuation byte is 10xxxxxx and carries six payload bits; the
+    // lead byte carries the remaining high bits under a length marker.
+    if(i <= 0x00007F)
+        return [cast(ubyte)i];
+    else if(i <= 0x0007FF)
+        return [cast(ubyte)(0xC0 | (i >> 6)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+    else if(i <= 0x00FFFF)
+        return [cast(ubyte)(0xE0 | (i >> 12)),
+                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+    else if(i <= 0x10FFFF)
+        return [cast(ubyte)(0xF0 | (i >> 18)),
+                cast(ubyte)(0x80 | ((i >> 12) & 0x3F)),
+                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
+                cast(ubyte)(0x80 | (i & 0x3F))];
+
+    // out of range: give every path a return value
+    return null;
+}
+
+/**
+  Tell whether `i` is a code point that may be encoded as UTF-8.
+
+  Values above 0x10FFFF are rejected, and so are the surrogate code points
+  0xD800 to 0xDFFF, which UTF-8 does not allow.
+ */
+bool isValidUtf8(uint i)
+{
+    if(i >= 0xD800 && i <= 0xDFFF)
+        return false;
+    return i <= 0x10FFFF;
+}
+
+/**
+  Print the literal text followed by its decoded form to Stdout.
+
+  The decoded bytes are cast to char[], wchar[] or dchar[] according to
+  `strBuf.type` before printing.
+ */
+void printString(char[] str, String strBuf)
+{
+    switch(strBuf.type)
+    {
+        case StructType.Char:
+            Stdout(str)(" have become").newline();
+            Stdout(cast(char[])strBuf.data).newline;
+            break;
+        case StructType.WChar:
+            Stdout(str)(" have become").newline();
+            Stdout(cast(wchar[])strBuf.data).newline;
+            break;
+        case StructType.DChar:
+            Stdout(str)(" have become").newline();
+            Stdout(cast(dchar[])strBuf.data).newline;
+            break;
+    }
+}
--- a/basic/Message.d	Sun May 25 14:43:16 2008 +0200
+++ b/basic/Message.d	Sun May 25 14:46:01 2008 +0200
@@ -42,6 +42,13 @@
         return m;
     }
 
+    /**
+      Create a message that spans from location1 to location2, add it to
+      the list of collected messages and return it.
+     */
+    Message report(uint opcode, SLoc location1, SLoc location2)
+    {
+        auto created = new Message(opcode, location1, location2, src_mgr, this);
+        messages ~= created;
+        return created;
+    }
+
     void checkErrors(ExitLevel exitlevel = ExitLevel.Normal)
     {
         if(messages.length == 0)
@@ -90,14 +97,32 @@
         this.msg_handler = msg_handler;
     }
 
+    /**
+      Create a message with an explicit end location.
+
+      The message text and type are looked up in Messages by `opcode`, and
+      haveEnd is set so that `end` is known to be valid.
+     */
+    this(int opcode, SLoc location, SLoc end, SourceManager src_mgr, MessageHandler msg_handler)
+    {
+        this.src_mgr = src_mgr;
+        this.msg_handler = msg_handler;
+
+        this.location = location;
+        this.end = end;
+        this.haveEnd = true;
+
+        auto entry = Messages[opcode];
+        this.args ~= entry.message;
+        this.type = entry.type;
+    }
+
     char[] toString()
     {
         char[256] tmp = void;
         char[] msg = layout(tmp, args);
 
-        Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr));
+        int len = 0;
+        if(!haveEnd)
+        {
+            Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr));
 
-        Token t = l.next;
+            Token t = l.next;
+            len = t.length;
+        }
+        else
+            len = end - location;
         
         if (src_mgr.getRawData(location).length > 0)
             msg = src_mgr.getLocationAsString(location) ~ ": " ~ msg;
@@ -109,7 +134,7 @@
         char[] marks = line.dup;
         marks[] = ' ';
         size_t p = src_mgr.getColumn(location);
-        marks[p .. p + t.length] = '^';
+        marks[p .. p + len] = '^';
 
         msg ~= "\n    ";
         msg ~= line;
@@ -166,7 +191,9 @@
     MessageType type;
 private:
     char[][] args;
-    SLoc location;
+    SLoc location, end;
+    bool haveEnd;
     SourceManager src_mgr;
     MessageHandler msg_handler;
+    Token t;
 }
--- a/basic/Messages.d	Sun May 25 14:43:16 2008 +0200
+++ b/basic/Messages.d	Sun May 25 14:46:01 2008 +0200
@@ -28,6 +28,14 @@
 
     // Imports
     CannotFindModule,
+
+    // Strings
+    InvalidStrPrefix,
+    InvalidStrEscape,
+    InvalidUtf8Hex,
+    InvalidHexStrChar,
+    StringShortEscape,
+    StringHexInvalid,
 }
 
 enum MessageType
@@ -50,11 +58,13 @@
 static this()
 {
     Messages = [
+        // lexing
         UnexpectedEOFBlock  : E(Err, "Unexpected end of file. Unclosed comment block"),
         InvalidSymbol       : E(Err, "Read invalid symbol: '%0'"),
         OnlyOneDotFloating  : E(Err, "Only one '.' is allowed in an floating number"),
         OnlyOneEFloating    : E(Err, "Only one E is allowed in an floating number"),
 
+        // parsing
         UnexpectedTokMulti  : E(Err, "Unexpected token, got %0 expected one of %1"),
         UnexpectedTokSingle : E(Err, "Unexpected token, got %0 expected %1"),
         UnexpectedTok       : E(Err, "Unexpected token %0"),
@@ -68,7 +78,16 @@
         InvalidType         : E(Err, "Invalid type"),
         ExpectedIdAfterPackage : E(Err, "Identifier expected following package"),
 
-        CannotFindModule    : E(Err, "Cannot find module '%0'")
+        // sema
+        CannotFindModule    : E(Err, "Cannot find module '%0'"),
+
+        // string literals
+        InvalidStrPrefix    : E(Err, "Invalid string literal prefix"),
+        InvalidStrEscape    : E(Err, "Invalid escape sequence"),
+        InvalidUtf8Hex      : E(Err, "Invalid Utf8 hex char"),
+        InvalidHexStrChar   : E(Err, "Invalid character in hex string"),
+        StringShortEscape   : E(Err, "String literal is to short for escape sequence"),
+        StringHexInvalid    : E(Err, "Hex escape sequence have invalid digit at position %0 of %1")
     ];
 }
 
--- a/basic/SourceLocation.d	Sun May 25 14:43:16 2008 +0200
+++ b/basic/SourceLocation.d	Sun May 25 14:46:01 2008 +0200
@@ -62,6 +62,12 @@
         return res;
     }
 
+    /// Get the distance between two locations: this `val` minus `loc.val`.
+    int opSub(SourceLocation loc)
+    {
+        int distance = val - loc.val;
+        return distance;
+    }
+
     /// Creates a SourceLocation from a File ID
     static SourceLocation fromFileID(uint fileID)
     {
--- a/dang/compiler.d	Sun May 25 14:43:16 2008 +0200
+++ b/dang/compiler.d	Sun May 25 14:46:01 2008 +0200
@@ -25,6 +25,7 @@
 import sema.Visitor,
        sema.AstAction,
        sema.ScopeBuilder,
+       sema.LiteralInterpreter,
        sema.ScopeCheck,
        sema.TypeCheck;
 
@@ -240,6 +241,8 @@
         postParse(m, src_mgr);*/
     }
 
+    (new LiteralInterpreter(messages)).visit(modules);
+
     (new ScopeBuilder).visit(modules);
     StopWatch watch2;
     watch.start;
--- a/lexer/Lexer.d	Sun May 25 14:43:16 2008 +0200
+++ b/lexer/Lexer.d	Sun May 25 14:46:01 2008 +0200
@@ -37,12 +37,15 @@
         foreach (c; "0123456789")
             charTable[c] = CharType.Number;
 
-        foreach (c; "(){}[];:.,=!<>+-*/%")
+        foreach (c; "(){}[];:.,=!<>+-*/%\"`")
             charTable[c] = CharType.Symbol;
 
         foreach (c; " \n")
             charTable[c] = CharType.Whitespace;
 
+        foreach (c; "'\\")
+            charTable[c] = CharType.Other;
+
         symbolFunctions.length = 256;
 
         symbolFunctions['('] = &openParentheses;
@@ -64,6 +67,8 @@
         symbolFunctions['*'] = &star;
         symbolFunctions['/'] = &slash;
         symbolFunctions['%'] = &percent;
+        symbolFunctions['"'] = &string;
+        symbolFunctions['`'] = &string;
     }
 
     /**
@@ -93,6 +98,8 @@
 
             case CharType.Number:
                 return lexNumber;
+            case CharType.Other:
+                messages.report(UnexpectedTok, Loc(position)).fatal(ExitLevel.Lexer);
         }
     }
 
@@ -201,7 +208,7 @@
     {
         return Token(Tok.Star, Loc(position - 1), 1);
     }
-    Token slash() 
+    Token slash()
     {
         switch(source[position])
         {
@@ -220,7 +227,9 @@
                     ++position;
                     if(source[position-2] == '*')
                         if(source[position-1] == '/')
+                        {
                             return this.next;
+                        }
                 }
                 messages.report(UnexpectedEOFBlock,Loc(position));
 
@@ -258,6 +267,46 @@
     {
         return Token(Tok.Percent, Loc(position - 1), 1);
     }
+
+    /**
+      Lex a string literal and return it as a single Tok.String token.
+
+      Registered as the symbol function for '"' and '`'. The token starts
+      at `start` and runs through the closing delimiter. If the input ends
+      before a closing delimiter is found, a fatal message is reported.
+     */
+    Token string()
+    {
+        // Step back one character; presumably the caller has already moved
+        // past the first character of the literal -- TODO confirm.
+        --position;
+        int start = position;
+        // A letter here is taken to be a prefix; move on to the delimiter.
+        if(getNextChar() == CharType.Letter)
+            position++;
+        // Closing delimiter for the wysiwyg loop; backquote unless r"..." is found.
+        char end = '`';
+        switch(source[position])
+        {
+            case '"':
+                // A '"' directly preceded by 'r' is a wysiwyg string.
+                if(position > 0)
+                    if(source[position-1] == 'r')
+                    {
+                        end = '"';
+                        goto string_wys;
+                    }
+                ++position;
+                while(getNextChar != CharType.EOF)
+                {
+                    ++position;
+                    if (source[position-1] == '"' )
+                        return Token(Tok.String, Loc(start), position - start);
+                    else if (source[position-1] == '\\')
+                        // skip the escaped character so \" does not end the string
+                        position++;
+                }
+                break;
+                case '`':
+string_wys:     
+                // wysiwyg: no escapes, scan straight to the closing delimiter
+                ++position;
+                while(getNextChar != CharType.EOF)
+                {
+                    ++position;
+                    if (source[position-1] == end )
+                        return Token(Tok.String, Loc(start), position - start);
+                }
+                break;
+        }
+        // NOTE(review): UnexpectedEOFBlock's text talks about an unclosed
+        // comment block, which does not fit an unterminated string.
+        messages.report(UnexpectedEOFBlock, Loc(position)).fatal(ExitLevel.Lexer);
+    }
     
     Token lexNumber ()
     {
@@ -321,6 +370,12 @@
     {
         int i = 0;
         bool hasNumber = false;
+        if (source[position+1] == '"' ||
+            source[position+1] == '`')
+        {
+            ++position;
+            return string;
+        }
         while (getNextChar(++i) == CharType.Letter || 
                 getNextChar(i) == CharType.Number)
         {
@@ -385,6 +440,7 @@
     Number,
     Symbol,
     Whitespace,
+    Other,
 
     EOF
 }
--- a/lexer/Token.d	Sun May 25 14:43:16 2008 +0200
+++ b/lexer/Token.d	Sun May 25 14:46:01 2008 +0200
@@ -136,6 +136,8 @@
     Switch, Case, Default,
     Return, Cast,
 
+    String,
+
     Module, Import,
 
 }
@@ -194,6 +196,7 @@
         Tok.Seperator:"Seperator",
         Tok.Cast:"Cast",
         Tok.Module:"Module",
-        Tok.Import:"Import"
+        Tok.Import:"Import",
+        Tok.String:"String"
     ];
 }
--- a/parser/Action.d	Sun May 25 14:43:16 2008 +0200
+++ b/parser/Action.d	Sun May 25 14:46:01 2008 +0200
@@ -292,6 +292,14 @@
     }
 
     /**
+      Called when a string literal is used in an expression.
+
+      This default implementation returns null.
+     */
+    ExprT actOnStringExp(Token t)
+    {
+        return null;
+    }
+
+    /**
       Unary operator.
      */
     ExprT actOnUnaryOp(Token op, ExprT operand)
--- a/parser/Parser.d	Sun May 25 14:43:16 2008 +0200
+++ b/parser/Parser.d	Sun May 25 14:46:01 2008 +0200
@@ -627,6 +627,8 @@
             return parseCast(next);
         else if (next.type == Tok.Integer)
             return action.actOnNumericConstant(next);
+        else if (next.type == Tok.String)
+            return action.actOnStringExp(next);
 
         messages.report(ExpectedExp, next.location)
             .fatal(ExitLevel.Parser);
--- a/sema/AstAction.d	Sun May 25 14:43:16 2008 +0200
+++ b/sema/AstAction.d	Sun May 25 14:46:01 2008 +0200
@@ -173,6 +173,11 @@
         return new IntegerLit(c.location, sm.getText(c.asRange));
     }
 
+    /// Builds a StringExp from the token's location and its source text.
+    override ExprT actOnStringExp(Token s)
+    {
+        auto where = s.location;
+        auto text = sm.getText(s.asRange);
+        return new StringExp(where, text);
+    }
+
     override ExprT actOnIdentifierExp(Id id)
     {
         return identifierFromTok(id.tok);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sema/LiteralInterpreter.d	Sun May 25 14:46:01 2008 +0200
@@ -0,0 +1,27 @@
+module sema.LiteralInterpreter;
+
+import sema.Visitor;
+
+import basic.LiteralParsing,
+       basic.Message;
+
+/**
+  Visitor pass that runs parseString on every StringExp it visits and
+  then lets the message handler check for errors.
+ */
+class LiteralInterpreter : Visitor!(void)
+{
+    /// Receiver for diagnostics produced while parsing literals.
+    MessageHandler messages;
+
+    this(MessageHandler messages)
+    {
+        this.messages = messages;
+    }
+
+    /// Visits all modules, then calls checkErrors on the message handler.
+    void visit(Module[] modules)
+    {
+        super.visit(modules);
+        messages.checkErrors();
+    }
+
+    /// Parses the literal's text; the parsed result is not stored.
+    void visitStringExp(StringExp exp)
+    {
+        parseString(exp.str, exp.loc, messages);
+    }
+}
--- a/sema/Visitor.d	Sun May 25 14:43:16 2008 +0200
+++ b/sema/Visitor.d	Sun May 25 14:46:01 2008 +0200
@@ -97,6 +97,8 @@
                 return visitPointerIdentifier(cast(PointerIdentifier)exp);
             case ExpType.ArrayIdentifier:
                 return visitArrayIdentifier(cast(ArrayIdentifier)exp);
+            case ExpType.StringExp:
+                return visitStringExp(cast(StringExp)exp);
             case ExpType.Index:
                 return visitIndexExp(cast(IndexExp)exp);
             case ExpType.MemberReference:
@@ -314,6 +316,14 @@
             return ExpT.init;
     }
 
+    /// Default handler for string expressions: does nothing and returns
+    /// ExpT.init (or nothing at all when ExpT is void).
+    ExpT visitStringExp(StringExp exp)
+    {
+        static if (is(ExpT == void))
+            return;
+        else
+            return ExpT.init;
+    }
+
     ExpT visitIdentifier(Identifier exp)
     {
         static if (is(ExpT == void))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/parser/string_1.d	Sun May 25 14:46:01 2008 +0200
@@ -0,0 +1,34 @@
+
+int main()
+{
+    /* All examples taken from D's Language site */
+
+    char[4]     s1  = "food";
+
+    char[5]     s2  = r"hello";
+    char[15]    s3  = r"c:\root\foo.exe";
+    char[4]     s4  = r"ab\n";
+
+    char[5]     s5  = `hello`;
+    char[15]    s6  = `c:\root\foo.exe`;
+    char[4]     s7  = `ab\n`;
+
+    char[5]     s10 = "hello";
+    char[15]    s11 = "c:\\root\\foo.exe";
+    char[3]     s12 = "ab\n";
+    char[3]     s13 = "ab
+";
+
+    char[1]     s14 = x"0A";
+    char[6]     s15 = x"00 FBCD 32FD 0A";
+
+    /* And some custom ones */
+
+    /* 1 byte + 2 bytes + 4 bytes once encoded as UTF-8 */
+    char[7]     s16 = "\x61\u05D0\U000201A4";
+    char[2]     s17 = "\122\522";
+    /* eight hex pairs spread over two lines; needs its own name, s15 is taken */
+    char[8]     s18 = x"61 62 63 64
+                        65 66 67 68";
+
+
+    return 0;
+}