# HG changeset patch # User Anders Halager # Date 1211719561 -7200 # Node ID d1f68bfb58ae540e2e1e23a76d0662fcb802acda # Parent 5e383b3755d6e2514323e2a2b719570a56ade2d7# Parent 89db676fbacbec5e95dfe4072d14107cdfed8cc6 merge diff -r 5e383b3755d6 -r d1f68bfb58ae ast/Exp.d --- a/ast/Exp.d Sun May 25 14:43:16 2008 +0200 +++ b/ast/Exp.d Sun May 25 14:46:01 2008 +0200 @@ -25,6 +25,7 @@ AssignExp, CallExp, CastExp, + StringExp, } abstract class Exp @@ -434,6 +435,17 @@ Exp exp; } +class StringExp : Exp +{ + this(SLoc loc, char[] str) + { + super(ExpType.StringExp, loc); + this.str = str; + } + + char[] str; +} + class PointerIdentifier : Identifier { this(Identifier pointerOf) diff -r 5e383b3755d6 -r d1f68bfb58ae basic/LiteralParsing.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/basic/LiteralParsing.d Sun May 25 14:46:01 2008 +0200 @@ -0,0 +1,353 @@ +module basic.LiteralParsing.d; + +import basic.SourceLocation, + basic.Message; + +import tango.io.Stdout, + tango.core.BitManip, + Integer = tango.text.convert.Integer, + tango.text.Util; + +enum StructType +{ + Char, + WChar, + DChar +} + +struct String +{ + StructType type; + ubyte[] data; +} + +private struct EscapeReturn +{ + ubyte[] data; + int length; +} + +String parseString(char[] str, SourceLocation loc, MessageHandler messages) +{ + String strBuf; + strBuf.data.length = str.length; + strBuf.data.length = 0; + + switch(str[0]) + { + case 'r': + strBuf = parseWysiwygString(str[1..$], strBuf); + break; + case '`': + strBuf = parseWysiwygString(str, strBuf); + break; + case '"': + strBuf = parseDoubleQuotedString(str, strBuf, loc, messages); + break; + case 'x': + strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages); + break; + default: + messages.report(InvalidStrPrefix, loc, loc + 1); + + } + + printString(str, strBuf); + + return strBuf; +} + +String parseHexString(char[] str, String strBuf, + SourceLocation loc, MessageHandler messages) +{ + int i = 1; // first char is " + char[] hex = "0123456789abcdefABCDEF"; + char[] whitespace = "\r\n "; + char[] hexBuf; + + while(str[i] != '"') + { + if(hex.contains(str[i])) + { + hexBuf ~= str[i]; + if(hexBuf.length == 2) + { + strBuf.data ~= Integer.toInt(hexBuf, 16); + hexBuf.length = 0; + } + } + else if(whitespace.contains(str[i])) + {} + else + messages.report(InvalidHexStrChar, loc + i, loc + i + 1); + + i++; + } + + + + return strBuf; +} + + +String parseDoubleQuotedString(char[] str, String strBuf, + SourceLocation loc, MessageHandler messages) +{ + int i = 1; // first char is " + + while(str[i] != '"') + { + switch(str[i]) + { + case '\\': // EscapeSequence + EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages); + strBuf.data ~= res.data; + i += res.length; + break; + default: + strBuf.data ~= str[i]; + i++; + } + if(i >= str.length) + break; + } + + return strBuf; +} + +EscapeReturn parseEscapeSequence(char[] str, + SourceLocation loc, MessageHandler messages) +{ + EscapeReturn res; + + switch(str[1]) + { + case '\'': + res.length = 2; + res.data ~= '\''; + break; + case '"': + res.length = 2; + res.data ~= '\"'; + break; + case '?': + res.length = 2; + res.data ~= '\?'; + break; + case '\\': + res.length = 2; + res.data ~= '\\'; + break; + case 'a': + res.length = 2; + res.data ~= '\a'; + break; + case 'b': + res.length = 2; + res.data ~= '\b'; + break; + case 'f': + res.length = 2; + res.data ~= '\f'; + break; + case 'n': + res.length = 2; + res.data ~= '\n'; + break; + case 'r': + res.length = 2; + res.data ~= '\r'; + break; + case 't': + res.length = 2; + res.data ~= '\t'; + break; + case 'v': + res.length = 2; + res.data ~= '\v'; + break; + case 'x': + char[] hex = "0123456789abcdefABCDEF"; + char[] hexBuf; + if(str.length - 1 >= 4) + { + for(int i = 2; i < 4; i++) + if(hex.contains(str[i])) + hexBuf ~= str[i]; + else + messages.report(StringHexInvalid, loc + i, loc + i + 1) + .arg(Integer.toString(i-1)) + .arg(Integer.toString(2)); + res.length = 4; + } + else + { + messages.report(StringShortEscape, loc, loc + str.length); + res.length = str.length - 1; + } + res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16); + break; + case 'u': + char[] hex = "0123456789abcdefABCDEF"; + char[] hexBuf; + if(str.length - 1 >= 6) + { + for(int i = 2; i < 6; i++) + if(hex.contains(str[i])) + hexBuf ~= str[i]; + else + messages.report(StringHexInvalid, loc + i, loc + i + 1) + .arg(Integer.toString(i-1)) + .arg(Integer.toString(6)); + res.length = 6; + } + else + { + messages.report(StringShortEscape, loc, loc + str.length); + res.length = str.length - 1; + } + uint i = Integer.toLong(hexBuf, 16); + if(!isValidUtf8(i)) + messages.report(InvalidUtf8Hex, loc, loc+6); + else + res.data ~= parseToUtf8(i); + break; + case 'U': + char[] hex = "0123456789abcdefABCDEF"; + char[] hexBuf; + if(str.length - 1 >= 10) + { + for(int i = 2; i < 10; i++) + if(hex.contains(str[i])) + hexBuf ~= str[i]; + else + messages.report(StringHexInvalid, loc + i, loc + i + 1) + .arg(Integer.toString(i-1)) + .arg(Integer.toString(10)); + res.length = 10; + } + else + { + messages.report(StringShortEscape, loc, loc + str.length); + res.length = str.length - 1; + } + uint i = Integer.toLong(hexBuf, 16); + if(!isValidUtf8(i)) + messages.report(InvalidUtf8Hex, loc, loc+10); + else + res.data ~= parseToUtf8(i); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + char[] oct = "01234567"; + char[] octBuf; + octBuf ~= str[1]; + res.length = 2; + for(int i = 2; i < 4; i++) + if(oct.contains(str[i])) + { + octBuf ~= str[i]; + res.length += 1; + } + else + break; + + uint i = Integer.toLong(octBuf, 8); + res.data ~= i; + break; + default: + messages.report(InvalidStrEscape, loc, loc + 2); + res.length += 2; + } + + return res; +} + +String parseWysiwygString(char[] str, String strBuf) +{ + char start = str[0]; + + int i = 1; + + while(str[i] != start) + { + strBuf.data ~= cast(ubyte)str[i]; + i++; + } + return strBuf; +} + +ubyte[] parseToUtf8(uint i) +{ + if(i <= 0x00007F) + return [cast(ubyte)i]; + else if(i <= 0x0007FF) + { + ubyte a = (i << 26) >> 26; + bts(cast(uint*)&a, 7); + ubyte b = (i << 19) >> 25; + bts(cast(uint*)&b, 7); + bts(cast(uint*)&b, 6); + return [b,a]; + } + else if(i <= 0x00FFFF) + { + ubyte a = (i << 26) >> 26; + bts(cast(uint*)&a, 7); + ubyte b = (i << 20) >> 26; + bts(cast(uint*)&b, 7); + ubyte c = (i << 16) >> 28; + bts(cast(uint*)&c, 7); + bts(cast(uint*)&c, 6); + bts(cast(uint*)&c, 5); + return [c,b,a]; + } + else if(i <= 0x10FFFF) + { + ubyte a = (i << 26) >> 26; + bts(cast(uint*)&a, 7); + ubyte b = (i << 20) >> 26; + bts(cast(uint*)&b, 7); + ubyte c = (i << 14) >> 26; + bts(cast(uint*)&c, 7); + ubyte d = (i << 11) >> 29; + bts(cast(uint*)&d, 7); + bts(cast(uint*)&d, 6); + bts(cast(uint*)&d, 5); + bts(cast(uint*)&d, 4); + return [d,c,b,a]; + } +} + +bool isValidUtf8(uint i) +{ + if(i <= 0x10FFFF) + return true; + return false; +} + +void printString(char[] str, String strBuf) +{ + char[] s; + switch(strBuf.type) + { + case StructType.Char: + Stdout(str)(" have become").newline() + (cast(char[])strBuf.data).newline; + break; + case StructType.WChar: + Stdout(str)(" have become").newline() + (cast(wchar[])strBuf.data).newline; + break; + case StructType.DChar: + Stdout(str)(" have become").newline() + (cast(dchar[])strBuf.data).newline; + break; + } +} diff -r 5e383b3755d6 -r d1f68bfb58ae basic/Message.d --- a/basic/Message.d Sun May 25 14:43:16 2008 +0200 +++ b/basic/Message.d Sun May 25 14:46:01 2008 +0200 @@ -42,6 +42,13 @@ return m; } + Message report(uint opcode, SLoc location1, SLoc location2) + { + Message m = new Message(opcode, location1, location2, src_mgr, this); + messages ~= m; + return m; + } + void checkErrors(ExitLevel exitlevel = ExitLevel.Normal) { if(messages.length == 0) @@ -90,14 +97,32 @@ this.msg_handler = msg_handler; } + this(int opcode, SLoc location, SLoc end, SourceManager src_mgr, MessageHandler msg_handler) + { + this.src_mgr = src_mgr; + this.location = location; + this.end = end; + args ~= Messages[opcode].message; + this.type = Messages[opcode].type; + this.msg_handler = msg_handler; + haveEnd = true; + } + char[] toString() { char[256] tmp = void; char[] msg = layout(tmp, args); - Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr)); + int len = 0; + if(!haveEnd) + { + Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr)); - Token t = l.next; + Token t = l.next; + len = t.length; + } + else + len = end - location; if (src_mgr.getRawData(location).length > 0) msg = src_mgr.getLocationAsString(location) ~ ": " ~ msg; @@ -109,7 +134,7 @@ char[] marks = line.dup; marks[] = ' '; size_t p = src_mgr.getColumn(location); - marks[p .. p + t.length] = '^'; + marks[p .. p + len] = '^'; msg ~= "\n "; msg ~= line; @@ -166,7 +191,9 @@ MessageType type; private: char[][] args; - SLoc location; + SLoc location, end; + bool haveEnd; SourceManager src_mgr; MessageHandler msg_handler; + Token t; } diff -r 5e383b3755d6 -r d1f68bfb58ae basic/Messages.d --- a/basic/Messages.d Sun May 25 14:43:16 2008 +0200 +++ b/basic/Messages.d Sun May 25 14:46:01 2008 +0200 @@ -28,6 +28,14 @@ // Imports CannotFindModule, + + // Strings + InvalidStrPrefix, + InvalidStrEscape, + InvalidUtf8Hex, + InvalidHexStrChar, + StringShortEscape, + StringHexInvalid, } enum MessageType @@ -50,11 +58,13 @@ static this() { Messages = [ + // lexing UnexpectedEOFBlock : E(Err, "Unexpected end of file. Unclosed comment block"), InvalidSymbol : E(Err, "Read invalid symbol: '%0'"), OnlyOneDotFloating : E(Err, "Only one '.' is allowed in an floating number"), OnlyOneEFloating : E(Err, "Only one E is allowed in an floating number"), + // parsing UnexpectedTokMulti : E(Err, "Unexpected token, got %0 expected one of %1"), UnexpectedTokSingle : E(Err, "Unexpected token, got %0 expected %1"), UnexpectedTok : E(Err, "Unexpected token %0"), @@ -68,7 +78,16 @@ InvalidType : E(Err, "Invalid type"), ExpectedIdAfterPackage : E(Err, "Identifier expected following package"), - CannotFindModule : E(Err, "Cannot find module '%0'") + // sema + CannotFindModule : E(Err, "Cannot find module '%0'"), + + // + InvalidStrPrefix : E(Err, "Invalid string literal prefix"), + InvalidStrEscape : E(Err, "Invalid escape sequence"), + InvalidUtf8Hex : E(Err, "Invalid Utf8 hex char"), + InvalidHexStrChar : E(Err, "Invalid character in hex string"), + StringShortEscape : E(Err, "String literal is to short for escape sequence"), + StringHexInvalid : E(Err, "Hex escape sequence have invalid digit at position %0 of %1") ]; } diff -r 5e383b3755d6 -r d1f68bfb58ae basic/SourceLocation.d --- a/basic/SourceLocation.d Sun May 25 14:43:16 2008 +0200 +++ b/basic/SourceLocation.d Sun May 25 14:46:01 2008 +0200 @@ -62,6 +62,12 @@ return res; } + /// Get the length between two location + int opSub(SourceLocation loc) + { + return val - loc.val; + } + /// Creates a SourceLocation from a File ID static SourceLocation fromFileID(uint fileID) { diff -r 5e383b3755d6 -r d1f68bfb58ae dang/compiler.d --- a/dang/compiler.d Sun May 25 14:43:16 2008 +0200 +++ b/dang/compiler.d Sun May 25 14:46:01 2008 +0200 @@ -25,6 +25,7 @@ import sema.Visitor, sema.AstAction, sema.ScopeBuilder, + sema.LiteralInterpreter, sema.ScopeCheck, sema.TypeCheck; @@ -240,6 +241,8 @@ postParse(m, src_mgr);*/ } + (new LiteralInterpreter(messages)).visit(modules); + (new ScopeBuilder).visit(modules); StopWatch watch2; watch.start; diff -r 5e383b3755d6 -r d1f68bfb58ae lexer/Lexer.d --- a/lexer/Lexer.d Sun May 25 14:43:16 2008 +0200 +++ b/lexer/Lexer.d Sun May 25 14:46:01 2008 +0200 @@ -37,12 +37,15 @@ foreach (c; "0123456789") charTable[c] = CharType.Number; - foreach (c; "(){}[];:.,=!<>+-*/%") + foreach (c; "(){}[];:.,=!<>+-*/%\"`") charTable[c] = CharType.Symbol; foreach (c; " \n") charTable[c] = CharType.Whitespace; + foreach (c; "'\\") + charTable[c] = CharType.Other; + symbolFunctions.length = 256; symbolFunctions['('] = &openParentheses; @@ -64,6 +67,8 @@ symbolFunctions['*'] = ☆ symbolFunctions['/'] = &slash; symbolFunctions['%'] = &percent; + symbolFunctions['"'] = &string; + symbolFunctions['`'] = &string; } /** @@ -93,6 +98,8 @@ case CharType.Number: return lexNumber; + case CharType.Other: + messages.report(UnexpectedTok, Loc(position)).fatal(ExitLevel.Lexer); } } @@ -201,7 +208,7 @@ { return Token(Tok.Star, Loc(position - 1), 1); } - Token slash() + Token slash() { switch(source[position]) { @@ -220,7 +227,9 @@ ++position; if(source[position-2] == '*') if(source[position-1] == '/') + { return this.next; + } } messages.report(UnexpectedEOFBlock,Loc(position)); @@ -258,6 +267,46 @@ { return Token(Tok.Percent, Loc(position - 1), 1); } + + Token string() + { + --position; + int start = position; + if(getNextChar() == CharType.Letter) + position++; + char end = '`'; + switch(source[position]) + { + case '"': + if(position > 0) + if(source[position-1] == 'r') + { + end = '"'; + goto string_wys; + } + ++position; + while(getNextChar != CharType.EOF) + { + ++position; + if (source[position-1] == '"' ) + return Token(Tok.String, Loc(start), position - start); + else if (source[position-1] == '\\') + position++; + } + break; + case '`': +string_wys: + ++position; + while(getNextChar != CharType.EOF) + { + ++position; + if (source[position-1] == end ) + return Token(Tok.String, Loc(start), position - start); + } + break; + } + messages.report(UnexpectedEOFBlock, Loc(position)).fatal(ExitLevel.Lexer); + } Token lexNumber () { @@ -321,6 +370,12 @@ { int i = 0; bool hasNumber = false; + if (source[position+1] == '"' || + source[position+1] == '`') + { + ++position; + return string; + } while (getNextChar(++i) == CharType.Letter || getNextChar(i) == CharType.Number) { @@ -385,6 +440,7 @@ Number, Symbol, Whitespace, + Other, EOF } diff -r 5e383b3755d6 -r d1f68bfb58ae lexer/Token.d --- a/lexer/Token.d Sun May 25 14:43:16 2008 +0200 +++ b/lexer/Token.d Sun May 25 14:46:01 2008 +0200 @@ -136,6 +136,8 @@ Switch, Case, Default, Return, Cast, + String, + Module, Import, } @@ -194,6 +196,7 @@ Tok.Seperator:"Seperator", Tok.Cast:"Cast", Tok.Module:"Module", - Tok.Import:"Import" + Tok.Import:"Import", + Tok.String:"String" ]; } diff -r 5e383b3755d6 -r d1f68bfb58ae parser/Action.d --- a/parser/Action.d Sun May 25 14:43:16 2008 +0200 +++ b/parser/Action.d Sun May 25 14:46:01 2008 +0200 @@ -292,6 +292,14 @@ } /** + This is called when strings are used in expression + */ + ExprT actOnStringExp(Token t) + { + return null; + } + + /** Unary operator. */ ExprT actOnUnaryOp(Token op, ExprT operand) diff -r 5e383b3755d6 -r d1f68bfb58ae parser/Parser.d --- a/parser/Parser.d Sun May 25 14:43:16 2008 +0200 +++ b/parser/Parser.d Sun May 25 14:46:01 2008 +0200 @@ -627,6 +627,8 @@ return parseCast(next); else if (next.type == Tok.Integer) return action.actOnNumericConstant(next); + else if (next.type == Tok.String) + return action.actOnStringExp(next); messages.report(ExpectedExp, next.location) .fatal(ExitLevel.Parser); diff -r 5e383b3755d6 -r d1f68bfb58ae sema/AstAction.d --- a/sema/AstAction.d Sun May 25 14:43:16 2008 +0200 +++ b/sema/AstAction.d Sun May 25 14:46:01 2008 +0200 @@ -173,6 +173,11 @@ return new IntegerLit(c.location, sm.getText(c.asRange)); } + override ExprT actOnStringExp(Token s) + { + return new StringExp(s.location, sm.getText(s.asRange)); + } + override ExprT actOnIdentifierExp(Id id) { return identifierFromTok(id.tok); diff -r 5e383b3755d6 -r d1f68bfb58ae sema/LiteralInterpreter.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sema/LiteralInterpreter.d Sun May 25 14:46:01 2008 +0200 @@ -0,0 +1,27 @@ +module sema.LiteralInterpreter; + +import sema.Visitor; + +import basic.LiteralParsing, + basic.Message; + +class LiteralInterpreter : Visitor!(void) +{ + this(MessageHandler messages) + { + this.messages = messages; + } + + void visit(Module[] modules) + { + super.visit(modules); + messages.checkErrors(); + } + + void visitStringExp(StringExp exp) + { + auto type = parseString(exp.str, exp.loc, messages); + } + + MessageHandler messages; +} diff -r 5e383b3755d6 -r d1f68bfb58ae sema/Visitor.d --- a/sema/Visitor.d Sun May 25 14:43:16 2008 +0200 +++ b/sema/Visitor.d Sun May 25 14:46:01 2008 +0200 @@ -97,6 +97,8 @@ return visitPointerIdentifier(cast(PointerIdentifier)exp); case ExpType.ArrayIdentifier: return visitArrayIdentifier(cast(ArrayIdentifier)exp); + case ExpType.StringExp: + return visitStringExp(cast(StringExp)exp); case ExpType.Index: return visitIndexExp(cast(IndexExp)exp); case ExpType.MemberReference: @@ -314,6 +316,14 @@ return ExpT.init; } + ExpT visitStringExp(StringExp exp) + { + static if (is(ExpT == void)) + return; + else + return ExpT.init; + } + ExpT visitIdentifier(Identifier exp) { static if (is(ExpT == void)) diff -r 5e383b3755d6 -r d1f68bfb58ae tests/parser/string_1.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/parser/string_1.d Sun May 25 14:46:01 2008 +0200 @@ -0,0 +1,34 @@ + +int main() +{ + /* All examples taken from D's Language site */ + + char[4] s1 = "food"; + + char[5] s2 = r"hello"; + char[15] s3 = r"c:\root\foo.exe"; + char[4] s4 = r"ab\n"; + + char[5] s5 = `hello`; + char[15] s6 = `c:\root\foo.exe`; + char[4] s7 = `ab\n`; + + char[5] s10 = "hello"; + char[15] s11 = "c:\\root\\foo.exe"; + char[3] s12 = "ab\n"; + char[3] s13 = "ab +"; + + char[1] s14 = x"0A"; + char[6] s15 = x"00 FBCD 32FD 0A"; + + /* And some custom ones */ + + char[8] s16 = "\x61\u05D0\U000201A4"; + char[2] s17 = "\122\522"; + char[6] s15 = x"61 62 63 64 + 65 66 67 68"; + + + return 0; +}