Mercurial > projects > dang
changeset 106:89db676fbacb
Now able to understand strings.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Thu, 22 May 2008 12:09:11 +0200 |
parents | f1282c5fe8e3 |
children | d1f68bfb58ae |
files | basic/LiteralParsing.d basic/Message.d basic/Messages.d basic/SourceLocation.d dang/compiler.d sema/LiteralInterpreter.d sema/Visitor.d tests/parser/string_1.d |
diffstat | 8 files changed, 453 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/basic/LiteralParsing.d Thu May 22 12:09:11 2008 +0200 @@ -0,0 +1,353 @@ +module basic.LiteralParsing.d; + +import basic.SourceLocation, + basic.Message; + +import tango.io.Stdout, + tango.core.BitManip, + Integer = tango.text.convert.Integer, + tango.text.Util; + +enum StructType +{ + Char, + WChar, + DChar +} + +struct String +{ + StructType type; + ubyte[] data; +} + +private struct EscapeReturn +{ + ubyte[] data; + int length; +} + +String parseString(char[] str, SourceLocation loc, MessageHandler messages) +{ + String strBuf; + strBuf.data.length = str.length; + strBuf.data.length = 0; + + switch(str[0]) + { + case 'r': + strBuf = parseWysiwygString(str[1..$], strBuf); + break; + case '`': + strBuf = parseWysiwygString(str, strBuf); + break; + case '"': + strBuf = parseDoubleQuotedString(str, strBuf, loc, messages); + break; + case 'x': + strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages); + break; + default: + messages.report(InvalidStrPrefix, loc, loc + 1); + + } + + printString(str, strBuf); + + return strBuf; +} + +String parseHexString(char[] str, String strBuf, + SourceLocation loc, MessageHandler messages) +{ + int i = 1; // first char is " + char[] hex = "0123456789abcdefABCDEF"; + char[] whitespace = "\r\n "; + char[] hexBuf; + + while(str[i] != '"') + { + if(hex.contains(str[i])) + { + hexBuf ~= str[i]; + if(hexBuf.length == 2) + { + strBuf.data ~= Integer.toInt(hexBuf, 16); + hexBuf.length = 0; + } + } + else if(whitespace.contains(str[i])) + {} + else + messages.report(InvalidHexStrChar, loc + i, loc + i + 1); + + i++; + } + + + + return strBuf; +} + + +String parseDoubleQuotedString(char[] str, String strBuf, + SourceLocation loc, MessageHandler messages) +{ + int i = 1; // first char is " + + while(str[i] != '"') + { + switch(str[i]) + { + case '\\': // EscapeSequence + EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages); + strBuf.data ~= res.data; + i += res.length; + 
break; + default: + strBuf.data ~= str[i]; + i++; + } + if(i >= str.length) + break; + } + + return strBuf; +} + +EscapeReturn parseEscapeSequence(char[] str, + SourceLocation loc, MessageHandler messages) +{ + EscapeReturn res; + + switch(str[1]) + { + case '\'': + res.length = 2; + res.data ~= '\''; + break; + case '"': + res.length = 2; + res.data ~= '\"'; + break; + case '?': + res.length = 2; + res.data ~= '\?'; + break; + case '\\': + res.length = 2; + res.data ~= '\\'; + break; + case 'a': + res.length = 2; + res.data ~= '\a'; + break; + case 'b': + res.length = 2; + res.data ~= '\b'; + break; + case 'f': + res.length = 2; + res.data ~= '\f'; + break; + case 'n': + res.length = 2; + res.data ~= '\n'; + break; + case 'r': + res.length = 2; + res.data ~= '\r'; + break; + case 't': + res.length = 2; + res.data ~= '\t'; + break; + case 'v': + res.length = 2; + res.data ~= '\v'; + break; + case 'x': + char[] hex = "0123456789abcdefABCDEF"; + char[] hexBuf; + if(str.length - 1 >= 4) + { + for(int i = 2; i < 4; i++) + if(hex.contains(str[i])) + hexBuf ~= str[i]; + else + messages.report(StringHexInvalid, loc + i, loc + i + 1) + .arg(Integer.toString(i-1)) + .arg(Integer.toString(2)); + res.length = 4; + } + else + { + messages.report(StringShortEscape, loc, loc + str.length); + res.length = str.length - 1; + } + res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16); + break; + case 'u': + char[] hex = "0123456789abcdefABCDEF"; + char[] hexBuf; + if(str.length - 1 >= 6) + { + for(int i = 2; i < 6; i++) + if(hex.contains(str[i])) + hexBuf ~= str[i]; + else + messages.report(StringHexInvalid, loc + i, loc + i + 1) + .arg(Integer.toString(i-1)) + .arg(Integer.toString(6)); + res.length = 6; + } + else + { + messages.report(StringShortEscape, loc, loc + str.length); + res.length = str.length - 1; + } + uint i = Integer.toLong(hexBuf, 16); + if(!isValidUtf8(i)) + messages.report(InvalidUtf8Hex, loc, loc+6); + else + res.data ~= parseToUtf8(i); + break; + case 'U': + char[] 
hex = "0123456789abcdefABCDEF"; + char[] hexBuf; + if(str.length - 1 >= 10) + { + for(int i = 2; i < 10; i++) + if(hex.contains(str[i])) + hexBuf ~= str[i]; + else + messages.report(StringHexInvalid, loc + i, loc + i + 1) + .arg(Integer.toString(i-1)) + .arg(Integer.toString(10)); + res.length = 10; + } + else + { + messages.report(StringShortEscape, loc, loc + str.length); + res.length = str.length - 1; + } + uint i = Integer.toLong(hexBuf, 16); + if(!isValidUtf8(i)) + messages.report(InvalidUtf8Hex, loc, loc+10); + else + res.data ~= parseToUtf8(i); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + char[] oct = "01234567"; + char[] octBuf; + octBuf ~= str[1]; + res.length = 2; + for(int i = 2; i < 4; i++) + if(oct.contains(str[i])) + { + octBuf ~= str[i]; + res.length += 1; + } + else + break; + + uint i = Integer.toLong(octBuf, 8); + res.data ~= i; + break; + default: + messages.report(InvalidStrEscape, loc, loc + 2); + res.length += 2; + } + + return res; +} + +String parseWysiwygString(char[] str, String strBuf) +{ + char start = str[0]; + + int i = 1; + + while(str[i] != start) + { + strBuf.data ~= cast(ubyte)str[i]; + i++; + } + return strBuf; +} + +ubyte[] parseToUtf8(uint i) +{ + if(i <= 0x00007F) + return [cast(ubyte)i]; + else if(i <= 0x0007FF) + { + ubyte a = (i << 26) >> 26; + bts(cast(uint*)&a, 7); + ubyte b = (i << 19) >> 25; + bts(cast(uint*)&b, 7); + bts(cast(uint*)&b, 6); + return [b,a]; + } + else if(i <= 0x00FFFF) + { + ubyte a = (i << 26) >> 26; + bts(cast(uint*)&a, 7); + ubyte b = (i << 20) >> 26; + bts(cast(uint*)&b, 7); + ubyte c = (i << 16) >> 28; + bts(cast(uint*)&c, 7); + bts(cast(uint*)&c, 6); + bts(cast(uint*)&c, 5); + return [c,b,a]; + } + else if(i <= 0x10FFFF) + { + ubyte a = (i << 26) >> 26; + bts(cast(uint*)&a, 7); + ubyte b = (i << 20) >> 26; + bts(cast(uint*)&b, 7); + ubyte c = (i << 14) >> 26; + bts(cast(uint*)&c, 7); + ubyte d = (i << 11) >> 29; + bts(cast(uint*)&d, 7); + 
bts(cast(uint*)&d, 6); + bts(cast(uint*)&d, 5); + bts(cast(uint*)&d, 4); + return [d,c,b,a]; + } +} + +bool isValidUtf8(uint i) +{ + if(i <= 0x10FFFF) + return true; + return false; +} + +void printString(char[] str, String strBuf) +{ + char[] s; + switch(strBuf.type) + { + case StructType.Char: + Stdout(str)(" have become").newline() + (cast(char[])strBuf.data).newline; + break; + case StructType.WChar: + Stdout(str)(" have become").newline() + (cast(wchar[])strBuf.data).newline; + break; + case StructType.DChar: + Stdout(str)(" have become").newline() + (cast(dchar[])strBuf.data).newline; + break; + } +}
--- a/basic/Message.d Wed May 21 21:11:55 2008 +0200 +++ b/basic/Message.d Thu May 22 12:09:11 2008 +0200 @@ -42,6 +42,13 @@ return m; } + Message report(uint opcode, SLoc location1, SLoc location2) + { + Message m = new Message(opcode, location1, location2, src_mgr, this); + messages ~= m; + return m; + } + void checkErrors(ExitLevel exitlevel = ExitLevel.Normal) { if(messages.length == 0) @@ -90,14 +97,32 @@ this.msg_handler = msg_handler; } + this(int opcode, SLoc location, SLoc end, SourceManager src_mgr, MessageHandler msg_handler) + { + this.src_mgr = src_mgr; + this.location = location; + this.end = end; + args ~= Messages[opcode].message; + this.type = Messages[opcode].type; + this.msg_handler = msg_handler; + haveEnd = true; + } + char[] toString() { char[256] tmp = void; char[] msg = layout(tmp, args); - Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr)); + int len = 0; + if(!haveEnd) + { + Lexer l = new Lexer(location, src_mgr, new MessageHandler(src_mgr)); - Token t = l.next; + Token t = l.next; + len = t.length; + } + else + len = end - location; if (src_mgr.getRawData(location).length > 0) msg = src_mgr.getLocationAsString(location) ~ ": " ~ msg; @@ -109,7 +134,7 @@ char[] marks = line.dup; marks[] = ' '; size_t p = src_mgr.getColumn(location); - marks[p .. p + t.length] = '^'; + marks[p .. p + len] = '^'; msg ~= "\n "; msg ~= line; @@ -166,7 +191,9 @@ MessageType type; private: char[][] args; - SLoc location; + SLoc location, end; + bool haveEnd; SourceManager src_mgr; MessageHandler msg_handler; + Token t; }
--- a/basic/Messages.d Wed May 21 21:11:55 2008 +0200 +++ b/basic/Messages.d Thu May 22 12:09:11 2008 +0200 @@ -28,6 +28,14 @@ // Imports CannotFindModule, + + // Strings + InvalidStrPrefix, + InvalidStrEscape, + InvalidUtf8Hex, + InvalidHexStrChar, + StringShortEscape, + StringHexInvalid, } enum MessageType @@ -67,7 +75,13 @@ InvalidDeclType : E(Err, "Invalid declaration type"), InvalidType : E(Err, "Invalid type"), ExpectedIdAfterPackage : E(Err, "Identifier expected following package"), - CannotFindModule : E(Err, "Cannot find module '%0'") + CannotFindModule : E(Err, "Cannot find module '%0'"), + InvalidStrPrefix : E(Err, "Invalid string literal prefix"), + InvalidStrEscape : E(Err, "Invalid escape sequence"), + InvalidUtf8Hex : E(Err, "Invalid Utf8 hex char"), + InvalidHexStrChar : E(Err, "Invalid character in hex string"), + StringShortEscape : E(Err, "String literal is to short for escape sequence"), + StringHexInvalid : E(Err, "Hex escape sequence have invalid digit at position %0 of %1") ]; }
--- a/basic/SourceLocation.d Wed May 21 21:11:55 2008 +0200 +++ b/basic/SourceLocation.d Thu May 22 12:09:11 2008 +0200 @@ -62,6 +62,12 @@ return res; } + /// Get the length between two location + int opSub(SourceLocation loc) + { + return val - loc.val; + } + /// Creates a SourceLocation from a File ID static SourceLocation fromFileID(uint fileID) {
--- a/dang/compiler.d Wed May 21 21:11:55 2008 +0200 +++ b/dang/compiler.d Thu May 22 12:09:11 2008 +0200 @@ -25,6 +25,7 @@ import sema.Visitor, sema.AstAction, sema.ScopeBuilder, + sema.LiteralInterpreter, sema.ScopeCheck, sema.TypeCheck; @@ -240,6 +241,8 @@ postParse(m, src_mgr);*/ } + (new LiteralInterpreter(messages)).visit(modules); + (new ScopeBuilder).visit(modules); StopWatch watch2; watch.start;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sema/LiteralInterpreter.d Thu May 22 12:09:11 2008 +0200 @@ -0,0 +1,27 @@ +module sema.LiteralInterpreter; + +import sema.Visitor; + +import basic.LiteralParsing, + basic.Message; + +class LiteralInterpreter : Visitor!(void) +{ + this(MessageHandler messages) + { + this.messages = messages; + } + + void visit(Module[] modules) + { + super.visit(modules); + messages.checkErrors(); + } + + void visitStringExp(StringExp exp) + { + auto type = parseString(exp.str, exp.loc, messages); + } + + MessageHandler messages; +}
--- a/sema/Visitor.d Wed May 21 21:11:55 2008 +0200 +++ b/sema/Visitor.d Thu May 22 12:09:11 2008 +0200 @@ -97,6 +97,8 @@ return visitPointerIdentifier(cast(PointerIdentifier)exp); case ExpType.ArrayIdentifier: return visitArrayIdentifier(cast(ArrayIdentifier)exp); + case ExpType.StringExp: + return visitStringExp(cast(StringExp)exp); case ExpType.Index: return visitIndexExp(cast(IndexExp)exp); case ExpType.MemberReference: @@ -314,6 +316,14 @@ return ExpT.init; } + ExpT visitStringExp(StringExp exp) + { + static if (is(ExpT == void)) + return; + else + return ExpT.init; + } + ExpT visitIdentifier(Identifier exp) { static if (is(ExpT == void))
--- a/tests/parser/string_1.d Wed May 21 21:11:55 2008 +0200 +++ b/tests/parser/string_1.d Thu May 22 12:09:11 2008 +0200 @@ -12,7 +12,6 @@ char[5] s5 = `hello`; char[15] s6 = `c:\root\foo.exe`; char[4] s7 = `ab\n`; - char[4] s9 = `abn\`; char[5] s10 = "hello"; char[15] s11 = "c:\\root\\foo.exe"; @@ -23,5 +22,13 @@ char[1] s14 = x"0A"; char[6] s15 = x"00 FBCD 32FD 0A"; + /* And some custom ones */ + + char[8] s16 = "\x61\u05D0\U000201A4"; + char[2] s17 = "\122\522"; + char[6] s15 = x"61 62 63 64 + 65 66 67 68"; + + return 0; }