Mercurial > projects > dang
view basic/LiteralParsing.d @ 106:89db676fbacb
Now able to understand strings.
author | Anders Johnsen <skabet@gmail.com> |
---|---|
date | Thu, 22 May 2008 12:09:11 +0200 |
parents | |
children | c658172ca8a0 |
line wrap: on
line source
module basic.LiteralParsing.d;

import basic.SourceLocation,
       basic.Message;

import tango.io.Stdout,
       tango.core.BitManip,
       Integer = tango.text.convert.Integer,
       tango.text.Util;

/// Element width of a decoded string literal.
enum StructType
{
    Char,
    WChar,
    DChar
}

/**
 * A decoded string literal: the raw bytes plus the element type they
 * should be viewed as. None of the parsers in this module assign `type`,
 * so it keeps its default value (StructType.Char).
 */
struct String
{
    StructType type;
    ubyte[] data;
}

/// Result of decoding one escape sequence.
private struct EscapeReturn
{
    ubyte[] data;   /// bytes produced by the escape
    int length;     /// number of input characters the escape occupied
}

// Character classes shared by the parsers below.
private const char[] HexDigits = "0123456789abcdefABCDEF";
private const char[] OctDigits = "01234567";
private const char[] HexStringWhitespace = " \t\v\f\r\n";

/**
 * Decodes a string literal token, dispatching on its first character:
 * `r"..."` and backquoted strings are taken verbatim, `"..."` has its
 * escape sequences decoded, and `x"..."` is read as pairs of hex digits.
 *
 * Params:
 *   str      = the literal as written, including prefix and delimiters
 *   loc      = source location of str[0], used for diagnostics
 *   messages = receives InvalidStrPrefix when the first character is not
 *              one of the recognised prefixes (or when str is empty)
 *
 * Returns: the decoded bytes. The source text and the decoded result are
 * also written to Stdout through printString.
 */
String parseString(char[] str, SourceLocation loc, MessageHandler messages)
{
    String strBuf;

    // Grow then reset the buffer; presumably meant to pre-allocate room
    // for the appends that follow.
    strBuf.data.length = str.length;
    strBuf.data.length = 0;

    // Nothing to dispatch on; report instead of indexing str[0].
    if (str.length == 0)
    {
        messages.report(InvalidStrPrefix, loc, loc + 1);
        return strBuf;
    }

    switch (str[0])
    {
        case 'r':
            strBuf = parseWysiwygString(str[1..$], strBuf);
            break;
        case '`':
            strBuf = parseWysiwygString(str, strBuf);
            break;
        case '"':
            strBuf = parseDoubleQuotedString(str, strBuf, loc, messages);
            break;
        case 'x':
            strBuf = parseHexString(str[1..$], strBuf, loc + 1, messages);
            break;
        default:
            messages.report(InvalidStrPrefix, loc, loc + 1);
    }

    printString(str, strBuf);
    return strBuf;
}

/**
 * Decodes the body of a hex string. Every two hex digits become one byte;
 * whitespace between digits is skipped; any other character is reported
 * as InvalidHexStrChar and skipped.
 *
 * Params:
 *   str      = input starting at the opening quote (the `x` already removed)
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0]
 *   messages = diagnostics sink
 *
 * Returns: strBuf with the decoded bytes appended. Scanning stops at the
 * closing quote or at the end of the input, whichever comes first. A
 * trailing unpaired hex digit is dropped without a diagnostic.
 */
String parseHexString(char[] str, String strBuf,
                      SourceLocation loc, MessageHandler messages)
{
    char[] hexBuf;

    // Index 0 is the opening quote.
    for (int i = 1; i < str.length && str[i] != '"'; i++)
    {
        char c = str[i];

        if (contains(HexDigits, c))
        {
            hexBuf ~= c;
            if (hexBuf.length == 2)
            {
                strBuf.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
                hexBuf.length = 0;
            }
        }
        else if (!contains(HexStringWhitespace, c))
            messages.report(InvalidHexStrChar, loc + i, loc + i + 1);
    }

    return strBuf;
}

/**
 * Decodes the body of a double-quoted string: ordinary characters are
 * copied as-is and each backslash hands over to parseEscapeSequence.
 *
 * Params:
 *   str      = input starting at the opening quote
 *   strBuf   = buffer the decoded bytes are appended to
 *   loc      = source location of str[0]
 *   messages = diagnostics sink, forwarded to parseEscapeSequence
 *
 * Returns: strBuf with the decoded bytes appended. Scanning stops at the
 * closing quote or at the end of the input, whichever comes first.
 */
String parseDoubleQuotedString(char[] str, String strBuf,
                               SourceLocation loc, MessageHandler messages)
{
    int i = 1; // index 0 is the opening quote

    while (i < str.length && str[i] != '"')
    {
        if (str[i] == '\\')
        {
            EscapeReturn res = parseEscapeSequence(str[i..$], loc + i, messages);
            strBuf.data ~= res.data;
            i += res.length;
        }
        else
        {
            strBuf.data ~= cast(ubyte)str[i];
            i++;
        }
    }

    return strBuf;
}

/**
 * Collects the hex digits of a `\x`, `\u` or `\U` escape.
 *
 * Params:
 *   str      = input starting at the backslash
 *   total    = length of a well-formed escape, counting the backslash and
 *              the letter (4, 6 or 10)
 *   loc      = source location of str[0]
 *   messages = receives StringHexInvalid for each non-hex digit, or
 *              StringShortEscape when the input is too short
 *   consumed = set to the number of input characters the caller should skip
 *
 * Returns: the valid hex digits found; may be shorter than expected or empty.
 */
private char[] readEscapeHex(char[] str, int total, SourceLocation loc,
                             MessageHandler messages, out int consumed)
{
    char[] hexBuf;

    // Strictly greater: at least one character must follow the escape.
    if (str.length > total)
    {
        for (int i = 2; i < total; i++)
        {
            if (contains(HexDigits, str[i]))
                hexBuf ~= str[i];
            else
                messages.report(StringHexInvalid, loc + i, loc + i + 1)
                    .arg(Integer.toString(i-1))
                    .arg(Integer.toString(total));
        }
        consumed = total;
    }
    else
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        consumed = cast(int)(str.length - 1);
    }

    return hexBuf;
}

/**
 * Decodes a single escape sequence.
 *
 * Handles the one-character escapes, `\xHH`, `\uHHHH`, `\UHHHHHHHH` and
 * octal escapes of one to three digits. `\u` and `\U` values are emitted
 * as UTF-8; values rejected by isValidUtf8 are reported as InvalidUtf8Hex
 * and produce no bytes. An unknown escape letter is reported as
 * InvalidStrEscape.
 *
 * Params:
 *   str      = input starting at the backslash
 *   loc      = source location of the backslash
 *   messages = diagnostics sink
 *
 * Returns: the bytes produced and the number of input characters used.
 */
EscapeReturn parseEscapeSequence(char[] str, SourceLocation loc,
                                 MessageHandler messages)
{
    EscapeReturn res;

    // A lone backslash at the end of the input: nothing to switch on.
    if (str.length < 2)
    {
        messages.report(StringShortEscape, loc, loc + str.length);
        res.length = 1;
        return res;
    }

    switch (str[1])
    {
        case '\'': res.length = 2; res.data ~= cast(ubyte)'\''; break;
        case '"':  res.length = 2; res.data ~= cast(ubyte)'\"'; break;
        case '?':  res.length = 2; res.data ~= cast(ubyte)'\?'; break;
        case '\\': res.length = 2; res.data ~= cast(ubyte)'\\'; break;
        case 'a':  res.length = 2; res.data ~= cast(ubyte)'\a'; break;
        case 'b':  res.length = 2; res.data ~= cast(ubyte)'\b'; break;
        case 'f':  res.length = 2; res.data ~= cast(ubyte)'\f'; break;
        case 'n':  res.length = 2; res.data ~= cast(ubyte)'\n'; break;
        case 'r':  res.length = 2; res.data ~= cast(ubyte)'\r'; break;
        case 't':  res.length = 2; res.data ~= cast(ubyte)'\t'; break;
        case 'v':  res.length = 2; res.data ~= cast(ubyte)'\v'; break;

        case 'x':
            char[] hexBuf = readEscapeHex(str, 4, loc, messages, res.length);
            res.data ~= cast(ubyte)Integer.toInt(hexBuf, 16);
            break;

        case 'u':
            char[] hexBuf = readEscapeHex(str, 6, loc, messages, res.length);
            uint codepoint = cast(uint)Integer.toLong(hexBuf, 16);
            if (!isValidUtf8(codepoint))
                messages.report(InvalidUtf8Hex, loc, loc+6);
            else
                res.data ~= parseToUtf8(codepoint);
            break;

        case 'U':
            char[] hexBuf = readEscapeHex(str, 10, loc, messages, res.length);
            uint codepoint = cast(uint)Integer.toLong(hexBuf, 16);
            if (!isValidUtf8(codepoint))
                messages.report(InvalidUtf8Hex, loc, loc+10);
            else
                res.data ~= parseToUtf8(codepoint);
            break;

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            char[] octBuf;
            octBuf ~= str[1];
            res.length = 2;

            // Up to two more octal digits may follow the first one.
            for (int i = 2; i < 4 && i < str.length; i++)
            {
                if (!contains(OctDigits, str[i]))
                    break;
                octBuf ~= str[i];
                res.length += 1;
            }

            // NOTE(review): values above 0xFF (e.g. \777) are truncated to
            // one byte here without a diagnostic.
            res.data ~= cast(ubyte)Integer.toLong(octBuf, 8);
            break;

        default:
            messages.report(InvalidStrEscape, loc, loc + 2);
            res.length += 2;
    }

    return res;
}

/**
 * Copies the body of a wysiwyg string verbatim, with no escape processing.
 * The delimiter is whatever character str starts with.
 *
 * Params:
 *   str    = input starting at the opening delimiter
 *   strBuf = buffer the bytes are appended to
 *
 * Returns: strBuf with every character up to the matching delimiter (or
 * the end of the input) appended.
 */
String parseWysiwygString(char[] str, String strBuf)
{
    if (str.length == 0)
        return strBuf;

    char start = str[0];

    for (int i = 1; i < str.length && str[i] != start; i++)
        strBuf.data ~= cast(ubyte)str[i];

    return strBuf;
}

/**
 * Encodes a code point as UTF-8.
 *
 * Params:
 *   i = the code point
 *
 * Returns: one to four bytes, lead byte first, or an empty array when
 * isValidUtf8(i) is false.
 */
ubyte[] parseToUtf8(uint i)
{
    if (!isValidUtf8(i))
        return null;

    if (i <= 0x00007F)
        return [cast(ubyte)i];

    if (i <= 0x0007FF)
        return [cast(ubyte)(0xC0 | (i >> 6)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    if (i <= 0x00FFFF)
        return [cast(ubyte)(0xE0 | (i >> 12)),
                cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
                cast(ubyte)(0x80 | (i & 0x3F))];

    return [cast(ubyte)(0xF0 | (i >> 18)),
            cast(ubyte)(0x80 | ((i >> 12) & 0x3F)),
            cast(ubyte)(0x80 | ((i >> 6) & 0x3F)),
            cast(ubyte)(0x80 | (i & 0x3F))];
}

/**
 * Tells whether a value can be encoded as UTF-8: it must not exceed
 * U+10FFFF and must not fall in the surrogate range U+D800..U+DFFF.
 */
bool isValidUtf8(uint i)
{
    if (i > 0x10FFFF)
        return false;
    if (i >= 0xD800 && i <= 0xDFFF)
        return false;
    return true;
}

/**
 * Writes the literal as written, the text " have become", and the decoded
 * data (viewed as char, wchar or dchar according to strBuf.type) to Stdout.
 */
void printString(char[] str, String strBuf)
{
    switch (strBuf.type)
    {
        case StructType.Char:
            Stdout(str)(" have become").newline()
                (cast(char[])strBuf.data).newline;
            break;
        case StructType.WChar:
            Stdout(str)(" have become").newline()
                (cast(wchar[])strBuf.data).newline;
            break;
        case StructType.DChar:
            Stdout(str)(" have become").newline()
                (cast(dchar[])strBuf.data).newline;
            break;
    }
}