# HG changeset patch # User Aziz Köksal # Date 1203128919 -3600 # Node ID 4579e8505d5e69d325683ddb55d9f30436425de4 # Parent f26f13b5a3a3f4ebfa44e04b5afed455791587ed Fixed unittests and removed dil.File. Fixed Converter.UTF16toUTF8(). Fixed an encode() function in dil.Unicode. diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/cmd/DDoc.d --- a/trunk/src/cmd/DDoc.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/cmd/DDoc.d Sat Feb 16 03:28:39 2008 +0100 @@ -22,8 +22,8 @@ import dil.semantic.Symbol; import dil.semantic.Symbols; import dil.Information; -import dil.File; import dil.Converter; +import dil.SourceText; import common; import tango.stdc.time : time_t, time, ctime; @@ -40,7 +40,7 @@ MacroParser mparser; foreach (macroPath; macroPaths) { - auto macros = mparser.parse(loadMacroFile(macroPath)); + auto macros = mparser.parse(loadMacroFile(macroPath, infoMan)); mtable = new MacroTable(mtable); mtable.insert(macros); } @@ -109,9 +109,11 @@ file.write(fileText); } -string loadMacroFile(string filePath) +string loadMacroFile(string filePath, InfoManager infoMan) { - return sanitizeText(loadFile(filePath)); + auto src = new SourceText(filePath); + src.load(infoMan); + return sanitizeText(src.data); } /// Traverses the syntax tree and writes DDoc macros to a string buffer. 
diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/cmd/ImportGraph.d --- a/trunk/src/cmd/ImportGraph.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/cmd/ImportGraph.d Sat Feb 16 03:28:39 2008 +0100 @@ -8,7 +8,6 @@ import dil.ast.Declarations; import dil.semantic.Module; import dil.parser.ImportParser; -import dil.File; import dil.Settings; import dil.SourceText; import common; diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/cmd/Statistics.d --- a/trunk/src/cmd/Statistics.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/cmd/Statistics.d Sat Feb 16 03:28:39 2008 +0100 @@ -4,13 +4,12 @@ +/ module cmd.Statistics; -import dil.File; +import cmd.ASTStats; import dil.lexer.Lexer; import dil.lexer.Token; import dil.parser.Parser; import dil.ast.NodesEnum; import dil.SourceText; -import cmd.ASTStats; import common; struct Statistics diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/dil/Converter.d --- a/trunk/src/dil/Converter.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/dil/Converter.d Sat Feb 16 03:28:39 2008 +0100 @@ -122,19 +122,19 @@ end = text.ptr + text.length; char[] result; uint lineNum = 1; - dchar c = *p; - do + for (; p < end; p++) { + dchar c = *p; static if (isBigEndian) c = BEtoMachineWord(c); else c = LEtoMachineWord(c); - if (c < 0xD800 || 0xDFFF > c) + if (0xD800 > c || c > 0xDFFF) {} else if (c <= 0xDBFF && p+1 < end) - { + { // Decode surrogate pairs. wchar c2 = p[1]; static if (isBigEndian) c2 = BEtoMachineWord(c2); @@ -159,16 +159,14 @@ if (isNewline(c)) ++lineNum; - ++p; dil.Unicode.encode(result, c); - } while (p < end) + } if (data.length % 2) infoMan ~= new LexerError( new Location(filePath, lineNum), MSG.UTF16FileMustBeDivisibleBy2 ); - return result; } @@ -295,3 +293,32 @@ //text = text.ptr[0 .. q - text.ptr]; // Another way. 
return text; } + +unittest +{ + Stdout("Testing function Converter.\n"); + struct Data2Text + { + char[] text; + char[] expected = "source"; + ubyte[] data() + { return cast(ubyte[])text; } + } + const Data2Text[] map = [ + // Without BOM + {"source"}, + {"s\0o\0u\0r\0c\0e\0"}, + {"\0s\0o\0u\0r\0c\0e"}, + {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"}, + {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"}, + // With BOM + {"\xEF\xBB\xBFsource"}, + {"\xFE\xFF\0s\0o\0u\0r\0c\0e"}, + {"\xFF\xFEs\0o\0u\0r\0c\0e\0"}, + {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"}, + {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"}, + ]; + auto converter = Converter("", new InfoManager); + foreach (i, pair; map) + assert(converter.data2UTF8(pair.data) == pair.expected, Format("failed at item {}", i)); +} diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/dil/File.d --- a/trunk/src/dil/File.d Sat Feb 16 01:47:39 2008 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,151 +0,0 @@ -/++ - Author: Aziz Köksal - License: GPL3 -+/ -module dil.File; - -import dil.FileBOM; -import dil.Information; -import dil.Converter; -import tango.io.File; -import util.utf; -import common; - -/// Loads a file in any valid Unicode format and converts it to UTF-8. -char[] loadFile(char[] filePath) -{ - return data2UTF8(cast(ubyte[]) (new File(filePath)).read()); -} - -char[] loadFile(char[] filePath, InfoManager infoMan) -{ - auto converter = Converter(filePath, infoMan); - return converter.data2UTF8(cast(ubyte[]) (new File(filePath)).read()); -} - -char[] data2UTF8(ubyte[] data) -{ - if (data.length == 0) - return null; - - char[] text; - BOM bom = tellBOM(data); - - switch (bom) - { - case BOM.None: - // No BOM found. According to the specs the first character - // must be an ASCII character. 
- if (data.length >= 4) - { - if (data[0..3] == cast(ubyte[3])x"00 00 00") - { - text = toUTF8(cast(dchar[])utf32BEtoLE(data)); // UTF-32BE: 00 00 00 XX - break; - } - else if (data[1..4] == cast(ubyte[3])x"00 00 00") - { - text = toUTF8(cast(dchar[])data); // UTF-32LE: XX 00 00 00 - break; - } - } - if (data.length >= 2) - { - if (data[0] == 0) // UTF-16BE: 00 XX - { - text = toUTF8(cast(wchar[])utf16BEtoLE(data)); - break; - } - else if (data[1] == 0) // UTF-16LE: XX 00 - { - text = toUTF8(cast(wchar[])data); - break; - } - } - text = cast(char[])data; // UTF-8 - break; - case BOM.UTF8: - text = cast(char[])data[3..$]; - break; - case BOM.UTF16BE: - text = toUTF8(cast(wchar[])utf16BEtoLE(data[2..$])); - break; - case BOM.UTF16LE: - text = toUTF8(cast(wchar[])data[2..$]); - break; - case BOM.UTF32BE: - text = toUTF8(cast(dchar[])utf32BEtoLE(data[4..$])); - break; - case BOM.UTF32LE: - text = toUTF8(cast(dchar[])data[4..$]); - break; - default: - assert(0); - } - return text; -} - -unittest -{ - Stdout("Testing function data2Utf8().\n"); - struct Data2Text - { - union - { - ubyte[] data; - char[] u8; - } - char[] text; - } - const Data2Text[] map = [ - // Without BOM - {u8:"source", text:"source"}, - {u8:"s\0o\0u\0r\0c\0e\0", text:"source"}, - {u8:"\0s\0o\0u\0r\0c\0e", text:"source"}, - {u8:"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, - {u8:"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, - // With BOM - {u8:"\xEF\xBB\xBFsource", text:"source"}, - {u8:"\xFE\xFF\0s\0o\0u\0r\0c\0e", text:"source"}, - {u8:"\xFF\xFEs\0o\0u\0r\0c\0e\0", text:"source"}, - {u8:"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, - {u8:"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, - ]; - alias data2UTF8 f; - foreach (pair; map) - assert(f(pair.data) == pair.text); -} - -ubyte[] utf16BEtoLE(ubyte[] data) -{ - if (data.length % 2) - throw new Exception("The byte length of a UTF-16 big endian source file 
must be divisible by 2."); - wchar[] result = cast(wchar[]) new ubyte[data.length]; - assert(result.length*2 == data.length); - // BE to LE "1A 2B" -> "2B 1A" - foreach (i, c; cast(ushort[]) data) - result[i] = (c << 8) | (c >> 8); - return cast(ubyte[]) result; -} - -ubyte[] utf32BEtoLE(ubyte[] data) -{ - if (data.length % 4) - throw new Exception("The byte length of a UTF-32 big endian source file must be divisible by 4."); - dchar[] result = cast(dchar[]) new ubyte[data.length]; - assert(result.length*4 == data.length); - // BE to LE "1A 2B 3C 4D" -> "4D 3C 2B 1A" - // TODO: the 'bswap' asm instruction could be used instead of shifts and &-operations. - foreach (i, c; cast(uint[]) data) - result[i] = (c << 24) | - ((c >> 8) & 0xFF00) | - ((c << 8) & 0xFF0000) | - (c >> 24); - return cast(ubyte[]) result; -} - -unittest -{ - ubyte[] test = cast(ubyte[])x"1A 2B 3C 4D"; - assert(utf32BEtoLE(test) == cast(ubyte[])x"4D 3C 2B 1A"); -} diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/dil/Unicode.d --- a/trunk/src/dil/Unicode.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/dil/Unicode.d Sat Feb 16 03:28:39 2008 +0100 @@ -157,7 +157,7 @@ char[6] b = void; if (c < 0x80) str ~= c; - if (c < 0x800) + else if (c < 0x800) { b[0] = 0xC0 | (c >> 6); b[1] = 0x80 | (c & 0x3F); @@ -211,8 +211,7 @@ if (c < 0x10000) str ~= cast(wchar)c; else - { - // Encode with surrogate pair. + { // Encode with surrogate pair. wchar[2] pair = void; c -= 0x10000; // c' // higher10bits(c') | 0b1101_10xx_xxxx_xxxx @@ -225,7 +224,7 @@ /++ Returns a decoded character from a UTF-16 sequence. - In case of an error in the sequence 0xD800 is returned. + In case of an error in the sequence ERROR_CHAR is returned. Params: str = the UTF-16 sequence. index = where to start from. @@ -243,7 +242,7 @@ { wchar c2 = str[index+1]; if (0xDC00 <= c2 && c2 <= 0xDFFF) - { + { // Decode surrogate pair. 
// (c - 0xD800) << 10 + 0x10000 -> // (c - 0xD800 + 0x40) << 10 -> c = (c - 0xD7C0) << 10; @@ -257,7 +256,7 @@ /++ Returns a decoded character from a UTF-16 sequence. - In case of an error in the sequence 0xD800 is returned. + In case of an error in the sequence ERROR_CHAR is returned. Params: p = start of the UTF-16 sequence. end = one past the end of the sequence. diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/dil/lexer/Lexer.d --- a/trunk/src/dil/lexer/Lexer.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/dil/lexer/Lexer.d Sat Feb 16 03:28:39 2008 +0100 @@ -2740,7 +2740,7 @@ else src ~= pair.tokenText ~ " "; - auto lx = new Lexer(src, ""); + auto lx = new Lexer(new SourceText("", src)); auto token = lx.getTokens(); uint i; @@ -2759,7 +2759,7 @@ unittest { Stdout("Testing method Lexer.peek()\n"); - string sourceText = "unittest { }"; + auto sourceText = new SourceText("", "unittest { }"); auto lx = new Lexer(sourceText, null); auto next = lx.head; @@ -2774,7 +2774,7 @@ lx.peek(next); assert(next.kind == TOK.EOF); - lx = new Lexer("", null); + lx = new Lexer(new SourceText("", "")); next = lx.head; lx.peek(next); assert(next.kind == TOK.Newline); diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/dil/semantic/Module.d --- a/trunk/src/dil/semantic/Module.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/dil/semantic/Module.d Sat Feb 16 03:28:39 2008 +0100 @@ -8,7 +8,6 @@ import dil.ast.Declarations; import dil.parser.Parser; import dil.lexer.Lexer; -import dil.File; import dil.semantic.Symbol; import dil.semantic.Symbols; import dil.Information; diff -r f26f13b5a3a3 -r 4579e8505d5e trunk/src/main.d --- a/trunk/src/main.d Sat Feb 16 01:47:39 2008 +0100 +++ b/trunk/src/main.d Sat Feb 16 03:28:39 2008 +0100 @@ -23,7 +23,6 @@ import dil.SettingsLoader; import dil.CompilerInfo; import dil.Information; -import dil.File; import dil.SourceText; import cmd.Generate;