# HG changeset patch
# User Aziz Köksal
# Date 1197741306 -3600
# Node ID 8f86bb9ef715e4d1a1e48aa3e86285feb6513e08
# Parent b465c669d70c60c57761aa319ba55951e9ee2e90
Added module dil.Converter and dil.FileBOM.
Moved code from dil.File to dil.FileBOM.
Added opCatAssign to class InformationManager.
Added encode() function to dil.Unicode.

diff -r b465c669d70c -r 8f86bb9ef715 trunk/src/dil/Converter.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/trunk/src/dil/Converter.d Sat Dec 15 18:55:06 2007 +0100
@@ -0,0 +1,245 @@
+/++
+  Author: Aziz Köksal
+  License: GPL3
++/
+module dil.Converter;
+
+import dil.Information;
+import dil.Location;
+import dil.Unicode;
+import dil.FileBOM;
+import common;
+
+/// Converts various Unicode encoding formats to UTF-8.
+struct Converter
+{
+  char[] filePath; /// For error messages.
+  InformationManager infoMan; /// Receives the errors found during conversion.
+
+  /// Constructs a Converter for the given file path and information manager.
+  static Converter opCall(char[] filePath, InformationManager infoMan)
+  {
+    Converter conv;
+    conv.filePath = filePath;
+    conv.infoMan = infoMan;
+    return conv;
+  }
+
+  /// Reverses the byte order of a 32-bit code unit.
+  dchar swapBytes(dchar c)
+  {
+    return (c << 24) |
+           ((c >> 8) & 0xFF00) |
+           ((c << 8) & 0xFF0000) |
+           (c >> 24);
+  }
+
+  /// Reverses the byte order of a 16-bit code unit.
+  wchar swapBytes(wchar c)
+  {
+    return (c << 8) | (c >> 8);
+  }
+
+  /// Converts a big-endian 32-bit code unit to the machine's byte order.
+  dchar BEtoMachineDword(dchar c)
+  {
+    version(LittleEndian)
+      return swapBytes(c);
+    else
+      return c;
+  }
+
+  /// Converts a little-endian 32-bit code unit to the machine's byte order.
+  dchar LEtoMachineDword(dchar c)
+  {
+    version(LittleEndian)
+      return c;
+    else
+      return swapBytes(c);
+  }
+
+  /// Converts a big-endian 16-bit code unit to the machine's byte order.
+  wchar BEtoMachineWord(wchar c)
+  {
+    version(LittleEndian)
+      return swapBytes(c);
+    else
+      return c;
+  }
+
+  /// Converts a little-endian 16-bit code unit to the machine's byte order.
+  wchar LEtoMachineWord(wchar c)
+  {
+    version(LittleEndian)
+      return c;
+    else
+      return swapBytes(c);
+  }
+
+  /// Converts UTF-32 encoded data to UTF-8.
+  /// Invalid characters are reported to infoMan and replaced with REPLACEMENT_CHAR.
+  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
+  {
+    if (data.length % 4)
+    {
+      infoMan ~= new LexerError(new Location(filePath, 0),
+        "the byte length of a UTF-32 source file must be divisible by 4."
+      );
+      data = data[0 .. $ - $ % 4]; // Trim to valid size.
+    }
+    if (data.length == 0)
+      return null;
+
+    char[] result;
+    foreach (dchar c; cast(dchar[])data)
+    {
+      static if (isBigEndian)
+        c = BEtoMachineDword(c);
+      else
+        c = LEtoMachineDword(c);
+
+      if (!isValidChar(c))
+      {
+        // TODO: correct location.
+        auto loc = new Location(filePath, 0);
+        infoMan ~= new LexerError(loc, Format("invalid UTF-32 character '{:X}'.", c));
+        c = REPLACEMENT_CHAR;
+      }
+
+      dil.Unicode.encode(result, c);
+    }
+    return result;
+  }
+
+  alias UTF32toUTF8!(true) UTF32BEtoUTF8;
+  alias UTF32toUTF8!(false) UTF32LEtoUTF8;
+
+  /// Converts UTF-16 encoded data to UTF-8.
+  /// Unpaired surrogates are reported to infoMan and replaced with REPLACEMENT_CHAR.
+  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
+  {
+    if (data.length % 2)
+    {
+      infoMan ~= new LexerError(new Location(filePath, 0),
+        "the byte length of a UTF-16 source file must be divisible by 2."
+      );
+      data = data[0 .. $-1]; // Trim to valid size.
+    }
+
+    if (data.length == 0)
+      return null;
+
+    wchar[] text = cast(wchar[])data;
+    wchar* p = text.ptr,
+         end = text.ptr + text.length;
+    char[] result;
+
+    for (; p < end; ++p)
+    {
+      // Fetch the code unit at p on every iteration.
+      wchar unit = *p;
+      static if (isBigEndian)
+        unit = BEtoMachineWord(unit);
+      else
+        unit = LEtoMachineWord(unit);
+      dchar c = unit;
+
+      if (0xD800 <= c && c <= 0xDFFF)
+      { // Surrogate code unit: only valid as a high/low pair.
+        bool isPair = false;
+        if (c <= 0xDBFF && p+1 < end)
+        {
+          wchar c2 = p[1];
+          static if (isBigEndian)
+            c2 = BEtoMachineWord(c2);
+          else
+            c2 = LEtoMachineWord(c2);
+
+          if (0xDC00 <= c2 && c2 <= 0xDFFF)
+          { // Combine the pair into one code point and skip the low surrogate.
+            c = (c - 0xD7C0) << 10;
+            c |= (c2 & 0x3FF);
+            ++p;
+            isPair = true;
+          }
+        }
+        if (!isPair)
+        {
+          // TODO: correct location.
+          auto loc = new Location(filePath, 0);
+          infoMan ~= new LexerError(loc, Format("invalid UTF-16 character '{:X}'.", c));
+          c = REPLACEMENT_CHAR;
+        }
+      }
+      dil.Unicode.encode(result, c);
+    }
+    return result;
+  }
+
+  alias UTF16toUTF8!(true) UTF16BEtoUTF8;
+  alias UTF16toUTF8!(false) UTF16LEtoUTF8;
+
+  /// Converts data to UTF-8, detecting the encoding from the BOM or,
+  /// when there is none, from the zero bytes around the first character.
+  char[] data2UTF8(ubyte[] data)
+  {
+    if (data.length == 0)
+      return null;
+
+    char[] text;
+    BOM bom = tellBOM(data);
+
+    switch (bom)
+    {
+    case BOM.None:
+      // No BOM found. According to the specs the first character
+      // must be an ASCII character.
+      if (data.length >= 4)
+      {
+        if (data[0..3] == cast(ubyte[3])x"00 00 00")
+        {
+          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
+          break;
+        }
+        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
+        {
+          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
+          break;
+        }
+      }
+      if (data.length >= 2)
+      {
+        if (data[0] == 0) // UTF-16BE: 00 XX
+        {
+          text = UTF16BEtoUTF8(data);
+          break;
+        }
+        else if (data[1] == 0) // UTF-16LE: XX 00
+        {
+          text = UTF16LEtoUTF8(data);
+          break;
+        }
+      }
+      text = cast(char[])data; // UTF-8
+      break;
+    case BOM.UTF8:
+      text = cast(char[])data[3..$];
+      break;
+    case BOM.UTF16BE:
+      text = UTF16BEtoUTF8(data[2..$]);
+      break;
+    case BOM.UTF16LE:
+      text = UTF16LEtoUTF8(data[2..$]);
+      break;
+    case BOM.UTF32BE:
+      text = UTF32BEtoUTF8(data[4..$]);
+      break;
+    case BOM.UTF32LE:
+      text = UTF32LEtoUTF8(data[4..$]);
+      break;
+    default:
+      assert(0);
+    }
+    return text;
+  }
+}
diff -r b465c669d70c -r 8f86bb9ef715 trunk/src/dil/File.d
--- a/trunk/src/dil/File.d Fri Dec 14 23:10:35 2007 +0100
+++ b/trunk/src/dil/File.d Sat Dec 15 18:55:06 2007 +0100
@@ -3,6 +3,10 @@
   License: GPL3
 +/
 module dil.File;
+
+import dil.FileBOM;
+import dil.Information;
+import dil.Converter;
 import tango.io.File;
 import std.utf;
 import common;
@@ -10,10 +14,17 @@
 /// Loads a file in any valid Unicode format and converts it to UTF-8.
 char[] loadFile(char[] filePath)
 {
-  return data2Utf8(cast(ubyte[]) (new File(filePath)).read());
+  return data2UTF8(cast(ubyte[]) (new File(filePath)).read());
 }
 
-char[] data2Utf8(ubyte[] data)
+/// Loads a file and converts it to UTF-8 with a Converter that is given infoMan.
+char[] loadFile(char[] filePath, InformationManager infoMan)
+{
+  auto converter = Converter(filePath, infoMan);
+  return converter.data2UTF8(cast(ubyte[]) (new File(filePath)).read());
+}
+
+char[] data2UTF8(ubyte[] data)
 {
   if (data.length == 0)
     return null;
@@ -139,82 +150,3 @@
   ubyte[] test = cast(ubyte[])x"1A 2B 3C 4D";
   assert(utf32BEtoLE(test) == cast(ubyte[])x"4D 3C 2B 1A");
 }
-
-/// Byte Order Mark
-enum BOM
-{
-  None,    /// No BOM
-  UTF8,    /// UTF-8: EF BB BF
-  UTF16BE, /// UTF-16 Big Endian: FE FF
-  UTF16LE, /// UTF-16 Little Endian: FF FE
-  UTF32BE, /// UTF-32 Big Endian: 00 00 FE FF
-  UTF32LE  /// UTF-32 Little Endian: FF FE 00 00
-}
-
-BOM tellBOM(ubyte[] data)
-{
-  BOM bom = BOM.None;
-  if (data.length < 2)
-    return bom;
-
-  if (data[0..2] == cast(ubyte[2])x"FE FF")
-  {
-    bom = BOM.UTF16BE; // FE FF
-  }
-  else if (data[0..2] == cast(ubyte[2])x"FF FE")
-  {
-    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"00 00")
-      bom = BOM.UTF32LE; // FF FE 00 00
-    else
-      bom = BOM.UTF16LE; // FF FE XX XX
-  }
-  else if (data[0..2] == cast(ubyte[2])x"00 00")
-  {
-    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"FE FF")
-      bom = BOM.UTF32BE; // 00 00 FE FF
-  }
-  else if (data[0..2] == cast(ubyte[2])x"EF BB")
-  {
-    if (data.length >= 3 && data[2] == '\xBF')
-      bom = BOM.UTF8; // EF BB BF
-  }
-  return bom;
-}
-
-unittest
-{
-  Stdout("Testing function tellBOM().\n");
-
-  struct Data2BOM
-  {
-    ubyte[] data;
-    BOM bom;
-  }
-  alias ubyte[] ub;
-  const Data2BOM[] map = [
-    {cast(ub)x"12", BOM.None},
-    {cast(ub)x"12 34", BOM.None},
-    {cast(ub)x"00 00 FF FE", BOM.None},
-    {cast(ub)x"EF BB FF", BOM.None},
-
-    {cast(ub)x"EF", BOM.None},
-    {cast(ub)x"EF BB", BOM.None},
-    {cast(ub)x"FE", BOM.None},
-    {cast(ub)x"FF", BOM.None},
-    {cast(ub)x"00", BOM.None},
-    {cast(ub)x"00 00",
BOM.None},
-    {cast(ub)x"00 00 FE", BOM.None},
-
-    {cast(ub)x"FE FF 00", BOM.UTF16BE},
-    {cast(ub)x"FE FF 00 FF", BOM.UTF16BE},
-
-    {cast(ub)x"EF BB BF", BOM.UTF8},
-    {cast(ub)x"FE FF", BOM.UTF16BE},
-    {cast(ub)x"FF FE", BOM.UTF16LE},
-    {cast(ub)x"00 00 FE FF", BOM.UTF32BE},
-    {cast(ub)x"FF FE 00 00", BOM.UTF32LE}
-  ];
-
-  foreach (pair; map)
-    assert(tellBOM(pair.data) == pair.bom, Format("Failed at {0}", pair.data));
-}
diff -r b465c669d70c -r 8f86bb9ef715 trunk/src/dil/FileBOM.d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/trunk/src/dil/FileBOM.d Sat Dec 15 18:55:06 2007 +0100
@@ -0,0 +1,88 @@
+/++
+  Author: Aziz Köksal
+  License: GPL3
++/
+module dil.FileBOM;
+
+import common;
+
+/// Byte Order Mark
+enum BOM
+{
+  None,    /// No BOM
+  UTF8,    /// UTF-8: EF BB BF
+  UTF16BE, /// UTF-16 Big Endian: FE FF
+  UTF16LE, /// UTF-16 Little Endian: FF FE
+  UTF32BE, /// UTF-32 Big Endian: 00 00 FE FF
+  UTF32LE  /// UTF-32 Little Endian: FF FE 00 00
+}
+
+/// Returns the BOM found at the start of data, or BOM.None when there is
+/// none or when data is shorter than two bytes.
+BOM tellBOM(ubyte[] data)
+{
+  BOM bom = BOM.None;
+  if (data.length < 2)
+    return bom;
+
+  if (data[0..2] == cast(ubyte[2])x"FE FF")
+  {
+    bom = BOM.UTF16BE; // FE FF
+  }
+  else if (data[0..2] == cast(ubyte[2])x"FF FE")
+  {
+    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"00 00")
+      bom = BOM.UTF32LE; // FF FE 00 00
+    else
+      bom = BOM.UTF16LE; // FF FE XX XX
+  }
+  else if (data[0..2] == cast(ubyte[2])x"00 00")
+  {
+    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"FE FF")
+      bom = BOM.UTF32BE; // 00 00 FE FF
+  }
+  else if (data[0..2] == cast(ubyte[2])x"EF BB")
+  {
+    if (data.length >= 3 && data[2] == '\xBF')
+      bom = BOM.UTF8; // EF BB BF
+  }
+  return bom;
+}
+
+unittest
+{
+  Stdout("Testing function tellBOM().\n");
+
+  struct Data2BOM
+  {
+    ubyte[] data;
+    BOM bom;
+  }
+  alias ubyte[] ub;
+  const Data2BOM[] map = [
+    {cast(ub)x"12", BOM.None},
+    {cast(ub)x"12 34", BOM.None},
+    {cast(ub)x"00 00 FF FE", BOM.None},
+    {cast(ub)x"EF BB FF", BOM.None},
+
+    {cast(ub)x"EF", BOM.None},
+    {cast(ub)x"EF BB", BOM.None},
+    {cast(ub)x"FE", BOM.None},
+    {cast(ub)x"FF", BOM.None},
+    {cast(ub)x"00", BOM.None},
+    {cast(ub)x"00 00", BOM.None},
+    {cast(ub)x"00 00 FE", BOM.None},
+
+    {cast(ub)x"FE FF 00", BOM.UTF16BE},
+    {cast(ub)x"FE FF 00 FF", BOM.UTF16BE},
+
+    {cast(ub)x"EF BB BF", BOM.UTF8},
+    {cast(ub)x"FE FF", BOM.UTF16BE},
+    {cast(ub)x"FF FE", BOM.UTF16LE},
+    {cast(ub)x"00 00 FE FF", BOM.UTF32BE},
+    {cast(ub)x"FF FE 00 00", BOM.UTF32LE}
+  ];
+
+  foreach (pair; map)
+    assert(tellBOM(pair.data) == pair.bom, Format("Failed at {0}", pair.data));
+}
diff -r b465c669d70c -r 8f86bb9ef715 trunk/src/dil/Information.d
--- a/trunk/src/dil/Information.d Fri Dec 14 23:10:35 2007 +0100
+++ b/trunk/src/dil/Information.d Sat Dec 15 18:55:06 2007 +0100
@@ -17,6 +17,12 @@
 class InformationManager
 {
   Information[] info;
+
+  /// Appends info to the list; enables the syntax "infoMan ~= info".
+  void opCatAssign(Information info)
+  {
+    this.info ~= info;
+  }
 }
 
 class Problem : Information
diff -r b465c669d70c -r 8f86bb9ef715 trunk/src/dil/Unicode.d
--- a/trunk/src/dil/Unicode.d Fri Dec 14 23:10:35 2007 +0100
+++ b/trunk/src/dil/Unicode.d Sat Dec 15 18:55:06 2007 +0100
@@ -127,6 +127,60 @@
 }
 
 /// Encodes a character and appends it to str.
+void encode(ref char[] str, dchar c)
+{
+  assert(isValidChar(c), "check if character is valid before calling encode().");
+
+  char[6] b = void;
+  if (c < 0x80) // 1 byte: appended as is.
+    str ~= c;
+  else if (c < 0x800) // 2 bytes; "else" keeps 1-byte characters out of this branch.
+  {
+    b[0] = 0xC0 | (c >> 6);
+    b[1] = 0x80 | (c & 0x3F);
+    str ~= b[0..2];
+  }
+  else if (c < 0x10000) // 3 bytes.
+  {
+    b[0] = 0xE0 | (c >> 12);
+    b[1] = 0x80 | ((c >> 6) & 0x3F);
+    b[2] = 0x80 | (c & 0x3F);
+    str ~= b[0..3];
+  }
+  else if (c < 0x200000) // 4 bytes.
+  {
+    b[0] = 0xF0 | (c >> 18);
+    b[1] = 0x80 | ((c >> 12) & 0x3F);
+    b[2] = 0x80 | ((c >> 6) & 0x3F);
+    b[3] = 0x80 | (c & 0x3F);
+    str ~= b[0..4];
+  }
+  /+ // There are no 5 and 6 byte UTF-8 sequences yet.
+  else if (c < 0x4000000)
+  {
+    b[0] = 0xF8 | (c >> 24);
+    b[1] = 0x80 | ((c >> 18) & 0x3F);
+    b[2] = 0x80 | ((c >> 12) & 0x3F);
+    b[3] = 0x80 | ((c >> 6) & 0x3F);
+    b[4] = 0x80 | (c & 0x3F);
+    str ~= b[0..5];
+  }
+  else if (c < 0x80000000)
+  {
+    b[0] = 0xFC | (c >> 30);
+    b[1] = 0x80 | ((c >> 24) & 0x3F);
+    b[2] = 0x80 | ((c >> 18) & 0x3F);
+    b[3] = 0x80 | ((c >> 12) & 0x3F);
+    b[4] = 0x80 | ((c >> 6) & 0x3F);
+    b[5] = 0x80 | (c & 0x3F);
+    str ~= b[0..6];
+  }
+  +/
+  else
+    assert(0);
+}
+
+/// Encodes a character and appends it to str.
 void encode(ref wchar[] str, dchar c)
 in { assert(isValidChar(c)); }
 body