Mercurial > projects > dil
changeset 361:d93dd84cd5f2
- Separated part of loadFile() into its own function, data2text(). Made some fixes to the code as well.
- Added unittest for data2text().
- Fix in utf32BEtoLE(): the wrong bitwise operations were used for the conversion.
- Fix in tellBOM(): UTF32LE starts with FF FE, not FE FF.
- Some fixes to German translation.
author | aziz |
---|---|
date | Tue, 28 Aug 2007 15:29:01 +0000 |
parents | b6a3755eba94 |
children | 1b6e61915858 |
files | trunk/src/dil/File.d trunk/src/lang_de.d |
diffstat | 2 files changed, 66 insertions(+), 20 deletions(-) [+] |
line wrap: on
line diff
--- a/trunk/src/dil/File.d Tue Aug 28 11:44:00 2007 +0000 +++ b/trunk/src/dil/File.d Tue Aug 28 15:29:01 2007 +0000 @@ -8,7 +8,11 @@ /// Loads a file in any valid Unicode format and converts it to UTF-8. char[] loadFile(char[] fileName) { - ubyte[] data = cast(ubyte[]) std.file.read(fileName); + return data2text(cast(ubyte[]) std.file.read(fileName)); +} + +char[] data2text(ubyte[] data) +{ char[] text; BOM bom = tellBOM(data); @@ -20,23 +24,30 @@ if (data.length >= 4) { if (data[0..3] == cast(ubyte[3])x"00 00 00") + { text = toUTF8(cast(dchar[])utf32BEtoLE(data)); // UTF-32BE: 00 00 00 XX + break; + } else if (data[1..4] == cast(ubyte[3])x"00 00 00") + { text = toUTF8(cast(dchar[])data); // UTF-32LE: XX 00 00 00 - else - text = cast(char[])data; // UTF-8 + break; + } } - else if (data.length >= 2) + if (data.length >= 2) { if (data[0] == 0) // UTF-16BE: 00 XX + { text = toUTF8(cast(wchar[])utf16BEtoLE(data)); + break; + } else if (data[1] == 0) // UTF-16LE: XX 00 + { text = toUTF8(cast(wchar[])data); - else - text = cast(char[])data; // UTF-8 + break; + } } - else - text = cast(char[])data; // UTF-8 + text = cast(char[])data; // UTF-8 break; case BOM.UTF8: text = cast(char[])data[3..$]; @@ -60,11 +71,43 @@ return text; } +unittest +{ + writefln("Testing function data2text()."); + struct Data2Text + { + union + { + ubyte[] data; + char[] u8; + } + char[] text; + } + const Data2Text[] map = [ + // Without BOM + {u8:"source", text:"source"}, + {u8:"s\0o\0u\0r\0c\0e\0", text:"source"}, + {u8:"\0s\0o\0u\0r\0c\0e", text:"source"}, + {u8:"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, + {u8:"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, + // With BOM + {u8:"\xEF\xBB\xBFsource", text:"source"}, + {u8:"\xFE\xFF\0s\0o\0u\0r\0c\0e", text:"source"}, + {u8:"\xFF\xFEs\0o\0u\0r\0c\0e\0", text:"source"}, + {u8:"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"}, + 
{u8:"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"}, + ]; + alias data2text f; + foreach (pair; map) + assert(f(pair.data) == pair.text); +} + ubyte[] utf16BEtoLE(ubyte[] data) { if (data.length % 2) throw new Exception("UTF-16 big endian source file byte length must be divisible by 2."); wchar[] result = cast(wchar[]) new ubyte[data.length]; + assert(result.length*2 == data.length); // BE to LE "1A 2B" -> "2B 1A" foreach (i, c; cast(wchar[]) data) result[i] = (c << 8) | (c >> 8); @@ -76,12 +119,13 @@ if (data.length % 4) throw new Exception("UTF-32 big endian source file byte length must be divisible by 4."); dchar[] result = cast(dchar[]) new ubyte[data.length]; + assert(result.length*4 == data.length); // BE to LE "1A 2B 3C 4D" -> "4D 3C 2B 1A" foreach (i, c; cast(dchar[]) data) - result[i] = ((c & 0xFF) << 24) | - ((c & 0xFF00) << 16) | - ((c & 0xFF0000) << 8) | - (c & 0xFF000000); + result[i] = ((c & 0xFF)) | + ((c >> 8) & 0xFF) | + ((c >> 16) & 0xFF) | + (c >> 24); return cast(ubyte[]) result; } @@ -104,13 +148,15 @@ if (data[0..2] == cast(ubyte[2])x"FE FF") { - if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"00 00") - bom = BOM.UTF32LE; // FE FF 00 00 - else - bom = BOM.UTF16BE; // FE FF XX XX + bom = BOM.UTF16BE; // FE FF } else if (data[0..2] == cast(ubyte[2])x"FF FE") - bom = BOM.UTF16LE; // FF FE + { + if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"00 00") + bom = BOM.UTF32LE; // FF FE 00 00 + else + bom = BOM.UTF16LE; // FF FE XX XX + } else if (data[0..2] == cast(ubyte[2])x"00 00") { if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"FE FF") @@ -155,7 +201,7 @@ {cast(ub)x"FE FF", BOM.UTF16BE}, {cast(ub)x"FF FE", BOM.UTF16LE}, {cast(ub)x"00 00 FE FF", BOM.UTF32BE}, - {cast(ub)x"FE FF 00 00", BOM.UTF32LE} + {cast(ub)x"FF FE 00 00", BOM.UTF32LE} ]; foreach (pair; map)
--- a/trunk/src/lang_de.d Tue Aug 28 11:44:00 2007 +0000 +++ b/trunk/src/lang_de.d Tue Aug 28 15:29:01 2007 +0000 @@ -57,8 +57,8 @@ "Template-Tupel-Parameter dürfen nur am Ende auftreten.", "der 'in'-Vertrag der Funktion wurde bereits geparsed.", "der 'out'-Vertrag der Funktion wurde bereits geparsed.", - "es wurde kein Verknüpfungstyp angegeben.", - "unbekannter Verknüpfungstyp '{1}'; gültig sind C, C++, D, Windows, Pascal und System.", + "es wurde kein Verbindungstyp angegeben.", + "unbekannter Verbindungstyp '{1}'; gültig sind C, C++, D, Windows, Pascal und System.", // Help messages: `dil v{1}