Mercurial > projects > dil
view trunk/src/dil/File.d @ 391:33b566df6af4
Migrated project to Tango.
Decremented the numbers of the format placeholders in the localized messages
by one.
Replaced all instances of writef/ln with Stdout.
Added module common.d with string aliases and a global Layout!(char) instance.
Replaced %s format specifiers with index placeholders in html/xml_tags.
Changed member Information.arguments to string message.
Copied std.metastring, std.uni and std.utf from Phobos.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sat, 15 Sep 2007 17:12:26 +0200 |
parents | 1b6e61915858 |
children | c99f8aeb7b4a |
line wrap: on
line source
/++
  Author: Aziz Köksal
  License: GPL3
+/
module dil.File;

import tango.io.File;
import std.utf;
import common;

/// Loads a file in any valid Unicode format and converts it to UTF-8.
/// Params:
///   filePath = path of the file to read.
/// Returns: the content of the file as UTF-8 text (see data2Utf8()).
char[] loadFile(char[] filePath)
{
  return data2Utf8(cast(ubyte[]) (new File(filePath)).read());
}

/// Converts raw file data to UTF-8 text.
///
/// The encoding is determined by tellBOM(). When no BOM is present the
/// encoding is guessed from the position of zero bytes at the start of
/// the data, relying on the first character being an ASCII character.
/// A detected BOM is not part of the returned text.
///
/// For UTF-8 input the result is a slice of data (no copy is made).
/// Params:
///   data = the raw bytes of the source text.
/// Returns: the text encoded as UTF-8.
/// Throws: Exception (from utf16BEtoLE()/utf32BEtoLE()) if big endian
///   data has a byte length that is not a multiple of the code unit size.
char[] data2Utf8(ubyte[] data)
{
  char[] text;
  BOM bom = tellBOM(data);

  switch (bom)
  {
  case BOM.None:
    // No BOM found. According to the specs the first character
    // must be an ASCII character.
    if (data.length >= 4)
    {
      if (data[0..3] == cast(ubyte[3])x"00 00 00")
      {
        text = toUTF8(cast(dchar[])utf32BEtoLE(data)); // UTF-32BE: 00 00 00 XX
        break;
      }
      else if (data[1..4] == cast(ubyte[3])x"00 00 00")
      {
        text = toUTF8(cast(dchar[])data); // UTF-32LE: XX 00 00 00
        break;
      }
    }
    // The UTF-32 patterns must be tested first, because they also
    // match the UTF-16 patterns below.
    if (data.length >= 2)
    {
      if (data[0] == 0) // UTF-16BE: 00 XX
      {
        text = toUTF8(cast(wchar[])utf16BEtoLE(data));
        break;
      }
      else if (data[1] == 0) // UTF-16LE: XX 00
      {
        text = toUTF8(cast(wchar[])data);
        break;
      }
    }
    text = cast(char[])data; // UTF-8
    break;
  case BOM.UTF8:
    text = cast(char[])data[3..$]; // Skip the 3 BOM bytes.
    break;
  case BOM.UTF16BE:
    text = toUTF8(cast(wchar[])utf16BEtoLE(data[2..$]));
    break;
  case BOM.UTF16LE:
    text = toUTF8(cast(wchar[])data[2..$]);
    break;
  case BOM.UTF32BE:
    text = toUTF8(cast(dchar[])utf32BEtoLE(data[4..$]));
    break;
  case BOM.UTF32LE:
    text = toUTF8(cast(dchar[])data[4..$]);
    break;
  default:
    assert(0);
  }
  assert(text !is null);
  return text;
}

unittest
{
  Stdout("Testing function data2Utf8().\n");
  struct Data2Text
  {
    union
    {
      ubyte[] data;
      char[] u8;
    }
    char[] text;
  }
  const Data2Text[] map = [
    // Without BOM
    {u8:"source", text:"source"},
    {u8:"s\0o\0u\0r\0c\0e\0", text:"source"},
    {u8:"\0s\0o\0u\0r\0c\0e", text:"source"},
    {u8:"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"},
    {u8:"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"},
    // With BOM
    {u8:"\xEF\xBB\xBFsource", text:"source"},
    {u8:"\xFE\xFF\0s\0o\0u\0r\0c\0e", text:"source"},
    {u8:"\xFF\xFEs\0o\0u\0r\0c\0e\0", text:"source"},
    {u8:"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e", text:"source"},
    {u8:"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0", text:"source"},
  ];
  alias data2Utf8 f;
  foreach (pair; map)
    assert(f(pair.data) == pair.text);
}

/// Swaps the two bytes of every UTF-16 code unit in data.
/// Params:
///   data = UTF-16 big endian encoded text.
/// Returns: a newly allocated array with the byte-swapped code units.
///   The input array is not modified.
/// Throws: Exception if the length of data is not a multiple of 2.
ubyte[] utf16BEtoLE(ubyte[] data)
{
  if (data.length % 2)
    throw new Exception("The byte length of a UTF-16 big endian source file must be divisible by 2.");
  wchar[] result = cast(wchar[]) new ubyte[data.length];
  assert(result.length*2 == data.length);
  // BE to LE "1A 2B" -> "2B 1A"
  foreach (i, c; cast(ushort[]) data)
    result[i] = (c << 8) | (c >> 8);
  return cast(ubyte[]) result;
}

/// Reverses the four bytes of every UTF-32 code unit in data.
/// Params:
///   data = UTF-32 big endian encoded text.
/// Returns: a newly allocated array with the byte-reversed code units.
///   The input array is not modified.
/// Throws: Exception if the length of data is not a multiple of 4.
ubyte[] utf32BEtoLE(ubyte[] data)
{
  if (data.length % 4)
    throw new Exception("The byte length of a UTF-32 big endian source file must be divisible by 4.");
  dchar[] result = cast(dchar[]) new ubyte[data.length];
  assert(result.length*4 == data.length);
  // BE to LE "1A 2B 3C 4D" -> "4D 3C 2B 1A"
  // Every byte has to be moved to its mirrored position. Merely OR-ing
  // the four bytes together only works by accident for code points
  // below U+0100, where three of the four bytes are zero.
  foreach (i, c; cast(uint[]) data)
    result[i] = (c << 24) |            // Byte 0 -> byte 3
                ((c & 0xFF00) << 8) |  // Byte 1 -> byte 2
                ((c >> 8) & 0xFF00) |  // Byte 2 -> byte 1
                (c >> 24);             // Byte 3 -> byte 0
  return cast(ubyte[]) result;
}

unittest
{
  Stdout("Testing function utf32BEtoLE().\n");
  // ASCII: only one non-zero byte.
  assert(utf32BEtoLE(cast(ubyte[])x"00 00 00 73") == cast(ubyte[])x"73 00 00 00");
  // U+20AC: two non-zero bytes.
  assert(utf32BEtoLE(cast(ubyte[])x"00 00 20 AC") == cast(ubyte[])x"AC 20 00 00");
  // All four bytes differ.
  assert(utf32BEtoLE(cast(ubyte[])x"1A 2B 3C 4D") == cast(ubyte[])x"4D 3C 2B 1A");
}

/// Byte Order Mark
enum BOM
{
  None,    /// No BOM
  UTF8,    /// UTF-8: EF BB BF
  UTF16BE, /// UTF-16 Big Endian: FE FF
  UTF16LE, /// UTF-16 Little Endian: FF FE
  UTF32BE, /// UTF-32 Big Endian: 00 00 FE FF
  UTF32LE  /// UTF-32 Little Endian: FF FE 00 00
}

/// Determines which byte order mark, if any, data starts with.
/// Params:
///   data = the raw bytes to inspect.
/// Returns: the detected BOM, or BOM.None if data is shorter than
///   2 bytes or doesn't start with one of the known marks.
BOM tellBOM(ubyte[] data)
{
  BOM bom = BOM.None;
  if (data.length < 2)
    return bom;

  if (data[0..2] == cast(ubyte[2])x"FE FF")
  {
    bom = BOM.UTF16BE; // FE FF
  }
  else if (data[0..2] == cast(ubyte[2])x"FF FE")
  {
    // FF FE is a prefix of the UTF-32LE mark,
    // so the next two bytes decide.
    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"00 00")
      bom = BOM.UTF32LE; // FF FE 00 00
    else
      bom = BOM.UTF16LE; // FF FE XX XX
  }
  else if (data[0..2] == cast(ubyte[2])x"00 00")
  {
    if (data.length >= 4 && data[2..4] == cast(ubyte[2])x"FE FF")
      bom = BOM.UTF32BE; // 00 00 FE FF
  }
  else if (data[0..2] == cast(ubyte[2])x"EF BB")
  {
    if (data.length >= 3 && data[2] == '\xBF')
      bom = BOM.UTF8; // EF BB BF
  }
  return bom;
}

unittest
{
  Stdout("Testing function tellBOM().\n");

  struct Data2BOM
  {
    ubyte[] data;
    BOM bom;
  }
  alias ubyte[] ub;
  const Data2BOM[] map = [
    {cast(ub)x"12",          BOM.None},
    {cast(ub)x"12 34",       BOM.None},
    {cast(ub)x"00 00 FF FE", BOM.None},
    {cast(ub)x"EF BB FF",    BOM.None},

    {cast(ub)x"EF",          BOM.None},
    {cast(ub)x"EF BB",       BOM.None},
    {cast(ub)x"FE",          BOM.None},
    {cast(ub)x"FF",          BOM.None},
    {cast(ub)x"00",          BOM.None},
    {cast(ub)x"00 00",       BOM.None},
    {cast(ub)x"00 00 FE",    BOM.None},

    {cast(ub)x"FE FF 00",    BOM.UTF16BE},
    {cast(ub)x"FE FF 00 FF", BOM.UTF16BE},

    {cast(ub)x"EF BB BF",    BOM.UTF8},
    {cast(ub)x"FE FF",       BOM.UTF16BE},
    {cast(ub)x"FF FE",       BOM.UTF16LE},
    {cast(ub)x"00 00 FE FF", BOM.UTF32BE},
    {cast(ub)x"FF FE 00 00", BOM.UTF32LE}
  ];

  foreach (pair; map)
    assert(tellBOM(pair.data) == pair.bom, Format("Failed at {0}", pair.data));
}