Mercurial > projects > dil
view trunk/src/dil/Converter.d @ 786:3b34f6a95a27
Added and revised documentation comments.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sun, 24 Feb 2008 02:41:11 +0100 |
parents | 4579e8505d5e |
children |
line wrap: on
line source
/++
  Author: Aziz Köksal
  License: GPL3
+/
module dil.Converter;

import dil.Information;
import dil.Location;
import dil.Unicode;
import dil.FileBOM;
import dil.lexer.Funcs;
import dil.Messages;
import common;

/// Converts various Unicode encoding formats to UTF-8.
///
/// Conversion problems are appended to infoMan as LexerError objects;
/// the offending code units are replaced with REPLACEMENT_CHAR.
struct Converter
{
  char[] filePath; /// For error messages.
  InfoManager infoMan; /// Collects the errors found while converting.

  /// Constructs a Converter for the given file path and error collector.
  static Converter opCall(char[] filePath, InfoManager infoMan)
  {
    Converter conv;
    conv.filePath = filePath;
    conv.infoMan = infoMan;
    return conv;
  }

  /// Byte-swaps c (reverses the order of its four bytes.)
  dchar swapBytes(dchar c)
  {
    return (c << 24) |
           ((c >> 8) & 0xFF00) |
           ((c << 8) & 0xFF0000) |
           (c >> 24);
  }

  /// Byte-swaps c (exchanges its two bytes.)
  wchar swapBytes(wchar c)
  {
    return (c << 8) | (c >> 8);
  }

  /// Swaps the bytes of c on a little-endian machine.
  dchar BEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  dchar LEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Swaps the bytes of c on a little-endian machine.
  wchar BEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  wchar LEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts a UTF-32 text to UTF-8.
  ///
  /// Invalid characters are reported and replaced with REPLACEMENT_CHAR.
  /// Trailing bytes that don't form a complete code unit are ignored,
  /// and an error is reported for them.
  /// Returns: the converted text, or null if data is empty.
  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] result;
    uint lineNum = 1;
    dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4.

    foreach (dchar c; text)
    {
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        infoMan ~= new LexerError(
          new Location(filePath, lineNum),
          Format(MSG.InvalidUTF32Character, c)
        );
        c = REPLACEMENT_CHAR;
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 4)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF32FileMustBeDivisibleBy4
      );

    return result;
  }

  alias UTF32toUTF8!(true) UTF32BEtoUTF8; /// Instantiation for UTF-32 BE.
  alias UTF32toUTF8!(false) UTF32LEtoUTF8; /// Instantiation for UTF-32 LE.

  /// Converts a UTF-16 text to UTF-8.
  ///
  /// Every unpaired surrogate (a low surrogate without a preceding high
  /// surrogate, or a high surrogate that is not followed by a low surrogate)
  /// is reported and replaced with REPLACEMENT_CHAR.
  /// A trailing odd byte is ignored, and an error is reported for it.
  /// Returns: the converted text, or null if data is empty.
  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
    wchar* p = text.ptr,
         end = text.ptr + text.length;
    char[] result;
    uint lineNum = 1;

    for (; p < end; p++)
    {
      dchar c = *p;
      static if (isBigEndian)
        c = BEtoMachineWord(c);
      else
        c = LEtoMachineWord(c);

      if (0xD800 <= c && c <= 0xDFFF)
      { // c is a surrogate code unit. It's only valid as part of a pair.
        bool isValidPair = false;

        if (c <= 0xDBFF && p+1 < end)
        { // c is a high surrogate. Try to decode the surrogate pair.
          wchar c2 = p[1];
          static if (isBigEndian)
            c2 = BEtoMachineWord(c2);
          else
            c2 = LEtoMachineWord(c2);

          if (0xDC00 <= c2 && c2 <= 0xDFFF)
          { // (c - 0xD7C0) << 10 == ((c - 0xD800) << 10) + 0x10000
            c = (c - 0xD7C0) << 10;
            c |= (c2 & 0x3FF);
            ++p; // Consume the low surrogate.
            isValidPair = true;
          }
        }

        if (!isValidPair)
        { // Unpaired surrogate. Don't let it end up in the UTF-8 output.
          infoMan ~= new LexerError(
            new Location(filePath, lineNum),
            Format(MSG.InvalidUTF16Character, c)
          );
          c = REPLACEMENT_CHAR;
        }
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 2)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF16FileMustBeDivisibleBy2
      );

    return result;
  }

  alias UTF16toUTF8!(true) UTF16BEtoUTF8; /// Instantiation for UTF-16 BE.
  alias UTF16toUTF8!(false) UTF16LEtoUTF8; /// Instantiation for UTF-16 LE.

  /// Converts the text in data to UTF-8.
  /// Leaves data unchanged if it is in UTF-8 already.
  ///
  /// The encoding is determined by the BOM. If there is none, it is guessed
  /// from the position of the zero bytes in the first character.
  /// Returns: the UTF-8 text, or "" if data is empty.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return "";

    char[] text;
    BOM bom = tellBOM(data);

    switch (bom)
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3 bytes of the BOM.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]); // Skip the 2 bytes of the BOM.
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]);
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]); // Skip the 4 bytes of the BOM.
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]);
      break;
    default:
      assert(0);
    }
    return text;
  }
}

/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space,)
/// and Newlines with '\n'.
///
/// The text is modified in place and never grows. An invalid sequence,
/// which is too short to hold the replacement character, is removed.
/// Returns: the sanitized slice of text, or null if text is empty.
string sanitizeText(string text)
{
  if (!text.length)
    return null;

  char* p = text.ptr; // Reading position.
  char* end = p + text.length;
  char* q = p; // Writing position. Never ahead of p.

  for (; p < end; p++, q++)
  {
    assert(q <= p);
    switch (*p)
    {
    case '\r':
      if (p+1 < end && p[1] == '\n')
        p++; // Treat "\r\n" as a single newline.
      // Fall through.
    case '\n':
      *q = '\n';
      continue;
    default:
      if (isascii(*p))
        break;
      if (p+2 < end && isUnicodeNewline(p))
      { // Skip the other two bytes of the newline sequence.
        p += 2;
        goto case '\n';
      }
      auto p2 = p; // Beginning of the UTF-8 sequence.
      dchar c = decode(p, end);
      if (c == ERROR_CHAR)
      { // Skip to next ASCII character or valid UTF-8 sequence.
        while (++p < end && isTrailByte(*p))
        {}
        alias REPLACEMENT_STR R;
        if (q+2 < p) // Copy replacement char if there is enough space.
          (*q = R[0]), (*++q = R[1]), (*++q = R[2]);
        else
          q--; // Output nothing. Cancels the q++ of the loop, so that
               // no stale byte is left behind in the result.
        p--; // The loop increments p again.
      }
      else
      { // Copy the valid UTF-8 sequence.
        while (p2 <= p) // p points to the last trail byte.
          *q++ = *p2++; // Copy code units.
        q--; // The loop increments q again.
      }
      continue;
    }
    assert(isascii(*p));
    *q = *p;
  }
  assert(p == end);
  // Cut off the bytes that were saved by the replacements.
  text.length = text.length - (p - q);
  //text = text.ptr[0 .. q - text.ptr]; // Another way.
  return text;
}

unittest
{
  Stdout("Testing function Converter.\n");
  struct Data2Text
  {
    char[] text;
    char[] expected = "source";
    ubyte[] data()
    { return cast(ubyte[])text; }
  }
  const Data2Text[] map = [
    // Without BOM
    {"source"},
    {"s\0o\0u\0r\0c\0e\0"},
    {"\0s\0o\0u\0r\0c\0e"},
    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    // With BOM
    {"\xEF\xBB\xBFsource"},
    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
  ];
  auto converter = Converter("", new InfoManager);
  foreach (i, pair; map)
    assert(converter.data2UTF8(pair.data) == pair.expected, Format("failed at item {}", i));

  // A high surrogate, which isn't followed by a low surrogate,
  // must be replaced with the replacement character.
  char[] loneSurrogate = x"FF FE 00 D8 61 00"; // UTF-16LE BOM, U+D800, 'a'
  char[] replaced;
  dil.Unicode.encode(replaced, REPLACEMENT_CHAR);
  replaced ~= 'a';
  assert(converter.data2UTF8(cast(ubyte[])loneSurrogate) == replaced,
         "unpaired high surrogate was not replaced");
}