projects/dil: src/dil/Converter.d comparison

comparison src/dil/Converter.d @ 806:bcb74c9b895c

Moved out files in the trunk folder to the root.

author	Aziz Köksal <aziz.koeksal@gmail.com>
date	Sun, 09 Mar 2008 00:12:19 +0100
parents	trunk/src/dil/Converter.d@3b34f6a95a27
children

comparison

equal deleted inserted replaced

-:a3fab8b74a7d
+:bcb74c9b895c
+/++
+Author: Aziz Köksal
+License: GPL3
++/
+module dil.Converter;
+import dil.Information;
+import dil.Location;
+import dil.Unicode;
+import dil.FileBOM;
+import dil.lexer.Funcs;
+import dil.Messages;
+import common;
+/// Converts various Unicode encoding formats to UTF-8.
+///
+/// Problems found while converting are appended to infoMan as LexerError
+/// objects; the offending code units are replaced with REPLACEMENT_CHAR.
+struct Converter
+{
+  char[] filePath; /// For error messages.
+  InfoManager infoMan; /// Collects the errors reported during conversion.
+
+  /// Constructs a Converter.
+  /// Params:
+  ///   filePath = the path used in the locations of reported errors.
+  ///   infoMan = the object that reported errors are appended to.
+  static Converter opCall(char[] filePath, InfoManager infoMan)
+  {
+    Converter conv;
+    conv.filePath = filePath;
+    conv.infoMan = infoMan;
+    return conv;
+  }
+
+  /// Byte-swaps c (reverses the order of its four bytes.)
+  dchar swapBytes(dchar c)
+  {
+    return c = (c << 24) |
+               ((c >> 8) & 0xFF00) |
+               ((c << 8) & 0xFF0000) |
+               (c >> 24);
+  }
+
+  /// Byte-swaps c (exchanges its two bytes.)
+  wchar swapBytes(wchar c)
+  {
+    return (c << 8) | (c >> 8);
+  }
+
+  /// Swaps the bytes of c on a little-endian machine.
+  dchar BEtoMachineDword(dchar c)
+  {
+    version(LittleEndian)
+      return swapBytes(c);
+    else
+      return c;
+  }
+
+  /// Swaps the bytes of c on a big-endian machine.
+  dchar LEtoMachineDword(dchar c)
+  {
+    version(LittleEndian)
+      return c;
+    else
+      return swapBytes(c);
+  }
+
+  /// Swaps the bytes of c on a little-endian machine.
+  wchar BEtoMachineWord(wchar c)
+  {
+    version(LittleEndian)
+      return swapBytes(c);
+    else
+      return c;
+  }
+
+  /// Swaps the bytes of c on a big-endian machine.
+  wchar LEtoMachineWord(wchar c)
+  {
+    version(LittleEndian)
+      return c;
+    else
+      return swapBytes(c);
+  }
+
+  /// Converts a UTF-32 text to UTF-8.
+  ///
+  /// Invalid characters are reported and replaced with REPLACEMENT_CHAR.
+  /// Trailing bytes that don't form a complete 4-byte unit are ignored,
+  /// but an error is reported for them.
+  /// Params:
+  ///   data = the raw bytes of the text (without a BOM.)
+  /// Returns: the UTF-8 text, or null if data is empty.
+  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
+  {
+    if (data.length == 0)
+      return null;
+
+    char[] result;
+    uint lineNum = 1; // Only used for the locations of error messages.
+    dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4.
+
+    foreach (dchar c; text)
+    {
+      static if (isBigEndian)
+        c = BEtoMachineDword(c);
+      else
+        c = LEtoMachineDword(c);
+
+      if (!isValidChar(c))
+      {
+        infoMan ~= new LexerError(
+          new Location(filePath, lineNum),
+          Format(MSG.InvalidUTF32Character, c)
+        );
+        c = REPLACEMENT_CHAR;
+      }
+
+      if (isNewline(c))
+        ++lineNum;
+      dil.Unicode.encode(result, c);
+    }
+
+    if (data.length % 4)
+      infoMan ~= new LexerError(
+        new Location(filePath, lineNum),
+        MSG.UTF32FileMustBeDivisibleBy4
+      );
+
+    return result;
+  }
+
+  alias UTF32toUTF8!(true) UTF32BEtoUTF8; /// Instantiation for UTF-32 BE.
+  alias UTF32toUTF8!(false) UTF32LEtoUTF8; /// Instantiation for UTF-32 LE.
+
+  /// Converts a UTF-16 text to UTF-8.
+  ///
+  /// Unpaired surrogates (a low surrogate without a preceding high surrogate,
+  /// a high surrogate at the end of the text, or a high surrogate that is not
+  /// followed by a low surrogate) are reported and replaced
+  /// with REPLACEMENT_CHAR.
+  /// A trailing odd byte is ignored, but an error is reported for it.
+  /// Params:
+  ///   data = the raw bytes of the text (without a BOM.)
+  /// Returns: the UTF-8 text, or null if data is empty.
+  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
+  {
+    if (data.length == 0)
+      return null;
+
+    wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
+    wchar* p = text.ptr,
+         end = text.ptr + text.length;
+    char[] result;
+    uint lineNum = 1; // Only used for the locations of error messages.
+
+    for (; p < end; p++)
+    {
+      dchar c = *p;
+      static if (isBigEndian)
+        c = BEtoMachineWord(c);
+      else
+        c = LEtoMachineWord(c);
+
+      bool isValid = true;
+      if (0xD800 > c || c > 0xDFFF)
+      {} // Not a surrogate. c is a complete code point already.
+      else if (c <= 0xDBFF && p+1 < end)
+      { // High surrogate. Try to decode a surrogate pair.
+        wchar c2 = p[1];
+        static if (isBigEndian)
+          c2 = BEtoMachineWord(c2);
+        else
+          c2 = LEtoMachineWord(c2);
+
+        if (0xDC00 <= c2 && c2 <= 0xDFFF)
+        {
+          c = (c - 0xD7C0) << 10;
+          c |= (c2 & 0x3FF);
+          ++p; // Skip the low surrogate.
+        }
+        else // Unpaired high surrogate. Don't consume c2;
+          isValid = false; // it is examined in the next iteration.
+      }
+      else // Lone low surrogate, or a high surrogate at the end of the text.
+        isValid = false;
+
+      if (!isValid)
+      {
+        infoMan ~= new LexerError(
+          new Location(filePath, lineNum),
+          Format(MSG.InvalidUTF16Character, c)
+        );
+        c = REPLACEMENT_CHAR;
+      }
+
+      if (isNewline(c))
+        ++lineNum;
+      dil.Unicode.encode(result, c);
+    }
+
+    if (data.length % 2)
+      infoMan ~= new LexerError(
+        new Location(filePath, lineNum),
+        MSG.UTF16FileMustBeDivisibleBy2
+      );
+
+    return result;
+  }
+
+  alias UTF16toUTF8!(true) UTF16BEtoUTF8; /// Instantiation for UTF-16 BE.
+  alias UTF16toUTF8!(false) UTF16LEtoUTF8; /// Instantiation for UTF-16 LE.
+
+  /// Converts the text in data to UTF-8.
+  /// Leaves data unchanged if it is in UTF-8 already.
+  ///
+  /// The encoding is determined by the BOM. If there is none, it is guessed
+  /// from the position of the zero bytes in the first two or four bytes.
+  /// Params:
+  ///   data = the raw bytes of the text.
+  /// Returns: the UTF-8 text (without a BOM), or "" if data is empty.
+  char[] data2UTF8(ubyte[] data)
+  {
+    if (data.length == 0)
+      return "";
+
+    char[] text;
+    BOM bom = tellBOM(data);
+
+    switch (bom)
+    {
+    case BOM.None:
+      // No BOM found. According to the specs the first character
+      // must be an ASCII character.
+      if (data.length >= 4)
+      {
+        if (data[0..3] == cast(ubyte[3])x"00 00 00")
+        {
+          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
+          break;
+        }
+        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
+        {
+          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
+          break;
+        }
+      }
+      if (data.length >= 2)
+      {
+        if (data[0] == 0) // UTF-16BE: 00 XX
+        {
+          text = UTF16BEtoUTF8(data);
+          break;
+        }
+        else if (data[1] == 0) // UTF-16LE: XX 00
+        {
+          text = UTF16LEtoUTF8(data);
+          break;
+        }
+      }
+      text = cast(char[])data; // UTF-8
+      break;
+    case BOM.UTF8:
+      text = cast(char[])data[3..$]; // Skip the 3 bytes of the BOM.
+      break;
+    case BOM.UTF16BE:
+      text = UTF16BEtoUTF8(data[2..$]); // Skip the 2 bytes of the BOM.
+      break;
+    case BOM.UTF16LE:
+      text = UTF16LEtoUTF8(data[2..$]);
+      break;
+    case BOM.UTF32BE:
+      text = UTF32BEtoUTF8(data[4..$]); // Skip the 4 bytes of the BOM.
+      break;
+    case BOM.UTF32LE:
+      text = UTF32LEtoUTF8(data[4..$]);
+      break;
+    default:
+      assert(0);
+    }
+    return text;
+  }
+}
+/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space),
+/// and Newlines with '\n'.
+///
+/// The text is rewritten in place: p is the read position and q is the
+/// write position, which never gets ahead of p. At the end the length
+/// of text is reduced by the number of bytes that were dropped.
+/// Returns: the sanitized text, or null if text is empty.
+string sanitizeText(string text)
+{
+if (!text.length)
+return null;
+char* p = text.ptr; // Read position.
+char* end = p + text.length;
+char* q = p; // Write position.
+for (; p < end; p++, q++)
+{
+assert(q <= p);
+switch (*p)
+{
+case '\r':
+// Treat "\r\n" as a single newline by skipping the '\r'.
+if (p+1 < end && p[1] == '\n')
+p++;
+// Intentional fall-through to case '\n'.
+case '\n':
+*q = '\n';
+continue;
+default:
+if (isascii(*p))
+break; // Plain ASCII is copied after the switch statement.
+// Three bytes are inspected here (p[0], p[1] and p[2] must be in bounds.)
+if (p+2 < end && isUnicodeNewline(p))
+{
+p += 2; // Move to the last byte of the 3-byte newline sequence.
+goto case '\n';
+}
+auto p2 = p; // Beginning of the UTF-8 sequence.
+// NOTE(review): decode() is defined elsewhere; presumably it advances p
+// to the last byte of a valid sequence (see the comment on the
+// copy loop below) - confirm.
+dchar c = decode(p, end);
+if (c == ERROR_CHAR)
+{ // Skip to next ASCII character or valid UTF-8 sequence.
+while (++p < end && isTrailByte(*p))
+{}
+alias REPLACEMENT_STR R;
+// NOTE(review): if there isn't enough space, nothing is written at q,
+// but q is still advanced by the loop increment, so the byte that is
+// already stored at that position ends up in the result.
+// Confirm that this is intended.
+if (q+2 < p) // Copy replacement char if there is enough space.
+(*q = R[0]), (*++q = R[1]), (*++q = R[2]);
+p--; // Compensate for the p++ of the for-loop.
+}
+else
+{ // Copy the valid UTF-8 sequence.
+while (p2 <= p) // p points to the last trail byte.
+*q++ = *p2++; // Copy code units.
+q--; // Compensate for the q++ of the for-loop.
+}
+continue;
+}
+assert(isascii(*p));
+*q = *p;
+}
+assert(p == end);
+// p - q is the number of bytes by which the text has shrunk.
+text.length = text.length - (p - q);
+//text = text.ptr[0 .. q - text.ptr]; // Another way.
+return text;
+}
+/// Checks that data2UTF8() produces "source" for the same text stored
+/// in each supported encoding, with and without a BOM.
+unittest
+{
+  Stdout("Testing function Converter.\n");
+
+  /// A raw input text and the UTF-8 text it should be converted to.
+  struct TestCase
+  {
+    char[] input;
+    char[] expected = "source";
+
+    /// Returns the input text as raw bytes.
+    ubyte[] bytes()
+    {
+      return cast(ubyte[])input;
+    }
+  }
+
+  const TestCase[] testCases = [
+    // Without BOM
+    {"source"},                                       // UTF-8
+    {"s\0o\0u\0r\0c\0e\0"},                           // UTF-16LE
+    {"\0s\0o\0u\0r\0c\0e"},                           // UTF-16BE
+    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},   // UTF-32LE
+    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},   // UTF-32BE
+    // With BOM
+    {"\xEF\xBB\xBFsource"},
+    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
+    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
+    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
+    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
+  ];
+
+  auto conv = Converter("", new InfoManager);
+  foreach (idx, testCase; testCases)
+  {
+    auto actual = conv.data2UTF8(testCase.bytes);
+    assert(actual == testCase.expected, Format("failed at item {}", idx));
+  }
+}

Mercurial > projects > dil

comparison src/dil/Converter.d @ 806:bcb74c9b895c