Mercurial > projects > dil
view trunk/src/dil/Converter.d @ 739:49fe21aa387c
Added sanitizeText() to dil.Converter.
Cleaned predefined.ddoc up a bit.
Removed makeString() from dil.doc.Macro.
Added REPLACEMENT_STR to dil.Unicode.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sat, 09 Feb 2008 14:24:35 +0100 |
parents | 9e811db780a6 |
children | 90668b83ae5e |
line wrap: on
line source
/++
  Author: Aziz Köksal
  License: GPL3
+/
module dil.Converter;

import dil.Information;
import dil.Location;
import dil.Unicode;
import dil.FileBOM;
import dil.lexer.Funcs;
import dil.Messages;
import common;

/// Converts various Unicode encoding formats to UTF-8.
///
/// Conversion problems are not thrown; they are appended to infoMan as
/// LexerError objects carrying filePath and the current line number.
struct Converter
{
  char[] filePath; /// For error messages.
  InfoManager infoMan; /// Collects the errors found during conversion.

  /// Constructs a Converter for the given file path and info manager.
  static Converter opCall(char[] filePath, InfoManager infoMan)
  {
    Converter conv;
    conv.filePath = filePath;
    conv.infoMan = infoMan;
    return conv;
  }

  /// Reverses the order of the four bytes of c.
  dchar swapBytes(dchar c)
  {
    return (c << 24) |
          ((c >> 8) & 0xFF00) |
          ((c << 8) & 0xFF0000) |
           (c >> 24);
  }

  /// Reverses the order of the two bytes of c.
  wchar swapBytes(wchar c)
  {
    return (c << 8) | (c >> 8);
  }

  /// Converts a big-endian 32-bit code unit to the machine's byte order.
  /// Returns a dchar: returning a wchar here would truncate every
  /// code point above U+FFFF to 16 bits.
  dchar BEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Converts a little-endian 32-bit code unit to the machine's byte order.
  /// Returns a dchar for the same reason as BEtoMachineDword().
  dchar LEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts a big-endian 16-bit code unit to the machine's byte order.
  wchar BEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Converts a little-endian 16-bit code unit to the machine's byte order.
  wchar LEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Converts UTF-32 encoded data to UTF-8.
  /// Params:
  ///   isBigEndian = byte order of the input (template parameter).
  ///   data = the raw bytes, without a BOM.
  /// Returns: the UTF-8 text, or null if data is empty.
  ///
  /// Invalid characters are reported and replaced with REPLACEMENT_CHAR.
  /// Trailing bytes that don't form a complete 32-bit unit are ignored
  /// and reported as an error.
  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] result;
    uint lineNum = 1;
    dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4.
    foreach (dchar c; text)
    {
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        infoMan ~= new LexerError(
          new Location(filePath, lineNum),
          Format(MSG.InvalidUTF32Character, c)
        );
        c = REPLACEMENT_CHAR;
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 4)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF32FileMustBeDivisibleBy4
      );

    return result;
  }

  alias UTF32toUTF8!(true) UTF32BEtoUTF8;
  alias UTF32toUTF8!(false) UTF32LEtoUTF8;

  /// Converts UTF-16 encoded data to UTF-8.
  /// Params:
  ///   isBigEndian = byte order of the input (template parameter).
  ///   data = the raw bytes, without a BOM.
  /// Returns: the UTF-8 text, or null if data is empty.
  ///
  /// Surrogate pairs are combined into a single code point. Unpaired
  /// surrogates are reported and replaced with REPLACEMENT_CHAR.
  /// A trailing odd byte is ignored and reported as an error.
  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
    wchar* p = text.ptr,
         end = text.ptr + text.length;
    char[] result;
    uint lineNum = 1;

    // A for-loop is used so that nothing is read when text is empty
    // (i.e. when data consists of a single byte.)
    for (; p < end; ++p)
    {
      // Fetch the next code unit in every iteration.
      wchar unit = *p;
      static if (isBigEndian)
        unit = BEtoMachineWord(unit);
      else
        unit = LEtoMachineWord(unit);
      dchar c = unit;

      if (c < 0xD800 || c > 0xDFFF)
      {} // Not a surrogate; c is already the code point.
      else
      {
        bool isPair = false;
        if (c <= 0xDBFF && p+1 < end)
        { // c is a high surrogate. Look at the following code unit.
          wchar c2 = p[1];
          static if (isBigEndian)
            c2 = BEtoMachineWord(c2);
          else
            c2 = LEtoMachineWord(c2);

          if (0xDC00 <= c2 && c2 <= 0xDFFF)
          { // Combine the pair: 0xD7C0 == 0xD800 - (0x10000 >> 10).
            c = (c - 0xD7C0) << 10;
            c |= (c2 & 0x3FF);
            ++p; // Consume the low surrogate.
            isPair = true;
          }
        }
        if (!isPair)
        { // Lone low surrogate or high surrogate without its partner.
          infoMan ~= new LexerError(
            new Location(filePath, lineNum),
            Format(MSG.InvalidUTF16Character, c)
          );
          c = REPLACEMENT_CHAR;
        }
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 2)
      infoMan ~= new LexerError(
        new Location(filePath, lineNum),
        MSG.UTF16FileMustBeDivisibleBy2
      );

    return result;
  }

  alias UTF16toUTF8!(true) UTF16BEtoUTF8;
  alias UTF16toUTF8!(false) UTF16LEtoUTF8;

  /// Converts the raw contents of a file to UTF-8.
  ///
  /// The encoding is taken from the BOM reported by tellBOM(). When there
  /// is no BOM, it is guessed from the position of zero bytes among the
  /// first bytes. UTF-8 data is returned as a slice of data (not copied.)
  /// Returns: the UTF-8 text, or null if data is empty.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] text;
    BOM bom = tellBOM(data);

    switch (bom)
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3-byte BOM.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]);
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]);
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]);
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]);
      break;
    default:
      assert(0);
    }
    return text;
  }
}

/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space,)
/// and Newlines with '\n'.
///
/// The characters of text are rewritten in place and the result is never
/// longer than the input. An invalid sequence that is shorter than the
/// 3 bytes of the replacement string is removed from the text.
/// Returns: text with its length reduced by the number of removed bytes,
///          or null if text is empty.
string sanitizeText(string text)
{
  if (!text.length)
    return null;

  char* p = text.ptr; // Reading position.
  char* end = p + text.length;
  char* q = p; // Writing position. Never ahead of p.

  for (; p < end; p++, q++)
  {
    assert(q <= p);
    switch (*p)
    {
    case '\r':
      if (p+1 < end && p[1] == '\n')
        p++; // Treat "\r\n" as one newline.
      // Fall through.
    case '\n':
      *q = '\n';
      continue;
    default:
      if (isascii(*p))
        break; // Copied after the switch statement.
      if (p+2 < end && isUnicodeNewline(p))
      { // A 3-byte Unicode newline shrinks to a single '\n'.
        p += 2;
        goto case '\n';
      }
      auto p2 = p; // Beginning of the UTF-8 sequence.
      dchar c = decode(p, end);
      if (c == ERROR_CHAR)
      { // Skip to next ASCII character or valid UTF-8 sequence.
        while (++p < end && isTrailByte(*p))
        {}
        alias REPLACEMENT_STR R;
        if (q+2 < p) // Copy replacement char if there is enough space.
          (*q = R[0]), (*++q = R[1]), (*++q = R[2]);
        else
          // Not enough room between q and p. Drop the invalid bytes
          // instead of leaving a stale byte in the output: undo the
          // q++ which the loop header is going to execute.
          q--;
        p--; // Compensate for the p++ of the loop header.
      }
      else
      { // Copy the valid UTF-8 sequence.
        while (p2 <= p) // p points to the last trail byte.
          *q++ = *p2++; // Copy code units.
        q--; // Compensate for the q++ of the loop header.
      }
      continue;
    }
    assert(isascii(*p));
    *q = *p;
  }
  assert(p == end);
  // p - q is the number of bytes that were dropped.
  text.length = text.length - (p - q);
  //text = text.ptr[0 .. q - text.ptr]; // Another way.
  return text;
}