Mercurial > projects > dil
diff trunk/src/dil/Converter.d @ 739:49fe21aa387c
Added sanitizeText() to dil.Converter.
Cleaned predefined.ddoc up a bit.
Removed makeString() from dil.doc.Macro.
Added REPLACEMENT_STR to dil.Unicode.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sat, 09 Feb 2008 14:24:35 +0100 |
parents | 9e811db780a6 |
children | 90668b83ae5e |
line wrap: on
line diff
--- a/trunk/src/dil/Converter.d Sat Feb 09 02:06:32 2008 +0100
+++ b/trunk/src/dil/Converter.d Sat Feb 09 14:24:35 2008 +0100
@@ -237,3 +237,77 @@
     return text;
   }
 }
+
+/// Sanitizes text in place: normalizes newlines and repairs invalid UTF-8.
+///
+/// "\r\n", '\r', '\n' and the 3-byte sequences accepted by
+/// isUnicodeNewline() are each written as a single '\n'. Valid UTF-8
+/// sequences are copied unchanged. An invalid sequence is replaced with
+/// REPLACEMENT_STR (U+FFFD) if the bytes consumed so far leave room for its
+/// three code units; otherwise the sequence is dropped, so that no
+/// unsanitized byte remains in the result.
+///
+/// The buffer of text is overwritten; the returned string is that buffer
+/// with its length reduced by the number of bytes that were removed.
+///
+/// Params:
+///   text = the string to be sanitized; its contents are modified.
+/// Returns: the sanitized text, or null if text is empty.
+string sanitizeText(string text)
+{
+  if (!text.length)
+    return null;
+
+  char* p = text.ptr; // Reader.
+  char* end = p + text.length;
+  char* q = p; // Writer; it never gets ahead of the reader.
+
+  // Every branch leaves p just past the consumed input and q just past the
+  // written output.
+  while (p < end)
+  {
+    assert(q <= p);
+    if (*p == '\r' || *p == '\n')
+    {
+      if (*p == '\r' && p+1 < end && p[1] == '\n')
+        p++; // "\r\n" counts as one newline.
+      p++;
+      *q++ = '\n';
+    }
+    else if (isascii(*p))
+      *q++ = *p++;
+    else if (p+2 < end && isUnicodeNewline(p))
+    {
+      p += 3; // Skip the lead byte and both trail bytes.
+      *q++ = '\n';
+    }
+    else
+    {
+      auto p2 = p; // Beginning of the UTF-8 sequence.
+      dchar c = decode(p, end);
+      if (c == ERROR_CHAR)
+      { // Skip the trail bytes that follow, up to the next non-trail byte.
+        while (++p < end && isTrailByte(*p))
+        {}
+        alias REPLACEMENT_STR R;
+        if (q+2 < p) // Copy replacement char if there is enough space.
+        {
+          *q++ = R[0];
+          *q++ = R[1];
+          *q++ = R[2];
+        }
+        // Otherwise write nothing. Advancing q here would keep a stale,
+        // possibly invalid, byte in the result.
+      }
+      else
+      { // Copy the valid UTF-8 sequence.
+        p++; // decode() is expected to leave p on the last trail byte.
+        while (p2 < p)
+          *q++ = *p2++; // Copy code units.
+      }
+    }
+  }
+  assert(p == end);
+  text.length = text.length - (p - q);
+  return text;
+}