projects/dil: trunk/src/dil/Converter.d comparison

comparison trunk/src/dil/Converter.d @ 739:49fe21aa387c

Added sanitizeText() to dil.Converter. Cleaned predefined.ddoc up a bit. Removed makeString() from dil.doc.Macro. Added REPLACEMENT_STR to dil.Unicode.

author	Aziz Köksal <aziz.koeksal@gmail.com>
date	Sat, 09 Feb 2008 14:24:35 +0100
parents	9e811db780a6
children	90668b83ae5e

comparison

equal deleted inserted replaced

-:2afcc305831a
+:49fe21aa387c
 assert(0);
 }
 return text;
 }
 }
+/// Sanitizes text in place: every newline sequence becomes a single '\n' and
+/// every invalid UTF-8 sequence becomes U+FFFD (REPLACEMENT_STR).
+///
+/// The result is written into the buffer of text itself, so an invalid
+/// sequence can only be replaced when it is (together with bytes already
+/// saved by earlier shrinking) at least three bytes long; otherwise it is
+/// dropped from the output.
+///
+/// Params:
+///   text = the string to sanitize; its contents are overwritten.
+/// Returns: the sanitized slice of text, or null if text is empty.
+string sanitizeText(string text)
+{
+  if (!text.length)
+    return null;
+  char* p = text.ptr; // Read cursor.
+  char* end = p + text.length;
+  char* q = p; // Write cursor; advanced only when a byte is stored.
+  for (; p < end; p++)
+  {
+    assert(q <= p); // The writer never overtakes the reader.
+    switch (*p)
+    {
+    case '\r':
+      if (p+1 < end && p[1] == '\n')
+        p++; // Treat "\r\n" as a single newline.
+      // Fall through.
+    case '\n':
+      *q++ = '\n';
+      continue;
+    default:
+      if (isascii(*p))
+        break; // Plain ASCII is copied below the switch.
+      if (p+2 < end && isUnicodeNewline(p))
+      { // A three-byte Unicode newline sequence collapses to '\n'.
+        p += 2;
+        goto case '\n';
+      }
+      auto p2 = p; // Beginning of the UTF-8 sequence.
+      dchar c = decode(p, end);
+      if (c == ERROR_CHAR)
+      { // Skip to next ASCII character or valid UTF-8 sequence.
+        while (++p < end && isTrailByte(*p))
+        {}
+        alias REPLACEMENT_STR R;
+        // Copy the replacement char only if the three bytes fit in [q, p).
+        // If they do not fit, q stays put so that no stale or invalid byte
+        // is left behind in the output.
+        if (q+2 < p)
+        {
+          *q++ = R[0];
+          *q++ = R[1];
+          *q++ = R[2];
+        }
+        p--; // Compensate for the p++ of the loop.
+      }
+      else
+      { // Copy the valid UTF-8 sequence.
+        while (p2 <= p) // p points to the last trail byte.
+          *q++ = *p2++; // Copy code units.
+      }
+      continue;
+    }
+    assert(isascii(*p));
+    *q++ = *p;
+  }
+  assert(p == end);
+  // Everything in front of the write cursor is the sanitized text.
+  text.length = q - text.ptr;
+  return text;
+}

Mercurial > projects > dil

comparison trunk/src/dil/Converter.d @ 739:49fe21aa387c