diff trunk/src/dil/Converter.d @ 739:49fe21aa387c

Added sanitizeText() to dil.Converter. Cleaned predefined.ddoc up a bit. Removed makeString() from dil.doc.Macro. Added REPLACEMENT_STR to dil.Unicode.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sat, 09 Feb 2008 14:24:35 +0100
parents 9e811db780a6
children 90668b83ae5e
line wrap: on
line diff
--- a/trunk/src/dil/Converter.d	Sat Feb 09 02:06:32 2008 +0100
+++ b/trunk/src/dil/Converter.d	Sat Feb 09 14:24:35 2008 +0100
@@ -237,3 +237,61 @@
     return text;
   }
 }
+
+/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space,
+/// otherwise they are dropped) and Newlines with '\n'. Works in place:
+/// text's buffer is overwritten; the returned string is text shortened to
+/// the sanitized length, or null if text is empty.
+string sanitizeText(string text)
+{
+  if (!text.length)
+    return null;
+
+  char* p = text.ptr; // Read cursor.
+  char* end = p + text.length;
+  char* q = p; // Write cursor; advanced only when a code unit is stored.
+
+  for (; p < end; p++)
+  {
+    assert(q <= p); // Output never overtakes input.
+    switch (*p)
+    {
+    case '\r':
+      if (p+1 < end && p[1] == '\n')
+        p++; // "\r\n" counts as a single newline.
+    case '\n': // Also reached by falling through from '\r'.
+      *q++ = '\n';
+      continue;
+    default:
+      if (isascii(*p))
+        break; // Copied verbatim below the switch.
+      if (p+2 < end && isUnicodeNewline(p))
+      {
+        p += 2; // 3-byte newline (presumably LS/PS): step onto its last unit.
+        goto case '\n';
+      }
+      auto p2 = p; // Beginning of the UTF-8 sequence.
+      dchar c = decode(p, end);
+      if (c == ERROR_CHAR)
+      { // Skip to next ASCII character or valid UTF-8 sequence.
+        while (++p < end && isTrailByte(*p)) {}
+        alias REPLACEMENT_STR R;
+        if (q+2 < p) // Room for R? If not, q stays and the bytes are dropped.
+          (*q++ = R[0]), (*q++ = R[1]), (*q++ = R[2]);
+        p--; // The loop's p++ steps back onto the byte that ended the skip.
+      }
+      else
+      { // Copy the valid UTF-8 sequence.
+        while (p2 <= p) // p points to the last trail byte.
+          *q++ = *p2++; // Copy code units.
+      }
+      continue;
+    }
+    assert(isascii(*p));
+    *q++ = *p;
+  }
+  assert(p == end);
+  text.length = text.length - (p - q); // Cut off the bytes that were dropped.
+  //text = text.ptr[0 .. q - text.ptr]; // Another way.
+  return text;
+}