Mercurial > projects > dil

diff src/dil/Converter.d @ 806:bcb74c9b895c
Moved out files in the trunk folder to the root.
author: Aziz K?ksal <aziz.koeksal@gmail.com>
date: Sun, 09 Mar 2008 00:12:19 +0100
parents: trunk/src/dil/Converter.d@3b34f6a95a27
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/dil/Converter.d	Sun Mar 09 00:12:19 2008 +0100
@@ -0,0 +1,334 @@
+/++
+  Author: Aziz Köksal
+  License: GPL3
++/
+module dil.Converter;
+
+import dil.Information;
+import dil.Location;
+import dil.Unicode;
+import dil.FileBOM;
+import dil.lexer.Funcs;
+import dil.Messages;
+import common;
+
+/// Converts various Unicode encoding formats to UTF-8.
+struct Converter
+{
+  char[] filePath; /// For error messages.
+  InfoManager infoMan;
+
+  static Converter opCall(char[] filePath, InfoManager infoMan)
+  {
+    Converter conv;
+    conv.filePath = filePath;
+    conv.infoMan = infoMan;
+    return conv;
+  }
+
+  /// Byte-swaps c.
+  dchar swapBytes(dchar c)
+  {
+    return c = (c << 24) |
+              ((c >> 8) & 0xFF00) |
+              ((c << 8) & 0xFF0000) |
+              (c >> 24);
+  }
+
+  /// Byte-swaps c.
+  wchar swapBytes(wchar c)
+  {
+    return (c << 8) | (c >> 8);
+  }
+
+  /// Swaps the bytes of c on a little-endian machine.
+  dchar BEtoMachineDword(dchar c)
+  {
+    version(LittleEndian)
+      return swapBytes(c);
+    else
+      return c;
+  }
+
+  /// Swaps the bytes of c on a big-endian machine.
+  dchar LEtoMachineDword(dchar c)
+  {
+    version(LittleEndian)
+      return c;
+    else
+      return swapBytes(c);
+  }
+
+  /// Swaps the bytes of c on a little-endian machine.
+  wchar BEtoMachineWord(wchar c)
+  {
+    version(LittleEndian)
+      return swapBytes(c);
+    else
+      return c;
+  }
+
+  /// Swaps the bytes of c on a big-endian machine.
+  wchar LEtoMachineWord(wchar c)
+  {
+    version(LittleEndian)
+      return c;
+    else
+      return swapBytes(c);
+  }
+
+  /// Converts a UTF-32 text to UTF-8.
+  char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
+  {
+    if (data.length == 0)
+      return null;
+
+    char[] result;
+    uint lineNum = 1;
+    dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4.
+    foreach (dchar c; text)
+    {
+      static if (isBigEndian)
+        c = BEtoMachineDword(c);
+      else
+        c = LEtoMachineDword(c);
+
+      if (!isValidChar(c))
+      {
+        infoMan ~= new LexerError(
+          new Location(filePath, lineNum),
+          Format(MSG.InvalidUTF32Character, c)
+        );
+        c = REPLACEMENT_CHAR;
+      }
+
+      if (isNewline(c))
+        ++lineNum;
+      dil.Unicode.encode(result, c);
+    }
+
+    if (data.length % 4)
+      infoMan ~= new LexerError(
+        new Location(filePath, lineNum),
+        MSG.UTF32FileMustBeDivisibleBy4
+      );
+
+    return result;
+  }
+
+  alias UTF32toUTF8!(true) UTF32BEtoUTF8; /// Instantiation for UTF-32 BE.
+  alias UTF32toUTF8!(false) UTF32LEtoUTF8; /// Instantiation for UTF-32 LE.
+
+  /// Converts a UTF-16 text to UTF-8.
+  char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
+  {
+    if (data.length == 0)
+      return null;
+
+    wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
+    wchar* p = text.ptr,
+         end = text.ptr + text.length;
+    char[] result;
+    uint lineNum = 1;
+
+    for (; p < end; p++)
+    {
+      dchar c = *p;
+      static if (isBigEndian)
+        c = BEtoMachineWord(c);
+      else
+        c = LEtoMachineWord(c);
+
+      if (0xD800 > c || c > 0xDFFF)
+      {}
+      else if (c <= 0xDBFF && p+1 < end)
+      { // Decode surrogate pairs.
+        wchar c2 = p[1];
+        static if (isBigEndian)
+          c2 = BEtoMachineWord(c2);
+        else
+          c2 = LEtoMachineWord(c2);
+
+        if (0xDC00 <= c2 && c2 <= 0xDFFF)
+        {
+          c = (c - 0xD7C0) << 10;
+          c |= (c2 & 0x3FF);
+          ++p;
+        }
+      }
+      else
+      {
+        infoMan ~= new LexerError(
+          new Location(filePath, lineNum),
+          Format(MSG.InvalidUTF16Character, c)
+        );
+        c = REPLACEMENT_CHAR;
+      }
+
+      if (isNewline(c))
+        ++lineNum;
+      dil.Unicode.encode(result, c);
+    }
+
+    if (data.length % 2)
+      infoMan ~= new LexerError(
+        new Location(filePath, lineNum),
+        MSG.UTF16FileMustBeDivisibleBy2
+      );
+    return result;
+  }
+
+  alias UTF16toUTF8!(true) UTF16BEtoUTF8; /// Instantiation for UTF-16 BE.
+  alias UTF16toUTF8!(false) UTF16LEtoUTF8; /// Instantiation for UTF-16 LE.
+
+  /// Converts the text in data to UTF-8.
+  /// Leaves data unchanged if it is in UTF-8 already.
+  char[] data2UTF8(ubyte[] data)
+  {
+    if (data.length == 0)
+      return "";
+
+    char[] text;
+    BOM bom = tellBOM(data);
+
+    switch (bom)
+    {
+    case BOM.None:
+      // No BOM found. According to the specs the first character
+      // must be an ASCII character.
+      if (data.length >= 4)
+      {
+        if (data[0..3] == cast(ubyte[3])x"00 00 00")
+        {
+          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
+          break;
+        }
+        else if (data[1..4] == cast(ubyte[3])x"00 00 00")
+        {
+          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
+          break;
+        }
+      }
+      if (data.length >= 2)
+      {
+        if (data[0] == 0) // UTF-16BE: 00 XX
+        {
+          text = UTF16BEtoUTF8(data);
+          break;
+        }
+        else if (data[1] == 0) // UTF-16LE: XX 00
+        {
+          text = UTF16LEtoUTF8(data);
+          break;
+        }
+      }
+      text = cast(char[])data; // UTF-8
+      break;
+    case BOM.UTF8:
+      text = cast(char[])data[3..$];
+      break;
+    case BOM.UTF16BE:
+      text = UTF16BEtoUTF8(data[2..$]);
+      break;
+    case BOM.UTF16LE:
+      text = UTF16LEtoUTF8(data[2..$]);
+      break;
+    case BOM.UTF32BE:
+      text = UTF32BEtoUTF8(data[4..$]);
+      break;
+    case BOM.UTF32LE:
+      text = UTF32LEtoUTF8(data[4..$]);
+      break;
+    default:
+      assert(0);
+    }
+    return text;
+  }
+}
+
+/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space,)
+/// and Newlines with '\n'.
+string sanitizeText(string text)
+{
+  if (!text.length)
+    return null;
+
+  char* p = text.ptr;
+  char* end = p + text.length;
+  char* q = p;
+
+  for (; p < end; p++, q++)
+  {
+    assert(q <= p);
+    switch (*p)
+    {
+    case '\r':
+      if (p+1 < end && p[1] == '\n')
+        p++;
+    case '\n':
+      *q = '\n';
+      continue;
+    default:
+      if (isascii(*p))
+        break;
+      if (p+2 < end && isUnicodeNewline(p))
+      {
+        p += 2;
+        goto case '\n';
+      }
+      auto p2 = p; // Beginning of the UTF-8 sequence.
+      dchar c = decode(p, end);
+      if (c == ERROR_CHAR)
+      { // Skip to next ASCII character or valid UTF-8 sequence.
+        while (++p < end && isTrailByte(*p))
+        {}
+        alias REPLACEMENT_STR R;
+        if (q+2 < p) // Copy replacement char if there is enough space.
+          (*q = R[0]), (*++q = R[1]), (*++q = R[2]);
+        p--;
+      }
+      else
+      { // Copy the valid UTF-8 sequence.
+        while (p2 <= p) // p points to the last trail byte.
+          *q++ = *p2++; // Copy code units.
+        q--;
+      }
+      continue;
+    }
+    assert(isascii(*p));
+    *q = *p;
+  }
+  assert(p == end);
+  text.length = text.length - (p - q);
+  //text = text.ptr[0 .. q - text.ptr]; // Another way.
+  return text;
+}
+
+unittest
+{
+  Stdout("Testing function Converter.\n");
+  struct Data2Text
+  {
+    char[] text;
+    char[] expected = "source";
+    ubyte[] data()
+    { return cast(ubyte[])text; }
+  }
+  const Data2Text[] map = [
+    // Without BOM
+    {"source"},
+    {"s\0o\0u\0r\0c\0e\0"},
+    {"\0s\0o\0u\0r\0c\0e"},
+    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
+    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
+    // With BOM
+    {"\xEF\xBB\xBFsource"},
+    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
+    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
+    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
+    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
+  ];
+  auto converter = Converter("", new InfoManager);
+  foreach (i, pair; map)
+    assert(converter.data2UTF8(pair.data) == pair.expected, Format("failed at item {}", i));
+}
author	Aziz K?ksal <aziz.koeksal@gmail.com>
date	Sun, 09 Mar 2008 00:12:19 +0100
parents	trunk/src/dil/Converter.d@3b34f6a95a27
children