Mercurial > projects > dil

--- a/trunk/src/dil/Converter.d	Mon Dec 17 16:10:08 2007 +0100
+++ b/trunk/src/dil/Converter.d	Mon Dec 17 17:35:38 2007 +0100
@@ -8,6 +8,8 @@
 import dil.Location;
 import dil.Unicode;
 import dil.FileBOM;
+import dil.LexerFuncs;
+import dil.Messages;
 import common;

 /// Converts various Unicode encoding formats to UTF-8.
@@ -71,18 +73,13 @@

   char[] UTF32toUTF8(bool isBigEndian)(ubyte[] data)
   {
-    if (data.length % 4)
-    {
-      infoMan.info ~= new LexerError(new Location(filePath, 0),
-        "the byte length of a UTF-32 source file must be divisible by 4."
-      );
-      data = data[0 .. $ - $ % 4]; // Trim to valid size.
-    }
     if (data.length == 0)
       return null;

     char[] result;
-    foreach (dchar c; cast(dchar[])data)
+    uint lineNum = 1;
+    dchar[] text = cast(dchar[]) data[0 .. $-($%4)]; // Trim to multiple of 4.
+    foreach (dchar c; text)
     {
       static if (isBigEndian)
         c = BEtoMachineDword(c);
@@ -91,14 +88,24 @@

       if (!isValidChar(c))
       {
-        // TODO: correct location.
-        auto loc = new Location(filePath, 0);
-        infoMan.info ~= new LexerError(null, Format("invalid UTF-32 character '{:X}'.", c));
+        infoMan ~= new LexerError(
+          new Location(filePath, lineNum),
+          Format(MSG.InvalidUTF32Character, c)
+        );
         c = REPLACEMENT_CHAR;
       }

+      if (isNewline(c))
+        ++lineNum;
       dil.Unicode.encode(result, c);
     }
+
+    if (data.length % 4)
+      infoMan ~= new LexerError(
+        new Location(filePath, lineNum),
+        MSG.UTF32FileMustBeDivisibleBy4
+      );
+
     return result;
   }

@@ -107,22 +114,14 @@

   char[] UTF16toUTF8(bool isBigEndian)(ubyte[] data)
   {
-    if (data.length % 2)
-    {
-      infoMan ~= new LexerError(new Location(filePath, 0),
-        "the byte length of a UTF-16 source file must be divisible by 2."
-      );
-      data = data[0 .. $-1]; // Trim to valid size.
-    }
-
     if (data.length == 0)
       return null;

-    wchar[] text = cast(wchar[])data;
+    wchar[] text = cast(wchar[]) data[0 .. $-($%2)]; // Trim to multiple of two.
     wchar* p = text.ptr,
-          end = text.ptr + text.length;
+         end = text.ptr + text.length;
     char[] result;
-
+    uint lineNum = 1;
     dchar c = *p;

     do
@@ -151,14 +150,25 @@
       }
       else
       {
-        // TODO: correct location.
-        auto loc = new Location(filePath, 0);
-        infoMan ~= new LexerError(loc, Format("invalid UTF-16 character '{:X}'.", c));
+        infoMan ~= new LexerError(
+          new Location(filePath, lineNum),
+          Format(MSG.InvalidUTF16Character, c)
+        );
         c = REPLACEMENT_CHAR;
       }
+
+      if (isNewline(c))
+        ++lineNum;
       ++p;
       dil.Unicode.encode(result, c);
     } while (p < end)
+
+    if (data.length % 2)
+      infoMan ~= new LexerError(
+        new Location(filePath, lineNum),
+        MSG.UTF16FileMustBeDivisibleBy2
+      );
+
     return result;
   }
--- a/trunk/src/dil/LexerFuncs.d	Mon Dec 17 16:10:08 2007 +0100
+++ b/trunk/src/dil/LexerFuncs.d	Mon Dec 17 17:35:38 2007 +0100
@@ -33,6 +33,12 @@
   return *p == '\n' || *p == '\r' || isUnicodeNewline(p);
 }

+/// Returns true if c is a newline character: '\n', '\r', or any character accepted by isUnicodeNewlineChar().
+bool isNewline(dchar c)
+{
+  return c == '\n' || c == '\r' || isUnicodeNewlineChar(c);
+}
+
 /++
   Returns true if p points to an EOF character.
   EOF: 0 | _Z_
--- a/trunk/src/dil/Messages.d	Mon Dec 17 16:10:08 2007 +0100
+++ b/trunk/src/dil/Messages.d	Mon Dec 17 17:35:38 2007 +0100
@@ -105,6 +105,11 @@
 struct MSG
 {
 static:
+  // Converter:
+  auto InvalidUTF16Character = "invalid UTF-16 character '\\u{:X4}'.";
+  auto InvalidUTF32Character = "invalid UTF-32 character '\\U{:X8}'.";
+  auto UTF16FileMustBeDivisibleBy2 = "the byte length of a UTF-16 source file must be divisible by 2.";
+  auto UTF32FileMustBeDivisibleBy4 = "the byte length of a UTF-32 source file must be divisible by 4.";
   // Parser messages:
   auto ExpectedIdAfterTypeDot = "expected identifier after '(Type).', not '{}'";
   auto ExpectedModuleIdentifier = "expected module identifier, not '{}'";