changeset 49:7f0fa15dcffc

- Renamed function. - Fix: assign buffer to t.str. - Fix: store escape value in temporary variable, not in t.dchar_. - Fix: decode utf-8 sequence instead of blindly copying it. - Issue error when decoding invalid utf-8 sequences.
author aziz
date Wed, 27 Jun 2007 12:23:02 +0000
parents c2e0e0269c28
children 4a27b7840ea9
files trunk/src/Lexer.d
diffstat 1 files changed, 32 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/Lexer.d	Wed Jun 27 10:22:03 2007 +0000
+++ b/trunk/src/Lexer.d	Wed Jun 27 12:23:02 2007 +0000
@@ -111,6 +111,7 @@
 enum MID
 {
   InvalidUnicodeCharacter,
+  InvalidUTF8Sequence,
   // ''
   UnterminatedCharacterLiteral,
   EmptyCharacterLiteral,
@@ -140,6 +141,7 @@
 
 string[] messages = [
   "invalid Unicode character.",
+  "invalid UTF-8 sequence.",
   // ''
   "unterminated character literal.",
   "empty character literal.",
@@ -266,7 +268,7 @@
       Lidentifier:
         do
         { c = *++p; }
-        while (isident(c) || c & 128 && isUniAlpha(decodeUTF()))
+        while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))
 
         t.end = p;
 
@@ -683,7 +685,7 @@
       default:
       }
 
-      if (c & 128 && isUniAlpha(decodeUTF()))
+      if (c & 128 && isUniAlpha(decodeUTF8()))
         goto Lidentifier;
       c = *++p;
     }
@@ -710,16 +712,17 @@
         ++p;
       Lreturn:
         buffer ~= 0;
+        t.str = buffer;
         t.pf = scanPostfix();
         t.end = p;
         return;
       case '\\':
         ++p;
-        t.dchar_ = scanEscapeSequence();
-        if (t.dchar_ < 128)
-          buffer ~= t.dchar_;
+        dchar d = scanEscapeSequence();
+        if (d < 128)
+          buffer ~= d;
         else
-          encodeUTF8(buffer, t.dchar_);
+          encodeUTF8(buffer, d);
         continue;
       case '\r':
         if (p[1] == '\n')
@@ -735,12 +738,16 @@
       default:
         if (*p & 128)
         {
-          if (*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])) {
-            ++p; ++p;
+          char* begin = p;
+          dchar d = decodeUTF8();
+          if (d == LSd || d == PSd)
             goto case '\n';
+
+          if (d != 0xFFFF)
+          {
+            ++p;
+            buffer ~= begin[0 .. p - begin];
           }
-          buffer ~= p[0 .. UTF8stride[*p]];
-          p += UTF8stride[*p];
           continue;
         }
         buffer ~= *p++;
@@ -768,7 +775,7 @@
       uint c = *p;
       if (c & 128)
       {
-        c = decodeUTF();
+        c = decodeUTF8();
         if (c == LSd || c == PSd)
           goto Lerr;
       }
@@ -1100,13 +1107,22 @@
     error(mid);
   }
 
-  uint decodeUTF()
+  uint decodeUTF8()
   {
-    assert(*p & 128);
+    assert(*p & 128, "check for ASCII char before calling decodeUTF8().");
     size_t idx;
-    uint d;
-    d = std.utf.decode(p[0 .. end-p], idx);
-    p += idx -1;
+    uint d = 0xFFFF;
+    try
+    {
+      d = std.utf.decode(p[0 .. end-p], idx);
+      p += idx -1;
+    }
+    catch (UtfException e)
+    {
+      error(MID.InvalidUTF8Sequence);
+      // Skip to next valid utf-8 sequence
+      while (UTF8stride[*++p] != 0xFF) {}
+    }
     return d;
   }