changeset 82:fc645fb2fe72

- scanEscapeSequence() doesn't return 0xFFFF as an error value anymore, because it is a valid codepoint usable by the user. - Added CharacterLiteral tokens with utf-16/32 versions to distinguish between different character literals. - Checking for valid Unicode codepoint when finished lexing hex escape sequences.
author aziz
date Wed, 04 Jul 2007 22:19:01 +0000
parents aa1ea2548dd9
children 9e6d66f647c9
files trunk/src/Lexer.d trunk/src/Token.d trunk/src/main.d
diffstat 3 files changed, 33 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/Lexer.d	Tue Jul 03 11:03:02 2007 +0000
+++ b/trunk/src/Lexer.d	Wed Jul 04 22:19:01 2007 +0000
@@ -268,8 +268,6 @@
         {
           ++p;
           c = scanEscapeSequence();
-          if (c == 0xFFFF)
-            break;
           if (c < 128)
             buffer ~= c;
           else
@@ -564,8 +562,6 @@
       case '\\':
         ++p;
         dchar d = scanEscapeSequence();
-        if (d == 0xFFFF)
-          continue;
         if (d < 128)
           buffer ~= d;
         else
@@ -585,16 +581,17 @@
       default:
         if (*p & 128)
         {
-          char* begin = p;
+//           char* begin = p;
           dchar d = decodeUTF8();
+
           if (d == LSd || d == PSd)
             goto case '\n';
 
-          if (d != 0xFFFF)
-          {
-            ++p;
-            buffer ~= begin[0 .. p - begin];
-          }
+          // We don't copy per pointer because we might include
+          // invalid, skipped utf-8 sequences. See decodeUTF8().
+//           ++p;
+//           buffer ~= begin[0 .. p - begin];
+          encodeUTF8(buffer, d);
           continue;
         }
         buffer ~= *p++;
@@ -608,10 +605,19 @@
     assert(*p == '\'');
     MID id = MID.UnterminatedCharacterLiteral;
     ++p;
+    TOK type = TOK.CharLiteral;
     switch (*p)
     {
     case '\\':
       ++p;
+      switch (*p)
+      {
+      case 'u':
+        type = TOK.WCharLiteral; break;
+      case 'U':
+        type = TOK.DCharLiteral; break;
+      default:
+      }
       t.dchar_ = scanEscapeSequence();
       break;
     case '\'':
@@ -626,6 +632,10 @@
         c = decodeUTF8();
         if (c == LSd || c == PSd)
           goto Lerr;
+        if (c <= 0xFFFF)
+          type = TOK.WCharLiteral;
+        else
+          type = TOK.DCharLiteral;
       }
       t.dchar_ = c;
       ++p;
@@ -634,9 +644,9 @@
     if (*p == '\'')
       ++p;
     else
-  Lerr:
+    Lerr:
       error(id);
-    t.type = TOK.Character;
+    t.type = type;
     t.end = p;
   }
 
@@ -786,11 +796,11 @@
   dchar scanEscapeSequence()
   {
     uint c = char2ev(*p);
-    if (c) {
+    if (c)
+    {
       ++p;
       return c;
     }
-    c = 0xFFFF;
     uint digits = 2;
 
     switch (*p)
@@ -809,7 +819,9 @@
             c += *p - 'A' + 10;
           else
             c += *p - 'a' + 10;
-          if (!--digits) {
+
+          if (!--digits)
+          {
             ++p;
             break;
           }
@@ -817,10 +829,11 @@
         else
         {
           error(MID.InsufficientHexDigits);
-          c = 0xFFFF;
           break;
         }
       }
+      if (!isValidDchar(c))
+        error(MID.InvalidUnicodeCharacter);
       break;
     case 'u':
       digits = 4;
@@ -1399,7 +1412,7 @@
   {
     assert(*p & 128, "check for ASCII char before calling decodeUTF8().");
     size_t idx;
-    uint d = 0xFFFF;
+    dchar d;
     try
     {
       d = std.utf.decode(p[0 .. end-p], idx);
@@ -1410,6 +1423,7 @@
       error(MID.InvalidUTF8Sequence);
       // Skip to next valid utf-8 sequence
       while (UTF8stride[*++p] != 0xFF) {}
+      --p;
     }
     return d;
   }
--- a/trunk/src/Token.d	Tue Jul 03 11:03:02 2007 +0000
+++ b/trunk/src/Token.d	Wed Jul 04 22:19:01 2007 +0000
@@ -15,7 +15,7 @@
   Identifier,
   Comment,
   String,
-  Character,
+  CharLiteral, WCharLiteral, DCharLiteral,
 
   // Numbers
   Number,
--- a/trunk/src/main.d	Tue Jul 03 11:03:02 2007 +0000
+++ b/trunk/src/main.d	Wed Jul 04 22:19:01 2007 +0000
@@ -65,7 +65,7 @@
       case TOK.String:
         writef("<sl>%s</sl>", srcText);
       break;
-      case TOK.Character:
+      case TOK.CharLiteral, TOK.WCharLiteral, TOK.DCharLiteral:
         writef("<cl>%s</cl>", srcText);
       break;
       case TOK.Assign, TOK.Equal,