changeset 419:89e40d43065d

Added new member 'p_newl' to Lexer and applied some fixes. Added method set_p_newl() that sets this.p_newl one character past the previously encountered newline. Not using decodeUTF8() to check for LS and PS in cases where the newline mustn't be skipped. Added a particular assert() to almost every case '\n' statement.
author Aziz K?ksal <aziz.koeksal@gmail.com>
date Fri, 28 Sep 2007 12:10:11 +0200
parents 7354f15cd5e9
children ce644d724d87
files trunk/src/dil/Lexer.d
diffstat 1 files changed, 55 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/dil/Lexer.d	Thu Sep 27 17:06:28 2007 +0200
+++ b/trunk/src/dil/Lexer.d	Fri Sep 28 12:10:11 2007 +0200
@@ -49,6 +49,7 @@
   char* p; /// Points to the current character in the source text.
   char* end; /// Points one character past the end of the source text.
 
+  char* p_newl;
   uint loc = 1; /// Actual line of code.
 
   uint loc_old; /// Store actual line number when #line token is scanned.
@@ -182,6 +183,22 @@
     }
   }
 
+  void set_p_newl(char* p)
+  {
+    assert(delegate()
+      {
+        if (!((p-1) >= text.ptr && p < end))
+          return false;
+        // Check that previous character is a newline.
+        if (p[-1] != '\n' &&  p[-1] != '\r' &&
+            p[-1] != LS[2] && p[-1] != PS[2])
+          return false;
+        return true;
+      }() == true
+    );
+    this.p_newl = p;
+  }
+
   public void scan_(out Token t)
   in
   {
@@ -205,8 +222,10 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         ++p;
         ++loc;
+        set_p_newl(p);
         continue;
       case LS[0]:
         if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
@@ -306,13 +325,12 @@
             {
             case '\r', '\n', 0, _Z_:
               break;
+            case LS[0]:
+              if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+                break;
             default:
               if (c & 128)
-              {
-                c = decodeUTF8();
-                if (c == LSd || c == PSd)
-                  break;
-              }
+                decodeUTF8();
               continue;
             }
             break; // Exit loop.
@@ -685,8 +703,10 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         ++p;
         ++loc;
+        set_p_newl(p);
         continue;
       case LS[0]:
         if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
@@ -818,13 +838,12 @@
         {
         case '\r', '\n', 0, _Z_:
           break;
+        case LS[0]:
+          if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+            break;
         default:
           if (c & 128)
-          {
-            c = decodeUTF8();
-            if (c == LSd || c == PSd)
-              break;
-          }
+            decodeUTF8();
           continue;
         }
         break; // Exit loop.
@@ -1113,7 +1132,9 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         ++loc;
+        set_p_newl(p+1);
         continue;
       case 0, _Z_:
         error(MID.UnterminatedBlockComment);
@@ -1160,7 +1181,9 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         ++loc;
+        set_p_newl(p+1);
         continue;
       case 0, _Z_:
         error(MID.UnterminatedNestedComment);
@@ -1230,8 +1253,10 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         ++loc;
         c = '\n'; // Convert EndOfLine to \n.
+        set_p_newl(p+1);
         break;
       case 0, _Z_:
         error(MID.UnterminatedString);
@@ -1239,15 +1264,10 @@
       default:
         if (c & 128)
         {
-//           char* begin = p;
           c = decodeUTF8();
           if (c == LSd || c == PSd)
             goto case '\n';
 
-          // We don't copy per pointer because we might include
-          // invalid, skipped utf-8 sequences. See decodeUTF8().
-//           ++p;
-//           buffer ~= begin[0 .. p - begin];
           encodeUTF8(buffer, c);
           continue;
         }
@@ -1280,15 +1300,18 @@
     case '\'':
       ++p;
       id = MID.EmptyCharacterLiteral;
+    // fall through
     case '\n', '\r', 0, _Z_:
       goto Lerr;
+    case LS[0]:
+      if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+        goto Lerr;
+    // fall through
     default:
       uint c = *p;
       if (c & 128)
       {
         c = decodeUTF8();
-        if (c == LSd || c == PSd)
-          goto Lerr;
         if (c <= 0xFFFF)
           type = TOK.WCharLiteral;
         else
@@ -1337,8 +1360,10 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n'
         ++loc;
+        set_p_newl(p+1);
         break;
       case '`':
       case '"':
@@ -1403,7 +1428,9 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         ++loc;
+        set_p_newl(p+1);
         continue;
       default:
         if (ishexad(c))
@@ -1427,17 +1454,12 @@
           continue;
         }
         else if (isspace(c))
-          continue;
-
-        if (c & 128)
+          continue; // Skip spaces.
+        else if (c & 128)
         {
           c = decodeUTF8();
           if (c == LSd || c == PSd)
-          {
-            ++p; ++p;
-            ++loc;
-            continue;
-          }
+            goto case '\n';
         }
         else if (c == 0 || c == _Z_)
         {
@@ -1487,15 +1509,16 @@
           if (p[1] == '\n')
             ++p;
         case '\n':
+          assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
           ++p;
           ++loc;
+          set_p_newl(p);
           return '\n';
         case LS[0]:
           if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
           {
-            ++p; ++p; ++p;
-            ++loc;
-            return '\n';
+            ++p; ++p;
+            goto case '\n';
           }
         default:
         }
@@ -1504,8 +1527,8 @@
 
       // Skip leading newlines:
       while (scanNewline() != 0){}
-      assert(*p != '\n' && *p != '\r');
-      assert(!(*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])));
+      assert(*p != '\n' && *p != '\r' &&
+             !(*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])));
 
       char* begin = p;
       c = *p;
@@ -1554,8 +1577,10 @@
         if (p[1] == '\n')
           ++p;
       case '\n':
+        assert(*p == '\n' || *p == '\r' || *p == LS[2] || *p == PS[2]);
         c = '\n'; // Convert EndOfLine ('\r','\r\n','\n',LS,PS) to '\n'
         ++loc;
+        set_p_newl(p+1);
         break;
       case 0, _Z_:
         // TODO: error(MID.UnterminatedDelimitedString);