changeset 390:4d36eea1bbc9

Refactored Lexer.scan(). Illegal characters are no longer ignored; they are now reported as errors. Added a new member 'ws' to Token: when a token is scanned, the lexer sets ws to the start of the leading whitespace, or leaves it null when no whitespace was found. Added Illegal to enum TOK and IllegalCharacter to enum MID. Added localized messages for MID.IllegalCharacter. Adapted the code of cmd.Generate to make use of Token.ws (a usage sketch follows the diffstat below).
author Aziz Köksal <aziz.koeksal@gmail.com>
date Wed, 12 Sep 2007 21:03:41 +0200
parents c4bfceab7246
children 33b566df6af4
files trunk/src/cmd/Generate.d trunk/src/dil/Lexer.d trunk/src/dil/Messages.d trunk/src/dil/Token.d trunk/src/lang_de.d trunk/src/lang_en.d trunk/src/lang_fi.d trunk/src/lang_tr.d
diffstat 8 files changed, 67 insertions(+), 45 deletions(-) [+]
line wrap: on
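
The commit message describes the new Token.ws field: instead of tracking an `end` pointer in cmd.Generate and printing the gap between tokens, the lexer itself records where a token's leading whitespace begins. Below is a minimal, self-contained sketch (not dil's actual types or API; the struct and function names are illustrative, and it uses present-day D rather than the D1 of this changeset) showing how a ws-style pointer lets a consumer reproduce the original source layout by re-emitting each token's leading whitespace before the token text.

    // Minimal sketch: a Token.ws-style field used to reconstruct source layout.
    import std.stdio;

    struct Token
    {
        const(char)* ws;    // start of leading whitespace, or null if none
        const(char)* start; // start of the token's text in the source
        const(char)* end;   // one past the end of the token's text
    }

    void printWithWhitespace(Token[] tokens)
    {
        foreach (t; tokens)
        {
            // Emit the leading whitespace exactly as it appeared in the source.
            if (t.ws)
                write(t.ws[0 .. t.start - t.ws]);
            // Emit the token text itself.
            write(t.start[0 .. t.end - t.start]);
        }
    }

    void main()
    {
        string src = "int  x;";
        auto p = src.ptr;
        // Hand-built tokens over `src` for demonstration: "int", "x", ";".
        Token[] toks = [
            Token(null,  p,     p + 3), // "int", no leading whitespace
            Token(p + 3, p + 5, p + 6), // "x", preceded by two spaces
            Token(null,  p + 6, p + 7), // ";"
        ];
        printWithWhitespace(toks); // prints: int  x;
        writeln();
    }
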
line diff
--- a/trunk/src/cmd/Generate.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/cmd/Generate.d	Wed Sep 12 21:03:41 2007 +0200
@@ -251,7 +251,6 @@
   auto lx = parser.lx;
 
   auto token = lx.head;
-  char* end = lx.text.ptr;
 
   writefln(tags[DocPart.Head]);
   // Output error messages.
@@ -312,10 +311,6 @@
   {
     token = token.next;
 
-    // Print whitespace between previous and current token.
-    if (end != token.start)
-      writef("%s", end[0 .. token.start - end]);
-
     Node[]* nodes = token in beginNodes;
 
     if (nodes)
@@ -336,8 +331,6 @@
         else
           writef(tags[DocPart.SyntaxEnd], getTag(node.category));
     }
-
-    end = token.end;
   }
   writef(tags[DocPart.SrcEnd], tags[DocPart.Tail]);
 }
@@ -349,7 +342,6 @@
   auto lx = new Lexer(sourceText, fileName);
 
   auto token = lx.getTokens();
-  char* end = lx.text.ptr;
 
   writefln(tags[DocPart.Head]);
 
@@ -368,12 +360,7 @@
   while (token.type != TOK.EOF)
   {
     token = token.next;
-
-    // Print whitespace between previous and current token.
-    if (end != token.start)
-      writef("%s", end[0 .. token.start - end]);
     printToken(token, tags);
-    end = token.end;
   }
   writef(\n, tags[DocPart.SrcEnd], \n, tags[DocPart.Tail]);
 }
@@ -383,6 +370,10 @@
   alias DocPart DP;
   string srcText = xml_escape(token.srcText);
 
+  // Print whitespace.
+  if (token.ws)
+    writef(token.ws[0..token.start - token.ws]);
+
   switch(token.type)
   {
   case TOK.Identifier:
--- a/trunk/src/dil/Lexer.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/dil/Lexer.d	Wed Sep 12 21:03:41 2007 +0200
@@ -173,9 +173,41 @@
   }
   body
   {
-    uint c = *p;
+    // Scan whitespace.
+    auto pws = p;
+    while (1)
+    {
+      switch (*p)
+      {
+      case '\r':
+        if (p[1] == '\n')
+          ++p;
+      case '\n':
+        ++p;
+        ++loc;
+        continue;
+      case LS[0]:
+        if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+        {
+          p += 3;
+          ++loc;
+          continue;
+        }
+        // goto default;
+      default:
+        if (!isspace(*p))
+          break;
+        ++p;
+        continue;
+      }
+      break; // Exit loop.
+    }
 
-    while (1)
+    if (p != pws)
+      t.ws = pws;
+
+    // Scan token.
+    uint c = *p;
     {
       t.start = p;
 
@@ -189,26 +221,6 @@
         return;
       }
 
-      if (c == '\n')
-      {
-        c = *++p;
-        ++loc;
-        continue;
-      }
-      else if (c == '\r')
-      {
-        c = *++p;
-        if (c != '\n')
-          ++loc;
-        continue;
-      }
-      else if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-      {
-        p += 3;
-        c = *p;
-        continue;
-      }
-
       if (isidbeg(c))
       {
         if (c == 'r' && p[1] == '"' && ++p)
@@ -646,9 +658,20 @@
       default:
       }
 
-      if (c & 128 && isUniAlpha(decodeUTF8()))
-        goto Lidentifier;
-      c = *++p;
+      if (c & 128)
+      {
+        c = decodeUTF8();
+        if (isUniAlpha(c))
+          goto Lidentifier;
+      }
+
+      error(MID.IllegalCharacter, cast(dchar)c);
+
+      ++p;
+      t.type = TOK.Illegal;
+      t.dchar_ = c;
+      t.end = p;
+      return;
     }
   }
 
--- a/trunk/src/dil/Messages.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/dil/Messages.d	Wed Sep 12 21:03:41 2007 +0200
@@ -10,6 +10,7 @@
 enum MID
 {
   // Lexer messages:
+  IllegalCharacter,
   InvalidUnicodeCharacter,
   InvalidUTF8Sequence,
   // ''
--- a/trunk/src/dil/Token.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/dil/Token.d	Wed Sep 12 21:03:41 2007 +0200
@@ -18,12 +18,13 @@
 
   /// Flag for whitespace tokens that must be ignored in the parsing phase.
   Whitespace = 0x8000,
-  Comment = 1 | Whitespace,
-  Shebang = 2 | Whitespace,
-  HashLine = 3 | Whitespace,
-  Filespec = 4 | Whitespace,
+  Illegal = 1 | Whitespace,
+  Comment = 2 | Whitespace,
+  Shebang = 3 | Whitespace,
+  HashLine = 4 | Whitespace,
+  Filespec = 5 | Whitespace,
 
-  Identifier = 5,
+  Identifier = 6,
   String,
   CharLiteral, WCharLiteral, DCharLiteral,
 
@@ -121,8 +122,9 @@
 
   Token* next, prev;
 
-  char* start;
-  char* end;
+  char* ws;    /// Start of whitespace characters before token. Null if no WS.
+  char* start; /// Start of token in source text.
+  char* end;   /// Points one past the end of token in source text.
 
   union
   {
@@ -236,6 +238,7 @@
 const string[] tokToString = [
   "Invalid",
 
+  "Illegal",
   "Comment",
   "#! /shebang/",
   "#line",
--- a/trunk/src/lang_de.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_de.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "illegales Zeichen gefunden: '{1}'",
   "ungültiges Unicodezeichen.",
   "ungültige UTF-8-Sequenz.",
   // ''
--- a/trunk/src/lang_en.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_en.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "illegal character found: '{1}'",
   "invalid Unicode character.",
   "invalid UTF-8 sequence.",
   // ''
--- a/trunk/src/lang_fi.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_fi.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "", // TODO: translate
   "virheellinen Unicode-merkki.",
   "virheellinen UTF-8-merkkijono.",
   // ''
--- a/trunk/src/lang_tr.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_tr.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "illegal karakter bulundu: '{1}'",
   "geçersiz Unikod karakteri.",
   "geçersiz UTF-8 serisi.",
   // ''