changeset 390:4d36eea1bbc9

Refactored Lexer.scan(). Illegal characters are no longer ignored; they are now reported as errors. Added a new member 'ws' to Token: when a token is scanned, the lexer sets ws to the start of the leading whitespace, or leaves it null when no whitespace was found. Added Illegal to enum TOK and IllegalCharacter to enum MID. Added localized messages for MID.IllegalCharacter. Adapted the code of cmd.Generate to make use of Token.ws (a usage sketch follows the diffstat below).
author Aziz Köksal <aziz.koeksal@gmail.com>
date Wed, 12 Sep 2007 21:03:41 +0200
parents c4bfceab7246
children 33b566df6af4
files trunk/src/cmd/Generate.d trunk/src/dil/Lexer.d trunk/src/dil/Messages.d trunk/src/dil/Token.d trunk/src/lang_de.d trunk/src/lang_en.d trunk/src/lang_fi.d trunk/src/lang_tr.d
diffstat 8 files changed, 67 insertions(+), 45 deletions(-) [+]
line wrap: on
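
The commit message describes the new Token.ws field: instead of tracking an `end` pointer in cmd.Generate and printing the gap between tokens, the lexer itself records where a token's leading whitespace begins. Below is a minimal, self-contained sketch (not dil's actual types or API; the struct and function names are illustrative, and it uses present-day D rather than the D1 of this changeset) showing how a ws-style pointer lets a consumer reproduce the original source layout by re-emitting each token's leading whitespace before the token text.

    // Minimal sketch: a Token.ws-style field used to reconstruct source layout.
    import std.stdio;

    struct Token
    {
        const(char)* ws;    // start of leading whitespace, or null if none
        const(char)* start; // start of the token's text in the source
        const(char)* end;   // one past the end of the token's text
    }

    void printWithWhitespace(Token[] tokens)
    {
        foreach (t; tokens)
        {
            // Emit the leading whitespace exactly as it appeared in the source.
            if (t.ws)
                write(t.ws[0 .. t.start - t.ws]);
            // Emit the token text itself.
            write(t.start[0 .. t.end - t.start]);
        }
    }

    void main()
    {
        string src = "int  x;";
        auto p = src.ptr;
        // Hand-built tokens over `src` for demonstration: "int", "x", ";".
        Token[] toks = [
            Token(null,  p,     p + 3), // "int", no leading whitespace
            Token(p + 3, p + 5, p + 6), // "x", preceded by two spaces
            Token(null,  p + 6, p + 7), // ";"
        ];
        printWithWhitespace(toks); // prints: int  x;
        writeln();
    }
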
line diff
--- a/trunk/src/cmd/Generate.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/cmd/Generate.d	Wed Sep 12 21:03:41 2007 +0200
@@ -251,7 +251,6 @@
   auto lx = parser.lx;
 
   auto token = lx.head;
-  char* end = lx.text.ptr;
 
   writefln(tags[DocPart.Head]);
   // Output error messages.
@@ -312,10 +311,6 @@
   {
     token = token.next;
 
-    // Print whitespace between previous and current token.
-    if (end != token.start)
-      writef("%s", end[0 .. token.start - end]);
-
     Node[]* nodes = token in beginNodes;
 
     if (nodes)
@@ -336,8 +331,6 @@
         else
           writef(tags[DocPart.SyntaxEnd], getTag(node.category));
     }
-
-    end = token.end;
   }
   writef(tags[DocPart.SrcEnd], tags[DocPart.Tail]);
 }
@@ -349,7 +342,6 @@
   auto lx = new Lexer(sourceText, fileName);
 
   auto token = lx.getTokens();
-  char* end = lx.text.ptr;
 
   writefln(tags[DocPart.Head]);
 
@@ -368,12 +360,7 @@
   while (token.type != TOK.EOF)
   {
     token = token.next;
-
-    // Print whitespace between previous and current token.
-    if (end != token.start)
-      writef("%s", end[0 .. token.start - end]);
     printToken(token, tags);
-    end = token.end;
   }
   writef(\n, tags[DocPart.SrcEnd], \n, tags[DocPart.Tail]);
 }
@@ -383,6 +370,10 @@
   alias DocPart DP;
   string srcText = xml_escape(token.srcText);
 
+  // Print whitespace.
+  if (token.ws)
+    writef(token.ws[0..token.start - token.ws]);
+
   switch(token.type)
   {
   case TOK.Identifier:
--- a/trunk/src/dil/Lexer.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/dil/Lexer.d	Wed Sep 12 21:03:41 2007 +0200
@@ -173,9 +173,41 @@
   }
   body
   {
-    uint c = *p;
+    // Scan whitespace.
+    auto pws = p;
+    while (1)
+    {
+      switch (*p)
+      {
+      case '\r':
+        if (p[1] == '\n')
+          ++p;
+      case '\n':
+        ++p;
+        ++loc;
+        continue;
+      case LS[0]:
+        if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+        {
+          p += 3;
+          ++loc;
+          continue;
+        }
+        // goto default;
+      default:
+        if (!isspace(*p))
+          break;
+        ++p;
+        continue;
+      }
+      break; // Exit loop.
+    }
 
-    while (1)
+    if (p != pws)
+      t.ws = pws;
+
+    // Scan token.
+    uint c = *p;
     {
       t.start = p;
 
@@ -189,26 +221,6 @@
         return;
       }
 
-      if (c == '\n')
-      {
-        c = *++p;
-        ++loc;
-        continue;
-      }
-      else if (c == '\r')
-      {
-        c = *++p;
-        if (c != '\n')
-          ++loc;
-        continue;
-      }
-      else if (c == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-      {
-        p += 3;
-        c = *p;
-        continue;
-      }
-
       if (isidbeg(c))
       {
         if (c == 'r' && p[1] == '"' && ++p)
@@ -646,9 +658,20 @@
       default:
       }
 
-      if (c & 128 && isUniAlpha(decodeUTF8()))
-        goto Lidentifier;
-      c = *++p;
+      if (c & 128)
+      {
+        c = decodeUTF8();
+        if (isUniAlpha(c))
+          goto Lidentifier;
+      }
+
+      error(MID.IllegalCharacter, cast(dchar)c);
+
+      ++p;
+      t.type = TOK.Illegal;
+      t.dchar_ = c;
+      t.end = p;
+      return;
     }
   }
 
--- a/trunk/src/dil/Messages.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/dil/Messages.d	Wed Sep 12 21:03:41 2007 +0200
@@ -10,6 +10,7 @@
 enum MID
 {
   // Lexer messages:
+  IllegalCharacter,
   InvalidUnicodeCharacter,
   InvalidUTF8Sequence,
   // ''
--- a/trunk/src/dil/Token.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/dil/Token.d	Wed Sep 12 21:03:41 2007 +0200
@@ -18,12 +18,13 @@
 
   /// Flag for whitespace tokens that must be ignored in the parsing phase.
   Whitespace = 0x8000,
-  Comment = 1 | Whitespace,
-  Shebang = 2 | Whitespace,
-  HashLine = 3 | Whitespace,
-  Filespec = 4 | Whitespace,
+  Illegal = 1 | Whitespace,
+  Comment = 2 | Whitespace,
+  Shebang = 3 | Whitespace,
+  HashLine = 4 | Whitespace,
+  Filespec = 5 | Whitespace,
 
-  Identifier = 5,
+  Identifier = 6,
   String,
   CharLiteral, WCharLiteral, DCharLiteral,
 
@@ -121,8 +122,9 @@
 
   Token* next, prev;
 
-  char* start;
-  char* end;
+  char* ws;    /// Start of whitespace characters before token. Null if no WS.
+  char* start; /// Start of token in source text.
+  char* end;   /// Points one past the end of token in source text.
 
   union
   {
@@ -236,6 +238,7 @@
 const string[] tokToString = [
   "Invalid",
 
+  "Illegal",
   "Comment",
   "#! /shebang/",
   "#line",
--- a/trunk/src/lang_de.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_de.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "illegales Zeichen gefunden: '{1}'",
   "ungültiges Unicodezeichen.",
   "ungültige UTF-8-Sequenz.",
   // ''
--- a/trunk/src/lang_en.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_en.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "illegal character found: '{1}'",
   "invalid Unicode character.",
   "invalid UTF-8 sequence.",
   // ''
--- a/trunk/src/lang_fi.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_fi.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "", // TODO: translate
   "virheellinen Unicode-merkki.",
   "virheellinen UTF-8-merkkijono.",
   // ''
--- a/trunk/src/lang_tr.d	Wed Sep 12 18:18:29 2007 +0200
+++ b/trunk/src/lang_tr.d	Wed Sep 12 21:03:41 2007 +0200
@@ -7,6 +7,7 @@
 
 string[] messages = [
   // Lexer messages:
+  "illegal karakter bulundu: '{1}'",
   "geçersiz Unikod karakteri.",
   "geçersiz UTF-8 serisi.",
   // ''