changeset 392:bb935c6f9b7a

Applied fixes and improvements to the Lexer class. In scanDelimitedStringLiteral: - leading newlines are skipped now. - passed c instead of closing_delim to isUniAlpha(). - the matching string delimiter must follow a newline. - corrected if-statements and an assert(). Added inTokenString member: - in scanTokenStringLiteral() this member is increment at the start and decremented at the end. - in scanSpecialTokenSequence() this member is checked and #line is not evaluated if it is not zero. Added missing break to case 'L' in scanNumber(). In scanReal(): - changed debug switch to a delegate inside an assert(). - removing underscores from buffer more efficiently. Made some other trivial changes and fixes to other modules.
author Aziz K?ksal <aziz.koeksal@gmail.com>
date Mon, 17 Sep 2007 20:35:08 +0200
parents 33b566df6af4
children fce1e6133dac
files trunk/src/cmd/Generate.d trunk/src/dil/Lexer.d trunk/src/main.d
diffstat 3 files changed, 121 insertions(+), 86 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/cmd/Generate.d	Sat Sep 15 17:12:26 2007 +0200
+++ b/trunk/src/cmd/Generate.d	Mon Sep 17 20:35:08 2007 +0200
@@ -12,14 +12,16 @@
 
 enum DocOption
 {
-  Tokens,
+  Empty,
+  Tokens = 1,
   Syntax = 1<<1,
-  HTML = 1<<2,
-  XML = 1<<3
+  HTML   = 1<<2,
+  XML    = 1<<3
 }
 
 void execute(string fileName, DocOption options)
 {
+  assert(options != DocOption.Empty);
   if (options & DocOption.Syntax)
     syntaxToDoc(fileName, Stdout, options);
   else
--- a/trunk/src/dil/Lexer.d	Sat Sep 15 17:12:26 2007 +0200
+++ b/trunk/src/dil/Lexer.d	Mon Sep 17 20:35:08 2007 +0200
@@ -39,6 +39,7 @@
 
   uint loc_old; /// Store actual line number when #line token is parsed.
   uint loc_hline; /// Line number set by #line.
+  private uint inTokenString; // > 0 if inside q{ }
 
   char[] fileName;
 
@@ -935,11 +936,12 @@
     t.type = TOK.String;
 
     char[] buffer;
-    dchar opening_delim, // 0 if no nested delimiter or '[', '(', '<', '{'
-          closing_delim; // Will be ']', ')', '>', '}', any other character
-                         // or the first, decoded character of an identifier.
-    char[] str_delim; // Identifier delimiter
-    uint level = 1;
+    dchar opening_delim = 0, // 0 if no nested delimiter or '[', '(', '<', '{'
+          closing_delim; // Will be ']', ')', '>', '},
+                         // the first character of an identifier or
+                         // any other Unicode/ASCII character.
+    char[] str_delim; // Identifier delimiter.
+    uint level = 1; // Counter for nestable delimiters.
 
     ++p; ++p; // Skip q"
     uint c = *p;
@@ -947,49 +949,65 @@
     {
     case '(':
       opening_delim = c;
-      closing_delim = ')'; // *p + 1
+      closing_delim = ')'; // c + 1
       break;
     case '[', '<', '{':
       opening_delim = c;
       closing_delim = c + 2; // Get to closing counterpart. Feature of ASCII table.
       break;
     default:
+      dchar scanNewline()
+      {
+        switch (*p)
+        {
+        case '\r':
+          if (p[1] == '\n')
+            ++p;
+        case '\n':
+          ++p;
+          ++loc;
+          return '\n';
+        case LS[0]:
+          if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
+          {
+            ++p; ++p; ++p;
+            ++loc;
+            return '\n';
+          }
+        default:
+        }
+        return 0;
+      }
+
+      // Skip leading newlines:
+      while (scanNewline() != 0){}
+      assert(*p != '\n' && *p != '\r');
+      assert(!(*p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2])));
+
       char* begin = p;
+      c = *p;
       closing_delim = c;
-      // TODO: What to do about newlines? Skip or accept as delimiter?
       // TODO: Check for non-printable characters?
       if (c & 128)
       {
         closing_delim = decodeUTF8();
-        if (!isUniAlpha(c))
-          break;
+        if (!isUniAlpha(closing_delim))
+          break; // Not an identifier.
       }
       else if (!isidbeg(c))
-        break;
-      // Parse identifier + newline
+        break; // Not an identifier.
+
+      // Parse Identifier + EndOfLine
       do
       { c = *++p; }
       while (isident(c) || c & 128 && isUniAlpha(decodeUTF8()))
       // Store identifier
       str_delim = begin[0..p-begin];
       // Scan newline
-      switch (*p)
+      if (scanNewline() == '\n')
+        --p; // Go back one because of "c = *++p;" in main loop.
+      else
       {
-      case '\r':
-        if (p[1] == '\n')
-          ++p;
-      case '\n':
-        ++loc;
-        break;
-      case LS[0]:
-        if (p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]))
-        {
-          ++p; ++p;
-          ++loc;
-          break;
-        }
-        // goto default;
-      default:
         // TODO: error(MID.ExpectedNewlineAfterIdentDelim);
       }
     }
@@ -997,7 +1015,8 @@
     bool checkStringDelim(char* p)
     {
       assert(str_delim.length != 0);
-      if (end-p >= str_delim.length && // Check remaining length.
+      if (buffer[$-1] == '\n' && // Last character copied to buffer must be '\n'.
+          end-p >= str_delim.length && // Check remaining length.
           p[0..str_delim.length] == str_delim) // Compare.
         return true;
       return false;
@@ -1016,7 +1035,7 @@
         ++loc;
         break;
       case 0, _Z_:
-//         error(MID.UnterminatedDelimitedString);
+        // TODO: error(MID.UnterminatedDelimitedString);
         goto Lreturn3;
       default:
         if (c & 128)
@@ -1027,14 +1046,20 @@
             goto case '\n';
           if (c == closing_delim)
           {
-            if (str_delim.length && checkStringDelim(begin))
+            if (str_delim.length)
             {
-              p = begin + str_delim.length;
-              goto Lreturn2;
+              if (checkStringDelim(begin))
+              {
+                p = begin + str_delim.length;
+                goto Lreturn2;
+              }
             }
-            assert(level == 1);
-            --level;
-            goto Lreturn;
+            else
+            {
+              assert(level == 1);
+              --level;
+              goto Lreturn;
+            }
           }
           encodeUTF8(buffer, c);
           continue;
@@ -1045,30 +1070,33 @@
             ++level;
           else if (c == closing_delim)
           {
-            if (str_delim.length && checkStringDelim(p))
+            if (str_delim.length)
             {
-              p += str_delim.length;
-              goto Lreturn2;
+              if (checkStringDelim(p))
+              {
+                p += str_delim.length;
+                goto Lreturn2;
+              }
             }
-            if (--level == 0)
+            else if (--level == 0)
               goto Lreturn;
           }
         }
       }
       buffer ~= c; // copy character to buffer
     }
-  Lreturn:
-    assert(*p == closing_delim);
+  Lreturn: // Character delimiter.
+    assert(c == closing_delim);
     assert(level == 0);
     ++p; // Skip closing delimiter.
-  Lreturn2:
+  Lreturn2: // String delimiter.
     if (*p == '"')
       ++p;
     // else
     // TODO: error(MID.ExpectedDblQuoteAfterDelim, str_delim.length ? str_delim : p[-1]);
 
     t.pf = scanPostfix();
-  Lreturn3:
+  Lreturn3: // Error.
     t.str = buffer ~ '\0';
     t.end = p;
   }
@@ -1077,10 +1105,10 @@
   {
     assert(p[0] == 'q' && p[1] == '{');
     t.type = TOK.String;
-    // Copy members that might be changed by subsequent tokens. Like #line for example.
-    auto loc_old = this.loc_old;
-    auto loc_hline = this.loc_hline;
-    auto filePath = this.fileName;
+
+    // A guard against changes to particular members:
+    // this.loc_old, this.loc_hline and this.fileName
+    ++inTokenString;
 
     uint loc = this.loc;
     uint level = 1;
@@ -1171,12 +1199,9 @@
     assert(buffer[$-1] == '\0');
     t.str = buffer;
 
-    // Restore possibly changed members.
-    this.loc_old = loc_old;
-    this.loc_hline = loc_hline;
-    this.fileName = filePath;
+    --inTokenString;
   }
-}
+} // version(D2)
 
   dchar scanEscapeSequence()
   {
@@ -1314,10 +1339,12 @@
     case 'L':
       if (p[1] == 'i')
         goto LscanReal;
+      break;
     case '.':
       if (p[1] == '.')
         break;
-    case 'i','f','F', 'e', 'E': // Imaginary and float literal suffix
+    case 'i','f','F', // Imaginary and float literal suffixes.
+         'e', 'E':    // Float exponent.
       goto LscanReal;
     default:
       if (*p == '_' || isoctal(*p))
@@ -1504,6 +1531,7 @@
       Long     = 2
     }
 
+    // Scan optional suffix: L, Lu, LU, u, uL, U or UL.
     Suffix suffix;
     while (1)
     {
@@ -1527,6 +1555,7 @@
       break;
     }
 
+    // Determine type of Integer.
     switch (suffix)
     {
     case Suffix.None:
@@ -1584,20 +1613,26 @@
   void scanReal(ref Token t)
   {
     if (*p == '.')
+    {
+      assert(p[1] != '.');
       // This function was called by scan() or scanNumber().
       while (isdigit(*++p) || *p == '_') {}
+    }
     else
-    {
       // This function was called by scanNumber().
-      debug switch (*p)
-      {
-      case 'L':
-        if (p[1] != 'i')
-          assert(0);
-      case 'i', 'f', 'F', 'e', 'E': break;
-      default: assert(0);
-      }
-    }
+      assert(delegate (){
+        switch (*p)
+        {
+        case 'L':
+          if (p[1] != 'i')
+            return false;
+        case 'i', 'f', 'F', 'e', 'E':
+          return true;
+        default:
+        }
+        return false;
+      }()
+      );
 
     // Scan exponent.
     if (*p == 'e' || *p == 'E')
@@ -1611,21 +1646,15 @@
         while (isdigit(*++p) || *p == '_') {}
     }
 
-    // Copy string to buffer ignoring underscores.
-    char[] buffer;
-    char* end = p;
-    p = t.start;
-    do
-    {
-      if (*p == '_')
-      {
-        ++p;
-        continue;
-      }
-      buffer ~= *p;
-      ++p;
-    } while (p != end)
-    buffer ~= 0;
+    // Copy whole number and remove underscores from buffer.
+    char[] buffer = t.start[0..p-t.start].dup;
+    uint j;
+    foreach (c; buffer)
+      if (c != '_')
+        buffer[j++] = c;
+    buffer.length = j; // Adjust length.
+    buffer ~= 0; // Terminate for C functions.
+
     finalizeFloat(t, buffer);
   }
 
@@ -1808,10 +1837,14 @@
       goto Lerr;
     }
 
-    this.loc_old = this.loc;
-    this.loc_hline = t.line_num.uint_ - 1;
-    if (t.line_filespec)
-      this.fileName = t.line_filespec.str;
+    // Evaluate #line only when not in token string.
+    if (!inTokenString)
+    {
+      this.loc_old = this.loc;
+      this.loc_hline = t.line_num.uint_ - 1;
+      if (t.line_filespec)
+        this.fileName = t.line_filespec.str;
+    }
     t.end = p;
 
     return;
--- a/trunk/src/main.d	Sat Sep 15 17:12:26 2007 +0200
+++ b/trunk/src/main.d	Mon Sep 17 20:35:08 2007 +0200
@@ -61,7 +61,7 @@
       else if(strbeg(arg, "-r"))
         regexps ~= arg[2..$];
       else if(strbeg(arg, "-l"))
-        levels = Integer.parse (arg[2..$]);
+        levels = Integer.toInt(arg[2..$]);
       else
         switch (arg)
         {