changeset 510:dd3ce87b3569

Added module dil.Unicode. Moved some functions from dil.Lexer to dil.Unicode. Added isIdentifierString() to dil.Lexer. Renamed isNonReservedIdentifier() to isReservedIdentifier().
author Aziz Köksal <aziz.koeksal@gmail.com>
date Thu, 13 Dec 2007 18:45:29 +0100
parents baa7c4c0be78
children aa73f669c298
files trunk/src/dil/Lexer.d trunk/src/dil/Module.d trunk/src/dil/Unicode.d
diffstat 3 files changed, 260 insertions(+), 76 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/dil/Lexer.d	Wed Dec 12 22:17:20 2007 +0200
+++ b/trunk/src/dil/Lexer.d	Thu Dec 13 18:45:29 2007 +0100
@@ -11,19 +11,15 @@
 import dil.HtmlEntities;
 import dil.CompilerInfo;
 import dil.IdTable;
+import dil.Unicode;
 import tango.stdc.stdlib : strtof, strtod, strtold;
 import tango.stdc.errno : errno, ERANGE;
 import tango.stdc.time : time_t, time, ctime;
 import tango.stdc.string : strlen;
-import std.utf;
-import std.uni;
 import common;
 
 public import dil.LexerFuncs;
 
-/// U+FFFD = �. Used to replace invalid Unicode characters.
-const dchar REPLACEMENT_CHAR = '\uFFFD';
-
 /++
   The Lexer analyzes the characters of a source text and
   produces a doubly-linked list of tokens.
@@ -1698,7 +1694,7 @@
 
   dchar scanEscapeSequence()
   out(result)
-  { assert(isEncodable(result)); }
+  { assert(isValidChar(result)); }
   body
   {
     assert(*p == '\\');
@@ -1735,7 +1731,7 @@
           if (!--digits)
           {
             ++p;
-            if (isEncodable(c))
+            if (isValidChar(c))
               return c; // Return valid escape value.
 
             error(sequenceStart, MID.InvalidUnicodeEscapeSequence, sequenceStart[0..p-sequenceStart]);
@@ -2418,80 +2414,35 @@
       table[k.str] = k;
   }
 
-  static bool isNonReservedIdentifier(char[] ident)
+  /// Checks whether str, taken as a whole, forms a valid D identifier.
+  /// Returns false for an empty string, a leading digit, malformed UTF-8
+  /// or any character that may not appear in an identifier.
+  static bool isIdentifierString(char[] str)
+  {
+    if (!str.length || isdigit(str[0]))
+      return false;
+    size_t pos; // Advanced by decode() past each decoded character.
+    while (pos < str.length)
+    {
+      auto ch = dil.Unicode.decode(str, pos);
+      if (ch == ERROR_CHAR)
+        return false; // Malformed UTF-8 sequence.
+      // ASCII characters must satisfy isident(); anything else
+      // must be a universal alpha.
+      if (!isident(ch) && (isascii(ch) || !isUniAlpha(ch)))
+        return false;
+    }
+    return true;
+  }
+
+  /// Returns true if str is a keyword or a special token (__FILE__, __LINE__ etc.)
+  static bool isReservedIdentifier(char[] str)
+  {
+    if (str.length == 0)
       return false;
 
     static Identifier[string] reserved_ids_table;
     if (reserved_ids_table is null)
       Lexer.loadKeywords(reserved_ids_table);
 
-    size_t idx = 1; // Index to the 2nd character in ident.
-    dchar isFirstCharUniAlpha()
-    {
-      idx = 0;
-      // NB: decode() could throw an Exception which would be
-      // caught by the next try-catch-block.
-      return isUniAlpha(std.utf.decode(ident, idx));
-    }
-
-    try
-    {
-      if (isidbeg(ident[0]) || !isascii(ident[0]) && isFirstCharUniAlpha())
-      {
-        foreach (dchar c; ident[idx..$])
-          if (!isident(c) && !isUniAlpha(c))
-            return false;
-      }
-    }
-    catch (Exception)
+    if (!isIdentifierString(str))
       return false;
 
-    return !(ident in reserved_ids_table);
-  }
-
-  /++
-    Returns true if d can be encoded as a UTF-8 sequence.
-  +/
-  bool isEncodable(dchar d)
-  {
-    return d < 0xD800 ||
-          (d > 0xDFFF && d <= 0x10FFFF);
-  }
-
-  /++
-    There are a total of 66 noncharacters.
-    Returns true if this is one of them.
-    See_also: Chapter 16.7 Noncharacters in Unicode 5.0
-  +/
-  bool isNoncharacter(dchar d)
-  {
-    return 0xFDD0 <= d && d <= 0xFDEF || // 32
-           d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE; // 34
-  }
-
-  /++
-    Returns true if this character is not a noncharacter, not a surrogate
-    code point and not higher than 0x10FFFF.
-  +/
-  bool isValidDecodedChar(dchar d)
-  {
-    return d < 0xD800 ||
-          (d > 0xDFFF && d < 0xFDD0) ||
-          (d > 0xFDEF && d <= 0x10FFFF && (d & 0xFFFF) < 0xFFFE);
-  }
-
-  /// Is this a trail byte of a UTF-8 sequence?
-  bool isTrailByte(ubyte b)
-  {
-    return (b & 0xC0) == 0x80; // 10xx_xxxx
-  }
-
-  /// Is this a lead byte of a UTF-8 sequence?
-  bool isLeadByte(ubyte b)
-  {
-    return (b & 0xC0) == 0xC0; // 11xx_xxxx
+    return (str in reserved_ids_table) !is null;
   }
 
   dchar decodeUTF8()
@@ -2553,7 +2504,7 @@
 
     assert(isTrailByte(*p));
 
-    if (!isEncodable(d))
+    if (!isValidChar(d))
     {
     Lerr:
       // Three cases:
@@ -2582,9 +2533,9 @@
 
   private void encodeUTF8(ref char[] str, dchar d)
   {
-    char[6] b;
+    char[6] b = void;
     assert(!isascii(d), "check for ASCII char before calling encodeUTF8().");
-    assert(isEncodable(d), "check that 'd' is encodable before calling encodeUTF8().");
+    assert(isValidChar(d), "check if character is valid before calling encodeUTF8().");
 
     if (d < 0x800)
     {
--- a/trunk/src/dil/Module.d	Wed Dec 12 22:17:20 2007 +0200
+++ b/trunk/src/dil/Module.d	Thu Dec 13 18:45:29 2007 +0100
@@ -58,7 +58,7 @@
       {
         // Take base name of file path as module name.
         auto str = (new FilePath(filePath)).name();
-        if (Lexer.isNonReservedIdentifier(str))
+        if (!Lexer.isReservedIdentifier(str))
         {
           this.moduleFQN = moduleName = str;
         }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trunk/src/dil/Unicode.d	Thu Dec 13 18:45:29 2007 +0100
@@ -0,0 +1,233 @@
+/++
+  Author: Aziz Köksal
+  License: GPL3
++/
+module dil.Unicode;
+public import std.uni : isUniAlpha;
+
+/// U+FFFD = �. Used to replace invalid Unicode characters.
+const dchar REPLACEMENT_CHAR = '\uFFFD';
+/// Invalid character (a surrogate code point, rejected by isValidChar()), returned on errors.
+const dchar ERROR_CHAR = 0xD800;
+
+/++
+  Tells whether d is a valid character, i.e. it is not a surrogate
+  code point (0xD800..0xDFFF) and not higher than 0x10FFFF.
++/
+bool isValidChar(dchar d)
+{
+  if (d < 0xD800)
+    return true; // Below the surrogate range.
+  return 0xDFFF < d && d <= 0x10FFFF;
+}
+
+/++
+  Returns true if d is one of the 66 Unicode noncharacters:
+  the 32 code points 0xFDD0..0xFDEF, plus every code point up to
+  0x10FFFF whose lower 16 bits are 0xFFFE or 0xFFFF (34 in total).
+  See_also: Chapter 16.7 Noncharacters in Unicode 5.0
++/
+bool isNoncharacter(dchar d)
+{
+  if (0xFDD0 <= d && d <= 0xFDEF)
+    return true; // 32
+  return d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE; // 34
+}
+
+/// Returns true if b is a trail (continuation) byte of a UTF-8 sequence.
+bool isTrailByte(ubyte b)
+{
+  // The two most significant bits must be 10 (10xx_xxxx).
+  return (b >> 6) == 0b10;
+}
+
+/// Returns true if b has the bit pattern of a UTF-8 lead byte,
+/// i.e. its two most significant bits are both set.
+bool isLeadByte(ubyte b)
+{
+  return b >= 0xC0; // 11xx_xxxx
+}
+
+/++
+  Returns a decoded character from a UTF-8 sequence.
+  In case of an error in the sequence ERROR_CHAR (0xD800) is returned
+  and index is left unchanged.
+  Params:
+    str = the UTF-8 sequence.
+    index = where to start from. Advanced past the decoded
+            character on success.
++/
+dchar decode(char[] str, ref size_t index)
+in { assert(str.length); }
+// ERROR_CHAR is a legitimate return value although it is not a valid char.
+out(c) { assert(c == ERROR_CHAR || isValidChar(c)); }
+body
+{
+  char* p = str.ptr + index;
+  char* end = str.ptr + str.length;
+
+  // Make sure there is a byte to read before dereferencing p.
+  if (!(p < end))
+    return ERROR_CHAR;
+
+  dchar c = *p;
+  if (c < 0x80)
+  {
+    ++index;
+    return c;
+  }
+
+  ++p; // Move to second byte.
+  if (!(p < end))
+    return ERROR_CHAR;
+
+  // Error if second byte is not a trail byte.
+  if (!isTrailByte(*p))
+    return ERROR_CHAR;
+
+  // Check for overlong sequences.
+  switch (c)
+  {
+  case 0xE0, // 11100000 100xxxxx
+       0xF0, // 11110000 1000xxxx
+       0xF8, // 11111000 10000xxx
+       0xFC: // 11111100 100000xx
+    if ((*p & c) == 0x80)
+      return ERROR_CHAR;
+    break; // The check below can never match these lead bytes.
+  default:
+    if ((c & 0xFE) == 0xC0) // 1100000x
+      return ERROR_CHAR;
+  }
+
+  // Fails on a truncated sequence (no more bytes) as well as
+  // on a byte that is not a trail byte.
+  const char[] checkNextByte = "if (!(++p < end && isTrailByte(*p)))"
+                                "  return ERROR_CHAR;";
+  const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;";
+
+  // Only committed to index once the whole sequence has been validated.
+  auto next_index = index;
+  // Decode
+  if ((c & 0b1110_0000) == 0b1100_0000)
+  {
+    // 110xxxxx 10xxxxxx
+    c &= 0b0001_1111;
+    mixin(appendSixBits);
+    next_index += 2;
+  }
+  else if ((c & 0b1111_0000) == 0b1110_0000)
+  {
+    // 1110xxxx 10xxxxxx 10xxxxxx
+    c &= 0b0000_1111;
+    mixin(appendSixBits ~
+          checkNextByte ~ appendSixBits);
+    next_index += 3;
+  }
+  else if ((c & 0b1111_1000) == 0b1111_0000)
+  {
+    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    c &= 0b0000_0111;
+    mixin(appendSixBits ~
+          checkNextByte ~ appendSixBits ~
+          checkNextByte ~ appendSixBits);
+    next_index += 4;
+  }
+  else
+    // 5 and 6 byte UTF-8 sequences are not allowed yet.
+    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+    // A stray trail byte in lead position also ends up here.
+    return ERROR_CHAR;
+
+  assert(isTrailByte(*p));
+
+  // Rejects encoded surrogates and values above 0x10FFFF.
+  if (!isValidChar(c))
+    return ERROR_CHAR;
+  index = next_index;
+  return c;
+}
+
+/// Appends the UTF-16 encoding of c to str.
+void encode(ref wchar[] str, dchar c)
+in { assert(isValidChar(c)); }
+body
+{
+  if (c >= 0x10000)
+  {
+    // Does not fit into one code unit: encode with a surrogate pair.
+    wchar[2] units = void;
+    c -= 0x10000; // c'
+    // higher10bits(c') | 0b1101_10xx_xxxx_xxxx
+    units[0] = (c >> 10) | 0xD800;
+    // lower10bits(c') | 0b1101_11yy_yyyy_yyyy
+    units[1] = (c & 0x3FF) | 0xDC00;
+    str ~= units;
+    return;
+  }
+  str ~= cast(wchar)c;
+}
+
+/++
+  Decodes one character from a UTF-16 sequence.
+  On success index is advanced past the decoded character.
+  If the sequence is malformed, 0xD800 (ERROR_CHAR) is returned and
+  index is not changed.
+  Params:
+    str = the UTF-16 sequence.
+    index = where to start from.
++/
+dchar decode(wchar[] str, ref size_t index)
+{
+  assert(str.length && index < str.length);
+  dchar lead = str[index];
+  if (lead < 0xD800 || 0xDFFF < lead)
+  {
+    // Not a surrogate: the code unit is the character itself.
+    index += 1;
+    return lead;
+  }
+  // A pair must begin with a high surrogate and needs a second unit.
+  if (lead > 0xDBFF || index+1 == str.length)
+    return ERROR_CHAR;
+  wchar trail = str[index+1];
+  if (trail < 0xDC00 || 0xDFFF < trail)
+    return ERROR_CHAR; // Second unit is not a low surrogate.
+  // ((lead - 0xD800) << 10) + 0x10000 ==
+  // (lead - 0xD800 + 0x40) << 10 ==
+  // (lead - 0xD7C0) << 10
+  lead = (lead - 0xD7C0) << 10;
+  lead |= (trail & 0x3FF);
+  index += 2;
+  return lead;
+}
+
+/++
+  Decodes one character from a UTF-16 sequence.
+  On success p is advanced past the decoded character.
+  If the sequence is malformed, 0xD800 (ERROR_CHAR) is returned and
+  p is not changed.
+  Params:
+    p = start of the UTF-16 sequence.
+    end = one past the end of the sequence.
++/
+dchar decode(ref wchar* p, wchar* end)
+{
+  assert(p && p < end);
+  dchar lead = *p;
+  if (lead < 0xD800 || 0xDFFF < lead)
+  {
+    // Not a surrogate: the code unit is the character itself.
+    p += 1;
+    return lead;
+  }
+  // A pair must begin with a high surrogate and needs a second unit.
+  if (lead > 0xDBFF || p+1 == end)
+    return ERROR_CHAR;
+  wchar trail = p[1];
+  if (trail < 0xDC00 || 0xDFFF < trail)
+    return ERROR_CHAR; // Second unit is not a low surrogate.
+  lead = (lead - 0xD7C0) << 10;
+  lead |= (trail & 0x3FF);
+  p += 2;
+  return lead;
+}
+
+/// Decodes one character from a zero-terminated UTF-16 string.
+/// On success p is advanced past the decoded character; on a malformed
+/// sequence ERROR_CHAR is returned and p is not changed.
+dchar decode(ref wchar* p)
+{
+  assert(p);
+  dchar lead = *p;
+  if (lead < 0xD800 || 0xDFFF < lead)
+  {
+    // Not a surrogate: the code unit is the character itself.
+    p += 1;
+    return lead;
+  }
+  // A pair must begin with a high surrogate.
+  if (lead > 0xDBFF)
+    return ERROR_CHAR;
+  // lead is non-zero here, so p[1] is at worst the terminator.
+  wchar trail = p[1];
+  if (trail < 0xDC00 || 0xDFFF < trail)
+    return ERROR_CHAR; // Second unit is not a low surrogate.
+  lead = (lead - 0xD7C0) << 10;
+  lead |= (trail & 0x3FF);
+  p += 2;
+  return lead;
+}