changeset 430:e6c759e151cd

Fixed a few things regarding encoding/decoding UTF-8 sequences. When an escape sequence with an invalid Unicode code point is scanned, an error is reported and REPLACEMENT_CHAR is encoded instead. The same is done when an invalid UTF-8 sequence is encountered. Added a few stub methods to class Scope.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Wed, 03 Oct 2007 23:00:46 +0200
parents 052cbacb1642
children 7a6bfa569a52
files trunk/src/dil/Lexer.d trunk/src/dil/Scope.d
diffstat 2 files changed, 74 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/dil/Lexer.d	Wed Oct 03 17:13:50 2007 +0200
+++ b/trunk/src/dil/Lexer.d	Wed Oct 03 23:00:46 2007 +0200
@@ -18,12 +18,15 @@
 import std.uni;
 import common;
 
-const char[3] LS = \u2028;
-const char[3] PS = \u2029;
+const char[3] LS = \u2028; /// Line separator.
+const char[3] PS = \u2029; /// Paragraph separator.
 
 const dchar LSd = 0x2028;
 const dchar PSd = 0x2029;
 
+/// U+FFFD = �. Used to replace invalid Unicode characters.
+const dchar REPLACEMENT_CHAR = '\uFFFD';
+
 const uint _Z_ = 26; /// Control+Z
 
 class Lexer
@@ -1829,7 +1832,7 @@
       }
       if (!isEncodable(c))
       {
-        c = 0;
+        c = REPLACEMENT_CHAR;
         error(sequenceStart, MID.InvalidUnicodeCharacter);
       }
       return c;
@@ -2555,13 +2558,35 @@
     return !(ident in reserved_ids_table);
   }
 
-  /+
+  /++
     Returns true if d can be encoded as a UTF-8 sequence.
   +/
   bool isEncodable(dchar d)
   {
     return d < 0xD800 ||
-          (d > 0xDFFF && d <= 0x10FFFF && d != 0xFFFF && d != 0xFFFE);
+          (d > 0xDFFF && d <= 0x10FFFF);
+  }
+
+  /++
+    There are a total of 66 noncharacters.
+    Returns true if this is one of them.
+    See_also: Chapter 16.7 Noncharacters in Unicode 5.0
+  +/
+  bool isNoncharacter(dchar d)
+  {
+    return 0xFDD0 <= d && d <= 0xFDEF || // 32
+           d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE; // 34
+  }
+
+  /++
+    Returns true if this character is not a noncharacter, not a surrogate
+    code point and not higher than 0x10FFFF.
+  +/
+  bool isValidDecodedChar(dchar d)
+  {
+    return d < 0xD800 ||
+          (d > 0xDFFF && d < 0xFDD0) ||
+          (d > 0xFDEF && d <= 0x10FFFF && (d & 0xFFFF) < 0xFFFE);
   }
 
   /// Is this a trail byte of a UTF-8 sequence?
@@ -2654,7 +2679,7 @@
       --p;
       assert(!isTrailByte(p[1]));
     Lerr2:
-      d = 0;
+      d = REPLACEMENT_CHAR;
       error(this.p, MID.InvalidUTF8Sequence);
     }
 
@@ -2665,10 +2690,9 @@
   private void encodeUTF8(ref char[] str, dchar d)
   {
     char[6] b;
-    assert(!isascii(d) || d == 0, "check for ASCII char before calling encodeUTF8().");
+    assert(!isascii(d), "check for ASCII char before calling encodeUTF8().");
     assert(isEncodable(d), "check that 'd' is encodable before calling encodeUTF8().");
-    if (d == 0)
-      return;
+
     if (d < 0x800)
     {
       b[0] = 0xC0 | (d >> 6);
--- a/trunk/src/dil/Scope.d	Wed Oct 03 17:13:50 2007 +0200
+++ b/trunk/src/dil/Scope.d	Wed Oct 03 23:00:46 2007 +0200
@@ -3,9 +3,50 @@
   License: GPL3
 +/
 module dil.Scope;
+import dil.Symbol;
 import common;
 
 class Scope
 {
+  Scope parent; /// The surrounding scope.
 
+  this()
+  {
+  }
+
+  /++
+    Find an identifier in this scope.
+  +/
+  Symbol find(char[] ident)
+  {
+
+  }
+
+  /++
+    Add a symbol to this scope.
+  +/
+  void add(Symbol sym)
+  {
+
+  }
+
+  /++
+    Create a new inner scope.
+  +/
+  Scope push()
+  {
+    auto sc = new Scope();
+    sc.parent = this;
+    return sc;
+  }
+
+  /++
+    Destroy this scope and return the outer scope.
+  +/
+  Scope pop()
+  {
+    auto sc = parent;
+    // delete this;
+    return sc;
+  }
 }