changeset 789:c1d5cfd7aa44

Implemented string literal conversion. Removed two MID messages. Added MSG.InvalidUTF8SequenceInString. Added toUTF16() and toUTF32(). Fixed escape sequences. Added formatBytes() and findInvalidUTF8Sequence().
author Aziz Köksal <aziz.koeksal@gmail.com>
date Mon, 25 Feb 2008 02:56:22 +0100
parents 139c9a6a39a8
children a83a07f6233d
files trunk/src/dil/Messages.d trunk/src/dil/Unicode.d trunk/src/dil/lexer/Lexer.d trunk/src/dil/parser/Parser.d trunk/src/lang_de.d trunk/src/lang_en.d trunk/src/lang_fi.d trunk/src/lang_tr.d
diffstat 8 files changed, 176 insertions(+), 70 deletions(-) [+]
line wrap: on
line diff
--- a/trunk/src/dil/Messages.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/dil/Messages.d	Mon Feb 25 02:56:22 2008 +0100
@@ -10,7 +10,7 @@
 {
   // Lexer messages:
   IllegalCharacter,
-  InvalidUnicodeCharacter,
+//   InvalidUnicodeCharacter,
   InvalidUTF8Sequence,
   // ''
   UnterminatedCharacterLiteral,
@@ -18,7 +18,7 @@
   // #line
   ExpectedIdentifierSTLine,
   ExpectedIntegerAfterSTLine,
-  ExpectedFilespec, // Deprecated.
+//   ExpectedFilespec,
   UnterminatedFilespec,
   UnterminatedSpecialToken,
   // ""
@@ -109,6 +109,7 @@
   auto UndefinedDDocMacro = "DDoc macro '{}' is undefined";
   auto UnterminatedDDocMacro = "DDoc macro '{}' has no closing ')'";
   // Parser messages:
+  auto InvalidUTF8SequenceInString = "invalid UTF-8 sequence in string literal: '{0}'";
   auto ModuleDeclarationNotFirst = "a module declaration is only allowed as the first declaration in a file";
   auto StringPostfixMismatch = "string literal has mistmatching postfix character";
   auto ExpectedIdAfterTypeDot = "expected identifier after '(Type).', not '{}'";
--- a/trunk/src/dil/Unicode.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/dil/Unicode.d	Mon Feb 25 02:56:22 2008 +0100
@@ -54,34 +54,37 @@
   return true;
 }
 
-/// index is set one past the last trail byte of the valid UTF-8 sequence.
+/// Decodes a character from str at index.
+/// Params:
+///   index = set to one past the ASCII char or one past the last trail byte
+///           of the valid UTF-8 sequence.
 dchar decode(char[] str, ref size_t index)
 in { assert(str.length && index < str.length); }
-out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
+out { assert(index <= str.length); }
 body
 {
   char* p = str.ptr + index;
   char* end = str.ptr + str.length;
   dchar c = decode(p, end);
   if (c != ERROR_CHAR)
-    index = p - str.ptr + 1;
+    index = p - str.ptr;
   return c;
 }
 
-/// ref_p is set to the last trail byte of the valid UTF-8 sequence.
+/// Decodes a character starting at ref_p.
+/// Params:
+///   ref_p = set to one past the ASCII char or one past the last trail byte
+///           of the valid UTF-8 sequence.
 dchar decode(ref char* ref_p, char* end)
 in { assert(ref_p && ref_p < end); }
-out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
+out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
 body
 {
   char* p = ref_p;
   dchar c = *p;
 
   if (c < 0x80)
-  {
-    ref_p++;
-    return c;
-  }
+    return ref_p++, c;
 
   p++; // Move to second byte.
   if (!(p < end))
@@ -141,11 +144,11 @@
 
   if (!isValidChar(c))
     return ERROR_CHAR;
-  ref_p = p;
+  ref_p = p+1;
   return c;
 }
 
-/// Encodes a character and appends it to str.
+/// Encodes c and appends it to str.
 void encode(ref char[] str, dchar c)
 {
   assert(isValidChar(c), "check if character is valid before calling encode().");
@@ -199,7 +202,7 @@
     assert(0);
 }
 
-/// Encodes a character and appends it to str.
+/// Encodes c and appends it to str.
 void encode(ref wchar[] str, dchar c)
 in { assert(isValidChar(c)); }
 body
@@ -218,11 +221,11 @@
   }
 }
 
-/// Returns a decoded character from a UTF-16 sequence.
-/// Returns: ERROR_CHAR in case of an error in the sequence.
+/// Decodes a character from a UTF-16 sequence.
 /// Params:
 ///   str = the UTF-16 sequence.
 ///   index = where to start from.
+/// Returns: ERROR_CHAR in case of an error in the sequence.
 dchar decode(wchar[] str, ref size_t index)
 {
   assert(str.length && index < str.length);
@@ -248,11 +251,11 @@
   return ERROR_CHAR;
 }
 
-/// Returns a decoded character from a UTF-16 sequence.
-/// Returns: ERROR_CHAR in case of an error in the sequence.
+/// Decodes a character from a UTF-16 sequence.
 /// Params:
 ///   p = start of the UTF-16 sequence.
 ///   end = one past the end of the sequence.
+/// Returns: ERROR_CHAR in case of an error in the sequence.
 dchar decode(ref wchar* p, wchar* end)
 {
   assert(p && p < end);
@@ -276,7 +279,10 @@
   return ERROR_CHAR;
 }
 
-/// Decode a character from a zero-terminated string.
+/// Decodes a character from a zero-terminated UTF-16 string.
+/// Params:
+///   p = start of the UTF-16 sequence.
+/// Returns: ERROR_CHAR in case of an error in the sequence.
 dchar decode(ref wchar* p)
 {
   assert(p);
@@ -299,3 +305,41 @@
   }
   return ERROR_CHAR;
 }
+
+/// Converts a UTF-8 string to a UTF-16 string.
+wchar[] toUTF16(char[] str)
+{
+  wchar[] result;
+  size_t idx;
+  while (idx < str.length)
+  {
+    auto c = decode(str, idx);
+    if (c == ERROR_CHAR)
+    { // Skip trail bytes.
+      while (++idx < str.length && isTrailByte(str[idx]))
+      {}
+      c = REPLACEMENT_CHAR;
+    }
+    encode(result, c);
+  }
+  return result;
+}
+
+/// Converts a UTF-8 string to a UTF-32 string.
+dchar[] toUTF32(char[] str)
+{
+  dchar[] result;
+  size_t idx;
+  while (idx < str.length)
+  {
+    auto c = decode(str, idx);
+    if (c == ERROR_CHAR)
+    { // Skip trail bytes.
+      while (++idx < str.length && isTrailByte(str[idx]))
+      {}
+      c = REPLACEMENT_CHAR;
+    }
+    result ~= c;
+  }
+  return result;
+}
--- a/trunk/src/dil/lexer/Lexer.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/dil/lexer/Lexer.d	Mon Feb 25 02:56:22 2008 +0100
@@ -355,8 +355,9 @@
         char[] buffer;
         do
         {
-          c = scanEscapeSequence();
-          if (isascii(c))
+          bool isBinary;
+          c = scanEscapeSequence(isBinary);
+          if (isascii(c) || isBinary)
             buffer ~= c;
           else
             encodeUTF8(buffer, c);
@@ -923,8 +924,9 @@
       char[] buffer;
       do
       {
-        c = scanEscapeSequence();
-        if (isascii(c))
+        bool isBinary;
+        c = scanEscapeSequence(isBinary);
+        if (isascii(c) || isBinary)
           buffer ~= c;
         else
           encodeUTF8(buffer, c);
@@ -1224,11 +1226,13 @@
         t.end = p;
         return;
       case '\\':
-        c = scanEscapeSequence();
+        bool isBinary;
+        c = scanEscapeSequence(isBinary);
         --p;
-        if (isascii(c))
-          break;
-        encodeUTF8(buffer, c);
+        if (isascii(c) || isBinary)
+          buffer ~= c;
+        else
+          encodeUTF8(buffer, c);
         continue;
       case '\r':
         if (p[1] == '\n')
@@ -1266,7 +1270,8 @@
     switch (*p)
     {
     case '\\':
-      t.dchar_ = scanEscapeSequence();
+      bool notused;
+      t.dchar_ = scanEscapeSequence(notused);
       break;
     case '\'':
       error(t.start, MID.EmptyCharacterLiteral);
@@ -1708,7 +1713,7 @@
   }
 } // version(D2)
 
-  dchar scanEscapeSequence()
+  dchar scanEscapeSequence(ref bool isBinary)
   out(result)
   { assert(isValidChar(result)); }
   body
@@ -1730,7 +1735,10 @@
     switch (*p)
     {
     case 'x':
+      isBinary = true;
+    case_Unicode:
       assert(c == 0);
+      assert(digits == 2 || digits == 4 || digits == 8);
       while (1)
       {
         ++p;
@@ -1744,31 +1752,34 @@
           else
             c += *p - 'a' + 10;
 
-          if (!--digits)
+          if (--digits == 0)
           {
             ++p;
             if (isValidChar(c))
               return c; // Return valid escape value.
 
-            error(sequenceStart, MID.InvalidUnicodeEscapeSequence, sequenceStart[0..p-sequenceStart]);
+            error(sequenceStart, MID.InvalidUnicodeEscapeSequence,
+                  sequenceStart[0..p-sequenceStart]);
             break;
           }
           continue;
         }
 
-        error(sequenceStart, MID.InsufficientHexDigits);
+        error(sequenceStart, MID.InsufficientHexDigits,
+              sequenceStart[0..p-sequenceStart]);
         break;
       }
       break;
     case 'u':
       digits = 4;
-      goto case 'x';
+      goto case_Unicode;
     case 'U':
       digits = 8;
-      goto case 'x';
+      goto case_Unicode;
     default:
       if (isoctal(*p))
       {
+        isBinary = true;
         assert(c == 0);
         c += *p - '0';
         ++p;
@@ -1782,7 +1793,7 @@
         c *= 8;
         c += *p - '0';
         ++p;
-        return c; // Return valid escape value.
+        return c & 0xFF; // Return valid escape value.
       }
       else if(*p == '&')
       {
@@ -2610,7 +2621,7 @@
       assert(!isTrailByte(p[1]));
     Lerr2:
       d = REPLACEMENT_CHAR;
-      error(this.p, MID.InvalidUTF8Sequence);
+      error(this.p, MID.InvalidUTF8Sequence, formatBytes(this.p, p));
     }
 
     this.p = p;
@@ -2668,6 +2679,39 @@
     else
      assert(0);
   }
+
+  /// Formats the bytes between start and end.
+  /// Returns: e.g.: abc -> \x61\x62\x63
+  static char[] formatBytes(char* start, char* end)
+  {
+    auto strLen = end-start;
+    const formatLen = `\xXX`.length;
+    char[] result = new char[strLen*formatLen]; // Reserve space.
+    result.length = 0;
+    foreach (c; cast(ubyte[])start[0..strLen])
+      result ~= Format("\\x{:X}", c);
+    return result;
+  }
+
+  /// Searches for an invalid UTF-8 sequence in str.
+  /// Returns: a formatted string of the invalid sequence (e.g. \xC0\x80).
+  static string findInvalidUTF8Sequence(string str)
+  {
+    char* p = str.ptr, end = p + str.length;
+    while (p < end)
+    {
+      if (decode(p, end) == ERROR_CHAR)
+      {
+        auto begin = p;
+        // Skip trail-bytes.
+        while (++p < end && isTrailByte(*p))
+        {}
+        return Lexer.formatBytes(begin, p);
+      }
+    }
+    assert(p == end);
+    return "";
+  }
 }
 
 unittest
--- a/trunk/src/dil/parser/Parser.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/dil/parser/Parser.d	Mon Feb 25 02:56:22 2008 +0100
@@ -17,12 +17,11 @@
 import dil.Enums;
 import dil.CompilerInfo;
 import dil.SourceText;
+import dil.Unicode;
 import common;
 
-/++
-  The Parser produces a full parse tree by examining
-  the list of tokens provided by the Lexer.
-+/
+/// The Parser produces a full parse tree by examining
+/// the list of tokens provided by the Lexer.
 class Parser
 {
   Lexer lexer; /// Used to lex the source code.
@@ -3160,20 +3159,29 @@
       nT();
       while (token.kind == T.String)
       {
-        if (postfix == '\0')
+        /+if (postfix == 0)
             postfix = token.pf;
-        else if (token.pf && token.pf != postfix)
+        else+/
+        if (token.pf && token.pf != postfix)
           error(token, MSG.StringPostfixMismatch);
-        str.length = str.length - 1;
+        str.length = str.length - 1; // Exclude '\0'.
         str ~= token.str;
         nT();
       }
       switch (postfix)
-      { // TODO: convert string
-      case 'w': e = new StringExpression(/+toUTF16+/(str)); break;
-      case 'd': e = new StringExpression(/+toUTF32+/(str)); break;
+      {
+      case 'w':
+        if (checkString(begin, str))
+          goto default;
+        e = new StringExpression(dil.Unicode.toUTF16(str)); break;
+      case 'd':
+        if (checkString(begin, str))
+          goto default;
+        e = new StringExpression(dil.Unicode.toUTF32(str)); break;
       case 'c':
-      default: e = new StringExpression(str); break;
+      default:
+        // No checking done to allow for binary data.
+        e = new StringExpression(str); break;
       }
       break;
     case T.LBracket:
@@ -4056,6 +4064,15 @@
     return idtok;
   }
 
+  /// Returns true if the string str has an invalid UTF-8 sequence.
+  bool checkString(Token* begin, string str)
+  {
+    auto utf8Seq = Lexer.findInvalidUTF8Sequence(str);
+    if (utf8Seq.length)
+      error(begin, MSG.InvalidUTF8SequenceInString, utf8Seq);
+    return utf8Seq.length != 0;
+  }
+
   /// Reports an error that has no message ID yet.
   void error(Token* token, char[] formatMsg, ...)
   {
--- a/trunk/src/lang_de.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/lang_de.d	Mon Feb 25 02:56:22 2008 +0100
@@ -8,15 +8,15 @@
 string[] messages = [
   // Lexer messages:
   "illegales Zeichen gefunden: '{0}'",
-  "ungültiges Unicodezeichen.",
-  "ungültige UTF-8-Sequenz.",
+//   "ungültiges Unicodezeichen.",
+  "ungültige UTF-8-Sequenz: '{0}'",
   // ''
   "unterminiertes Zeichenliteral.",
   "leeres Zeichenliteral.",
   // #line
   "erwartete 'line' nach '#'.",
   "Ganzzahl nach #line erwartet.",
-  `erwartete Dateispezifikation (z.B. "pfad\zur\datei".)`,
+//   `erwartete Dateispezifikation (z.B. "pfad\zur\datei".)`,
   "unterminierte Dateispezifikation (filespec.)",
   "ein Special Token muss mit einem Zeilenumbruch abgeschlossen werden.",
   // ""
@@ -34,7 +34,7 @@
   // \x \u \U
   "undefinierte Escapesequenz '{0}' gefunden.",
   "ungültige Unicode-Escapesequenz '{0}' gefunden.",
-  "unzureichende Anzahl von Hexziffern in Escapesequenz.",
+  "unzureichende Anzahl von Hexziffern in Escapesequenz: '{0}'",
   // \&[a-zA-Z][a-zA-Z0-9]+;
   "undefinierte HTML-Entität '{0}'",
   "unterminierte HTML-Entität '{0}'.",
@@ -66,7 +66,7 @@
 
   // Help messages:
   `dil v{0}
-Copyright (c) 2007, Aziz Köksal. Lizensiert unter der GPL3.
+Copyright (c) 2007-2008, Aziz Köksal. Lizensiert unter der GPL3.
 
 Befehle:
 {1}
@@ -88,4 +88,4 @@
   dil gen Parser.d --html --syntax > Parser.html`,
 
   ``,
-];
\ No newline at end of file
+];
--- a/trunk/src/lang_en.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/lang_en.d	Mon Feb 25 02:56:22 2008 +0100
@@ -8,15 +8,15 @@
 string[] messages = [
   // Lexer messages:
   "illegal character found: '{0}'",
-  "invalid Unicode character.",
-  "invalid UTF-8 sequence.",
+//   "invalid Unicode character.",
+  "invalid UTF-8 sequence: '{0}'",
   // ''
   "unterminated character literal.",
   "empty character literal.",
   // #line
   "expected 'line' after '#'.",
   "integer expected after #line",
-  `expected filespec string (e.g. "path\to\file".)`,
+//   `expected filespec string (e.g. "path\to\file".)`,
   "unterminated filespec string.",
   "expected a terminating newline after special token.",
   // ""
@@ -34,7 +34,7 @@
   // \x \u \U
   "found undefined escape sequence '{0}'.",
   "found invalid Unicode escape sequence '{0}'.",
-  "insufficient number of hex digits in escape sequence.",
+  "insufficient number of hex digits in escape sequence: '{0}'",
   // \&[a-zA-Z][a-zA-Z0-9]+;
   "undefined HTML entity '{0}'",
   "unterminated HTML entity '{0}'.",
@@ -66,7 +66,7 @@
 
   // Help messages:
   `dil v{0}
-Copyright (c) 2007 by Aziz Köksal. Licensed under the GPL3.
+Copyright (c) 2007-2008 by Aziz Köksal. Licensed under the GPL3.
 
 Subcommands:
 {1}
@@ -115,4 +115,4 @@
 
 Example:
   dil igraph src/main.d`,
-];
\ No newline at end of file
+];
--- a/trunk/src/lang_fi.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/lang_fi.d	Mon Feb 25 02:56:22 2008 +0100
@@ -8,15 +8,15 @@
 string[] messages = [
   // Lexer messages:
   "virheellinen merkki: '{0}'",
-  "virheellinen Unicode-merkki.",
-  "virheellinen UTF-8-merkkijono.",
+//   "virheellinen Unicode-merkki.",
+  "virheellinen UTF-8-merkkijono: '{0}'",
   // ''
   "päättämätön merkkiliteraali.",
   "tyhjä merkkiliteraali.",
   // #line
   "odotettiin rivinumeroa '#':n jälkeen.",
   "odotettiin kokonaislukua #line:n jälkeen",
-  `odotettiin tiedostomäärittelyn merkkijonoa (esim. "polku\tiedostoon")`,
+//   `odotettiin tiedostomäärittelyn merkkijonoa (esim. "polku\tiedostoon")`,
   "päättämätön tiedostomäärittely.",
   "odotettiin päättävää rivinvaihtoa erikoismerkin jälkeen.",
   // ""
@@ -34,7 +34,7 @@
   // \x \u \U
   "määrittelemätön escape-sekvenssi {0}.",
   "virheellinen Unicode escape-merkki '{0}'.",
-  "riittämätön määrä heksanumeroita escape-sekvenssissä.",
+  "riittämätön määrä heksanumeroita escape-sekvenssissä: '{0}'",
   // \&[a-zA-Z][a-zA-Z0-9]+;
   "määrittelemätön HTML-entiteetti '{0}'",
   "päättämätön HTML-entiteetti {0}.",
@@ -66,7 +66,7 @@
 
   // Help messages:
   `dil v{0}
-Copyright (c) 2007, Aziz Köksal. GPL3-lisensöity.
+Copyright (c) 2007-2008, Aziz Köksal. GPL3-lisensöity.
 
 Alikomennot:
 {1}
--- a/trunk/src/lang_tr.d	Sun Feb 24 03:19:02 2008 +0100
+++ b/trunk/src/lang_tr.d	Mon Feb 25 02:56:22 2008 +0100
@@ -8,15 +8,15 @@
 string[] messages = [
   // Lexer messages:
   "illegal karakter bulundu: '{0}'",
-  "geçersiz Unikod karakteri.",
-  "geçersiz UTF-8 serisi.",
+//   "geçersiz Unikod karakteri.",
+  "geçersiz UTF-8 serisi: '{0}'",
   // ''
   "kapanmamış karakter sabiti.",
   "boş karakter sabiti.",
   // #line
   "'#' karakter'den sonra 'line' beklendi.",
   "'#line''den sonra rakam beklendi.",
-  `filespec dizgisi beklendi (e.g. "yol\dosya".)`,
+//   `filespec dizgisi beklendi (e.g. "yol\dosya".)`,
   "kapanmamış filespec dizgisi.",
   "özel belirtici'den (special token) sonra yeni bir satır beklendi.",
   // ""
@@ -34,7 +34,7 @@
   // \x \u \U
   "tanımlanmamış çıkış serisi '{0}' bulundu.",
   "geçersiz Unikod çıkış serisi '{0}' bulundu.",
-  "heksadesimal çıkış serisi sayıları yeterli değil.",
+  "heksadesimal çıkış serisi sayıları yeterli değil: '{0}'",
   // \&[a-zA-Z][a-zA-Z0-9]+;
   "tanımlanmamış HTML varlık '{0}'",
   "kapanmamış HTML varlık '{0}'.",
@@ -66,7 +66,7 @@
 
   // Help messages:
   `dil v{0}
-Copyright (c) 2007, Aziz Köksal. Lisans GPL3.
+Copyright (c) 2007-2008, Aziz Köksal. Lisans GPL3.
 
 Komutlar:
 {1}
@@ -87,4 +87,4 @@
   dil gen Parser.d --html --syntax > Parser.html`,
 
   ``,
-];
\ No newline at end of file
+];