# HG changeset patch # User Aziz Köksal # Date 1203904582 -3600 # Node ID c1d5cfd7aa4422817a4d34fe3134862e5c51f2b9 # Parent 139c9a6a39a8ba73280fa96b039ddcd9e49a2405 Implemented string literal conversion. Removed two MID messages. Added MSG.InvalidUTF8SequenceInString. Added toUTF16() and toUTF32(). Fixed escape sequences. Added formatBytes() and findInvalidUTF8Sequence(). diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/dil/Messages.d --- a/trunk/src/dil/Messages.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/dil/Messages.d Mon Feb 25 02:56:22 2008 +0100 @@ -10,7 +10,7 @@ { // Lexer messages: IllegalCharacter, - InvalidUnicodeCharacter, +// InvalidUnicodeCharacter, InvalidUTF8Sequence, // '' UnterminatedCharacterLiteral, @@ -18,7 +18,7 @@ // #line ExpectedIdentifierSTLine, ExpectedIntegerAfterSTLine, - ExpectedFilespec, // Deprecated. +// ExpectedFilespec, UnterminatedFilespec, UnterminatedSpecialToken, // "" @@ -109,6 +109,7 @@ auto UndefinedDDocMacro = "DDoc macro '{}' is undefined"; auto UnterminatedDDocMacro = "DDoc macro '{}' has no closing ')'"; // Parser messages: + auto InvalidUTF8SequenceInString = "invalid UTF-8 sequence in string literal: '{0}'"; auto ModuleDeclarationNotFirst = "a module declaration is only allowed as the first declaration in a file"; auto StringPostfixMismatch = "string literal has mistmatching postfix character"; auto ExpectedIdAfterTypeDot = "expected identifier after '(Type).', not '{}'"; diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/dil/Unicode.d --- a/trunk/src/dil/Unicode.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/dil/Unicode.d Mon Feb 25 02:56:22 2008 +0100 @@ -54,34 +54,37 @@ return true; } -/// index is set one past the last trail byte of the valid UTF-8 sequence. +/// Decodes a character from str at index. +/// Params: +/// index = set to one past the ASCII char or one past the last trail byte +/// of the valid UTF-8 sequence. 
dchar decode(char[] str, ref size_t index) in { assert(str.length && index < str.length); } -out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } +out { assert(index <= str.length); } body { char* p = str.ptr + index; char* end = str.ptr + str.length; dchar c = decode(p, end); if (c != ERROR_CHAR) - index = p - str.ptr + 1; + index = p - str.ptr; return c; } -/// ref_p is set to the last trail byte of the valid UTF-8 sequence. +/// Decodes a character starting at ref_p. +/// Params: +/// ref_p = set to one past the ASCII char or one past the last trail byte +/// of the valid UTF-8 sequence. dchar decode(ref char* ref_p, char* end) in { assert(ref_p && ref_p < end); } -out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } +out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); } body { char* p = ref_p; dchar c = *p; if (c < 0x80) - { - ref_p++; - return c; - } + return ref_p++, c; p++; // Move to second byte. if (!(p < end)) @@ -141,11 +144,11 @@ if (!isValidChar(c)) return ERROR_CHAR; - ref_p = p; + ref_p = p+1; return c; } -/// Encodes a character and appends it to str. +/// Encodes c and appends it to str. void encode(ref char[] str, dchar c) { assert(isValidChar(c), "check if character is valid before calling encode()."); @@ -199,7 +202,7 @@ assert(0); } -/// Encodes a character and appends it to str. +/// Encodes c and appends it to str. void encode(ref wchar[] str, dchar c) in { assert(isValidChar(c)); } body @@ -218,11 +221,11 @@ } } -/// Returns a decoded character from a UTF-16 sequence. -/// Returns: ERROR_CHAR in case of an error in the sequence. +/// Decodes a character from a UTF-16 sequence. /// Params: /// str = the UTF-16 sequence. /// index = where to start from. +/// Returns: ERROR_CHAR in case of an error in the sequence. dchar decode(wchar[] str, ref size_t index) { assert(str.length && index < str.length); @@ -248,11 +251,11 @@ return ERROR_CHAR; } -/// Returns a decoded character from a UTF-16 sequence. 
-/// Returns: ERROR_CHAR in case of an error in the sequence. +/// Decodes a character from a UTF-16 sequence. /// Params: /// p = start of the UTF-16 sequence. /// end = one past the end of the sequence. +/// Returns: ERROR_CHAR in case of an error in the sequence. dchar decode(ref wchar* p, wchar* end) { assert(p && p < end); @@ -276,7 +279,10 @@ return ERROR_CHAR; } -/// Decode a character from a zero-terminated string. +/// Decodes a character from a zero-terminated UTF-16 string. +/// Params: +/// p = start of the UTF-16 sequence. +/// Returns: ERROR_CHAR in case of an error in the sequence. dchar decode(ref wchar* p) { assert(p); @@ -299,3 +305,41 @@ } return ERROR_CHAR; } + +/// Converts a UTF-8 string to a UTF-16 string. +wchar[] toUTF16(char[] str) +{ + wchar[] result; + size_t idx; + while (idx < str.length) + { + auto c = decode(str, idx); + if (c == ERROR_CHAR) + { // Skip trail bytes. + while (++idx < str.length && isTrailByte(str[idx])) + {} + c = REPLACEMENT_CHAR; + } + encode(result, c); + } + return result; +} + +/// Converts a UTF-8 string to a UTF-32 string. +dchar[] toUTF32(char[] str) +{ + dchar[] result; + size_t idx; + while (idx < str.length) + { + auto c = decode(str, idx); + if (c == ERROR_CHAR) + { // Skip trail bytes. 
+ while (++idx < str.length && isTrailByte(str[idx])) + {} + c = REPLACEMENT_CHAR; + } + result ~= c; + } + return result; +} diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/dil/lexer/Lexer.d --- a/trunk/src/dil/lexer/Lexer.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/dil/lexer/Lexer.d Mon Feb 25 02:56:22 2008 +0100 @@ -355,8 +355,9 @@ char[] buffer; do { - c = scanEscapeSequence(); - if (isascii(c)) + bool isBinary; + c = scanEscapeSequence(isBinary); + if (isascii(c) || isBinary) buffer ~= c; else encodeUTF8(buffer, c); @@ -923,8 +924,9 @@ char[] buffer; do { - c = scanEscapeSequence(); - if (isascii(c)) + bool isBinary; + c = scanEscapeSequence(isBinary); + if (isascii(c) || isBinary) buffer ~= c; else encodeUTF8(buffer, c); @@ -1224,11 +1226,13 @@ t.end = p; return; case '\\': - c = scanEscapeSequence(); + bool isBinary; + c = scanEscapeSequence(isBinary); --p; - if (isascii(c)) - break; - encodeUTF8(buffer, c); + if (isascii(c) || isBinary) + buffer ~= c; + else + encodeUTF8(buffer, c); continue; case '\r': if (p[1] == '\n') @@ -1266,7 +1270,8 @@ switch (*p) { case '\\': - t.dchar_ = scanEscapeSequence(); + bool notused; + t.dchar_ = scanEscapeSequence(notused); break; case '\'': error(t.start, MID.EmptyCharacterLiteral); @@ -1708,7 +1713,7 @@ } } // version(D2) - dchar scanEscapeSequence() + dchar scanEscapeSequence(ref bool isBinary) out(result) { assert(isValidChar(result)); } body @@ -1730,7 +1735,10 @@ switch (*p) { case 'x': + isBinary = true; + case_Unicode: assert(c == 0); + assert(digits == 2 || digits == 4 || digits == 8); while (1) { ++p; @@ -1744,31 +1752,34 @@ else c += *p - 'a' + 10; - if (!--digits) + if (--digits == 0) { ++p; if (isValidChar(c)) return c; // Return valid escape value. 
- error(sequenceStart, MID.InvalidUnicodeEscapeSequence, sequenceStart[0..p-sequenceStart]); + error(sequenceStart, MID.InvalidUnicodeEscapeSequence, + sequenceStart[0..p-sequenceStart]); break; } continue; } - error(sequenceStart, MID.InsufficientHexDigits); + error(sequenceStart, MID.InsufficientHexDigits, + sequenceStart[0..p-sequenceStart]); break; } break; case 'u': digits = 4; - goto case 'x'; + goto case_Unicode; case 'U': digits = 8; - goto case 'x'; + goto case_Unicode; default: if (isoctal(*p)) { + isBinary = true; assert(c == 0); c += *p - '0'; ++p; @@ -1782,7 +1793,7 @@ c *= 8; c += *p - '0'; ++p; - return c; // Return valid escape value. + return c & 0xFF; // Return valid escape value. } else if(*p == '&') { @@ -2610,7 +2621,7 @@ assert(!isTrailByte(p[1])); Lerr2: d = REPLACEMENT_CHAR; - error(this.p, MID.InvalidUTF8Sequence); + error(this.p, MID.InvalidUTF8Sequence, formatBytes(this.p, p)); } this.p = p; @@ -2668,6 +2679,39 @@ else assert(0); } + + /// Formats the bytes between start and end. + /// Returns: e.g.: abc -> \x61\x62\x63 + static char[] formatBytes(char* start, char* end) + { + auto strLen = end-start; + const formatLen = `\xXX`.length; + char[] result = new char[strLen*formatLen]; // Reserve space. + result.length = 0; + foreach (c; cast(ubyte[])start[0..strLen]) + result ~= Format("\\x{:X}", c); + return result; + } + + /// Searches for an invalid UTF-8 sequence in str. + /// Returns: a formatted string of the invalid sequence (e.g. \xC0\x80). + static string findInvalidUTF8Sequence(string str) + { + char* p = str.ptr, end = p + str.length; + while (p < end) + { + if (decode(p, end) == ERROR_CHAR) + { + auto begin = p; + // Skip trail-bytes. 
+ while (++p < end && isTrailByte(*p)) + {} + return Lexer.formatBytes(begin, p); + } + } + assert(p == end); + return ""; + } } unittest diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/dil/parser/Parser.d --- a/trunk/src/dil/parser/Parser.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/dil/parser/Parser.d Mon Feb 25 02:56:22 2008 +0100 @@ -17,12 +17,11 @@ import dil.Enums; import dil.CompilerInfo; import dil.SourceText; +import dil.Unicode; import common; -/++ - The Parser produces a full parse tree by examining - the list of tokens provided by the Lexer. -+/ +/// The Parser produces a full parse tree by examining +/// the list of tokens provided by the Lexer. class Parser { Lexer lexer; /// Used to lex the source code. @@ -3160,20 +3159,29 @@ nT(); while (token.kind == T.String) { - if (postfix == '\0') + /+if (postfix == 0) postfix = token.pf; - else if (token.pf && token.pf != postfix) + else+/ + if (token.pf && token.pf != postfix) error(token, MSG.StringPostfixMismatch); - str.length = str.length - 1; + str.length = str.length - 1; // Exclude '\0'. str ~= token.str; nT(); } switch (postfix) - { // TODO: convert string - case 'w': e = new StringExpression(/+toUTF16+/(str)); break; - case 'd': e = new StringExpression(/+toUTF32+/(str)); break; + { + case 'w': + if (checkString(begin, str)) + goto default; + e = new StringExpression(dil.Unicode.toUTF16(str)); break; + case 'd': + if (checkString(begin, str)) + goto default; + e = new StringExpression(dil.Unicode.toUTF32(str)); break; case 'c': - default: e = new StringExpression(str); break; + default: + // No checking done to allow for binary data. + e = new StringExpression(str); break; } break; case T.LBracket: @@ -4056,6 +4064,15 @@ return idtok; } + /// Returns true if the string str has an invalid UTF-8 sequence. 
+ bool checkString(Token* begin, string str) + { + auto utf8Seq = Lexer.findInvalidUTF8Sequence(str); + if (utf8Seq.length) + error(begin, MSG.InvalidUTF8SequenceInString, utf8Seq); + return utf8Seq.length != 0; + } + /// Reports an error that has no message ID yet. void error(Token* token, char[] formatMsg, ...) { diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/lang_de.d --- a/trunk/src/lang_de.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/lang_de.d Mon Feb 25 02:56:22 2008 +0100 @@ -8,15 +8,15 @@ string[] messages = [ // Lexer messages: "illegales Zeichen gefunden: '{0}'", - "ungültiges Unicodezeichen.", - "ungültige UTF-8-Sequenz.", +// "ungültiges Unicodezeichen.", + "ungültige UTF-8-Sequenz: '{0}'", // '' "unterminiertes Zeichenliteral.", "leeres Zeichenliteral.", // #line "erwartete 'line' nach '#'.", "Ganzzahl nach #line erwartet.", - `erwartete Dateispezifikation (z.B. "pfad\zur\datei".)`, +// `erwartete Dateispezifikation (z.B. "pfad\zur\datei".)`, "unterminierte Dateispezifikation (filespec.)", "ein Special Token muss mit einem Zeilenumbruch abgeschlossen werden.", // "" @@ -34,7 +34,7 @@ // \x \u \U "undefinierte Escapesequenz '{0}' gefunden.", "ungültige Unicode-Escapesequenz '{0}' gefunden.", - "unzureichende Anzahl von Hexziffern in Escapesequenz.", + "unzureichende Anzahl von Hexziffern in Escapesequenz: '{0}'", // \&[a-zA-Z][a-zA-Z0-9]+; "undefinierte HTML-Entität '{0}'", "unterminierte HTML-Entität '{0}'.", @@ -66,7 +66,7 @@ // Help messages: `dil v{0} -Copyright (c) 2007, Aziz Köksal. Lizensiert unter der GPL3. +Copyright (c) 2007-2008, Aziz Köksal. Lizensiert unter der GPL3. 
Befehle: {1} @@ -88,4 +88,4 @@ dil gen Parser.d --html --syntax > Parser.html`, ``, -]; \ No newline at end of file +]; diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/lang_en.d --- a/trunk/src/lang_en.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/lang_en.d Mon Feb 25 02:56:22 2008 +0100 @@ -8,15 +8,15 @@ string[] messages = [ // Lexer messages: "illegal character found: '{0}'", - "invalid Unicode character.", - "invalid UTF-8 sequence.", +// "invalid Unicode character.", + "invalid UTF-8 sequence: '{0}'", // '' "unterminated character literal.", "empty character literal.", // #line "expected 'line' after '#'.", "integer expected after #line", - `expected filespec string (e.g. "path\to\file".)`, +// `expected filespec string (e.g. "path\to\file".)`, "unterminated filespec string.", "expected a terminating newline after special token.", // "" @@ -34,7 +34,7 @@ // \x \u \U "found undefined escape sequence '{0}'.", "found invalid Unicode escape sequence '{0}'.", - "insufficient number of hex digits in escape sequence.", + "insufficient number of hex digits in escape sequence: '{0}'", // \&[a-zA-Z][a-zA-Z0-9]+; "undefined HTML entity '{0}'", "unterminated HTML entity '{0}'.", @@ -66,7 +66,7 @@ // Help messages: `dil v{0} -Copyright (c) 2007 by Aziz Köksal. Licensed under the GPL3. +Copyright (c) 2007-2008 by Aziz Köksal. Licensed under the GPL3. 
Subcommands: {1} @@ -115,4 +115,4 @@ Example: dil igraph src/main.d`, -]; \ No newline at end of file +]; diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/lang_fi.d --- a/trunk/src/lang_fi.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/lang_fi.d Mon Feb 25 02:56:22 2008 +0100 @@ -8,15 +8,15 @@ string[] messages = [ // Lexer messages: "virheellinen merkki: '{0}'", - "virheellinen Unicode-merkki.", - "virheellinen UTF-8-merkkijono.", +// "virheellinen Unicode-merkki.", + "virheellinen UTF-8-merkkijono: '{0}'", // '' "päättämätön merkkiliteraali.", "tyhjä merkkiliteraali.", // #line "odotettiin rivinumeroa '#':n jälkeen.", "odotettiin kokonaislukua #line:n jälkeen", - `odotettiin tiedostomäärittelyn merkkijonoa (esim. "polku\tiedostoon")`, +// `odotettiin tiedostomäärittelyn merkkijonoa (esim. "polku\tiedostoon")`, "päättämätön tiedostomäärittely.", "odotettiin päättävää rivinvaihtoa erikoismerkin jälkeen.", // "" @@ -34,7 +34,7 @@ // \x \u \U "määrittelemätön escape-sekvenssi {0}.", "virheellinen Unicode escape-merkki '{0}'.", - "riittämätön määrä heksanumeroita escape-sekvenssissä.", + "riittämätön määrä heksanumeroita escape-sekvenssissä: '{0}'", // \&[a-zA-Z][a-zA-Z0-9]+; "määrittelemätön HTML-entiteetti '{0}'", "päättämätön HTML-entiteetti {0}.", @@ -66,7 +66,7 @@ // Help messages: `dil v{0} -Copyright (c) 2007, Aziz Köksal. GPL3-lisensöity. +Copyright (c) 2007-2008, Aziz Köksal. GPL3-lisensöity. 
Alikomennot: {1} diff -r 139c9a6a39a8 -r c1d5cfd7aa44 trunk/src/lang_tr.d --- a/trunk/src/lang_tr.d Sun Feb 24 03:19:02 2008 +0100 +++ b/trunk/src/lang_tr.d Mon Feb 25 02:56:22 2008 +0100 @@ -8,15 +8,15 @@ string[] messages = [ // Lexer messages: "illegal karakter bulundu: '{0}'", - "geçersiz Unikod karakteri.", - "geçersiz UTF-8 serisi.", +// "geçersiz Unikod karakteri.", + "geçersiz UTF-8 serisi: '{0}'", // '' "kapanmamış karakter sabiti.", "boş karakter sabiti.", // #line "'#' karakter'den sonra 'line' beklendi.", "'#line''den sonra rakam beklendi.", - `filespec dizgisi beklendi (e.g. "yol\dosya".)`, +// `filespec dizgisi beklendi (e.g. "yol\dosya".)`, "kapanmamış filespec dizgisi.", "özel belirtici'den (special token) sonra yeni bir satır beklendi.", // "" @@ -34,7 +34,7 @@ // \x \u \U "tanımlanmamış çıkış serisi '{0}' bulundu.", "geçersiz Unikod çıkış serisi '{0}' bulundu.", - "heksadesimal çıkış serisi sayıları yeterli değil.", + "heksadesimal çıkış serisi sayıları yeterli değil: '{0}'", // \&[a-zA-Z][a-zA-Z0-9]+; "tanımlanmamış HTML varlık '{0}'", "kapanmamış HTML varlık '{0}'.", @@ -66,7 +66,7 @@ // Help messages: `dil v{0} -Copyright (c) 2007, Aziz Köksal. Lisans GPL3. +Copyright (c) 2007-2008, Aziz Köksal. Lisans GPL3. Komutlar: {1} @@ -87,4 +87,4 @@ dil gen Parser.d --html --syntax > Parser.html`, ``, -]; \ No newline at end of file +];