# HG changeset patch # User Aziz Köksal # Date 1191171082 -7200 # Node ID bb3cb00feeb23454cc9e4de3a4c625c57b8b3c40 # Parent 6057113f9a44a7216bc31152867f9d9517bb90f0 Applied some fixes to class Lexer. The method encodeUTF8() doesn't report errors anymore. Any character passed to it must be encodable as a UTF-8 sequence. Moved decodeUTF8() down a few places. Added method isEncodable(). Fix: the return value of scanEscapeSequence() wasn't tested correctly before calling encodeUTF8(). Simplified test for UTF-8 trail byte in Location.calculateColumn(). diff -r 6057113f9a44 -r bb3cb00feeb2 trunk/src/dil/Information.d --- a/trunk/src/dil/Information.d Sun Sep 30 16:59:32 2007 +0200 +++ b/trunk/src/dil/Information.d Sun Sep 30 18:51:22 2007 +0200 @@ -127,7 +127,7 @@ ); // Skip this byte if it is a trail byte of a UTF-8 sequence. - if (*p & 0x80 && !(*p & 0x40)) + if ((*p & 0xC0) == 0x80) continue; // *p == 0b10xx_xxxx // Only count ASCII characters and the first byte of a UTF-8 sequence. ++col; diff -r 6057113f9a44 -r bb3cb00feeb2 trunk/src/dil/Lexer.d --- a/trunk/src/dil/Lexer.d Sun Sep 30 16:59:32 2007 +0200 +++ b/trunk/src/dil/Lexer.d Sun Sep 30 18:51:22 2007 +0200 @@ -1256,10 +1256,9 @@ case '\\': c = scanEscapeSequence(); --p; - if (c & 128) - encodeUTF8(buffer, c); - else + if (c < 128) break; + encodeUTF8(buffer, c); continue; case '\r': if (p[1] == '\n') @@ -1284,7 +1283,7 @@ continue; } } - // Copy ASCII character. + assert(isascii(c)); buffer ~= c; } assert(0); @@ -1407,7 +1406,8 @@ continue; } } - buffer ~= c; // copy character to buffer + assert(isascii(c)); + buffer ~= c; } assert(0); } @@ -1657,7 +1657,8 @@ } } } - buffer ~= c; // copy character to buffer + assert(isascii(c)); + buffer ~= c; } Lreturn: // Character delimiter. assert(c == closing_delim); @@ -1826,9 +1827,11 @@ return c; } } - // TODO: when c is encoded again by encodeUTF8() the same error is reported twice. 
- if (!isValidDchar(c)) + if (!isEncodable(c)) + { + c = 0; error(sequenceStart, MID.InvalidUnicodeCharacter); + } return c; case 'u': digits = 4; @@ -1886,12 +1889,11 @@ } else { - dchar d = *p; char[] str = `\`; - if (d & 128) + if (*p & 128) encodeUTF8(str, decodeUTF8()); else - str ~= d; + str ~= *p; ++p; // TODO: check for unprintable character? error(sequenceStart, MID.UndefinedEscapeSequence, str); @@ -2455,27 +2457,6 @@ error(errorAtColumn, mid); } - dchar decodeUTF8() - { - assert(*p & 128, "check for ASCII char before calling decodeUTF8()."); - size_t idx; - dchar d; - try - { - d = std.utf.decode(p[0 .. end-p], idx); - p += idx -1; - } - catch (UtfException e) - { - error(p, MID.InvalidUTF8Sequence); - // Move to next valid UTF-8 sequence or ASCII character. - while (++p < end && *p & 0xC0 == 0x80) {} - assert(p < end); - --p; - } - return d; - } - /+ Insert an empty dummy token before t. Useful in the parsing phase for representing a node in the AST @@ -2574,10 +2555,41 @@ return !(ident in reserved_ids_table); } - private void encodeUTF8(inout char[] str, dchar d) + /+ + Returns true if d can be encoded as a UTF-8 sequence. + +/ + bool isEncodable(dchar d) + { + return d < 0xD800 || + (d > 0xDFFF && d <= 0x10FFFF && d != 0xFFFF && d != 0xFFFE); + } + + dchar decodeUTF8() + { + assert(!isascii(*p), "check for ASCII char before calling decodeUTF8()."); + size_t idx; + dchar d; + try + { + d = std.utf.decode(p[0 .. end-p], idx); + p += idx -1; + } + catch (UtfException e) + { + error(p, MID.InvalidUTF8Sequence); + // Move to next valid UTF-8 sequence or ASCII character. 
+ while (++p < end && (*p & 0xC0) == 0x80) {} + assert(p < end); + --p; + } + return d; + } + + private void encodeUTF8(ref char[] str, dchar d) { char[6] b; - assert(d > 0x7F, "check for ASCII char before calling encodeUTF8()."); + assert(!isascii(d), "check for ASCII char before calling encodeUTF8()."); + assert(isEncodable(d), "check that 'd' is encodable before calling encodeUTF8()."); if (d < 0x800) { b[0] = 0xC0 | (d >> 6); @@ -2599,6 +2611,7 @@ b[3] = 0x80 | (d & 0x3F); str ~= b[0..4]; } + /+ // There are no 5 and 6 byte UTF-8 sequences yet. else if (d < 0x4000000) { b[0] = 0xF8 | (d >> 24); @@ -2618,8 +2631,9 @@ b[5] = 0x80 | (d & 0x3F); str ~= b[0..6]; } -// else -// error(MID.InvalidUnicodeCharacter); + +/ + else + assert(0); } } @@ -2761,6 +2775,7 @@ int isident(char c) { return ptable[c] & (CP.Alpha | CP.Underscore | CP.Digit); } int isspace(char c) { return ptable[c] & CP.Whitespace; } int char2ev(char c) { return ptable[c] >> 8; /*(ptable[c] & EVMask) >> 8;*/ } +int isascii(uint c) { return c < 128; } version(gen_ptable) static this() diff -r 6057113f9a44 -r bb3cb00feeb2 trunk/src/dil/Parser.d --- a/trunk/src/dil/Parser.d Sun Sep 30 16:59:32 2007 +0200 +++ b/trunk/src/dil/Parser.d Sun Sep 30 18:51:22 2007 +0200 @@ -327,7 +327,6 @@ decl = new IllegalDeclaration(token); nT(); } -// writef("§%s§", decl.classinfo.name); set(decl, begin); return decl; } @@ -1573,7 +1572,6 @@ Statement parseStatement() { -// writefln("°parseStatement:(%d)token='%s'°", lx.loc, token.srcText); auto begin = token; Statement s; Declaration d; @@ -3006,8 +3004,6 @@ e = new CommaExpression(e, parseAssignExpression(), comma); set(e, begin); } -// if (!trying) -// writef("§%s§", e.classinfo.name); return e; }