# HG changeset patch # User aziz # Date 1183587541 0 # Node ID fc645fb2fe722fbdbf75973d20634708ab1d093d # Parent aa1ea2548dd9cd5af0275be774a803aeffd8999a - scanEscapeSequence() doesn't return 0xFFFF as an error value anymore, because it is a valid codepoint usable by the user. - Added CharacterLiteral tokens with utf-16/32 versions to distinguish between different character literals. - Checking for valid Unicode codepoint when finished lexing hex escape sequences. diff -r aa1ea2548dd9 -r fc645fb2fe72 trunk/src/Lexer.d --- a/trunk/src/Lexer.d Tue Jul 03 11:03:02 2007 +0000 +++ b/trunk/src/Lexer.d Wed Jul 04 22:19:01 2007 +0000 @@ -268,8 +268,6 @@ { ++p; c = scanEscapeSequence(); - if (c == 0xFFFF) - break; if (c < 128) buffer ~= c; else @@ -564,8 +562,6 @@ case '\\': ++p; dchar d = scanEscapeSequence(); - if (d == 0xFFFF) - continue; if (d < 128) buffer ~= d; else @@ -585,16 +581,17 @@ default: if (*p & 128) { - char* begin = p; +// char* begin = p; dchar d = decodeUTF8(); + if (d == LSd || d == PSd) goto case '\n'; - if (d != 0xFFFF) - { - ++p; - buffer ~= begin[0 .. 
p - begin]; + encodeUTF8(buffer, d); continue; } buffer ~= *p++; @@ -608,10 +605,19 @@ assert(*p == '\''); MID id = MID.UnterminatedCharacterLiteral; ++p; + TOK type = TOK.CharLiteral; switch (*p) { case '\\': ++p; + switch (*p) + { + case 'u': + type = TOK.WCharLiteral; break; + case 'U': + type = TOK.DCharLiteral; break; + default: + } t.dchar_ = scanEscapeSequence(); break; case '\'': @@ -626,6 +632,10 @@ c = decodeUTF8(); if (c == LSd || c == PSd) goto Lerr; + if (c <= 0xFFFF) + type = TOK.WCharLiteral; + else + type = TOK.DCharLiteral; } t.dchar_ = c; ++p; @@ -634,9 +644,9 @@ if (*p == '\'') ++p; else - Lerr: + Lerr: error(id); - t.type = TOK.Character; + t.type = type; t.end = p; } @@ -786,11 +796,11 @@ dchar scanEscapeSequence() { uint c = char2ev(*p); - if (c) { + if (c) + { ++p; return c; } - c = 0xFFFF; uint digits = 2; switch (*p) @@ -809,7 +819,9 @@ c += *p - 'A' + 10; else c += *p - 'a' + 10; - if (!--digits) { + + if (!--digits) + { ++p; break; } @@ -817,10 +829,11 @@ else { error(MID.InsufficientHexDigits); - c = 0xFFFF; break; } } + if (!isValidDchar(c)) + error(MID.InvalidUnicodeCharacter); break; case 'u': digits = 4; @@ -1399,7 +1412,7 @@ { assert(*p & 128, "check for ASCII char before calling decodeUTF8()."); size_t idx; - uint d = 0xFFFF; + dchar d; try { d = std.utf.decode(p[0 .. 
end-p], idx); @@ -1410,6 +1423,7 @@ error(MID.InvalidUTF8Sequence); // Skip to next valid utf-8 sequence while (UTF8stride[*++p] != 0xFF) {} + --p; } return d; } diff -r aa1ea2548dd9 -r fc645fb2fe72 trunk/src/Token.d --- a/trunk/src/Token.d Tue Jul 03 11:03:02 2007 +0000 +++ b/trunk/src/Token.d Wed Jul 04 22:19:01 2007 +0000 @@ -15,7 +15,7 @@ Identifier, Comment, String, - Character, + CharLiteral, WCharLiteral, DCharLiteral, // Numbers Number, diff -r aa1ea2548dd9 -r fc645fb2fe72 trunk/src/main.d --- a/trunk/src/main.d Tue Jul 03 11:03:02 2007 +0000 +++ b/trunk/src/main.d Wed Jul 04 22:19:01 2007 +0000 @@ -65,7 +65,7 @@ case TOK.String: writef("%s", srcText); break; - case TOK.Character: + case TOK.CharLiteral, TOK.WCharLiteral, TOK.DCharLiteral: writef("%s", srcText); break; case TOK.Assign, TOK.Equal,