# HG changeset patch
# User aziz
# Date 1183587541 0
# Node ID fc645fb2fe722fbdbf75973d20634708ab1d093d
# Parent aa1ea2548dd9cd5af0275be774a803aeffd8999a
- scanEscapeSequence() no longer returns 0xFFFF as an error value, because 0xFFFF is a valid codepoint that the user may specify.
- Added CharLiteral tokens, with UTF-16/32 versions (WCharLiteral, DCharLiteral), to distinguish between the different kinds of character literals.
- Now checking for a valid Unicode codepoint after lexing hex escape sequences.
diff -r aa1ea2548dd9 -r fc645fb2fe72 trunk/src/Lexer.d
--- a/trunk/src/Lexer.d Tue Jul 03 11:03:02 2007 +0000
+++ b/trunk/src/Lexer.d Wed Jul 04 22:19:01 2007 +0000
@@ -268,8 +268,6 @@
{
++p;
c = scanEscapeSequence();
- if (c == 0xFFFF)
- break;
if (c < 128)
buffer ~= c;
else
@@ -564,8 +562,6 @@
case '\\':
++p;
dchar d = scanEscapeSequence();
- if (d == 0xFFFF)
- continue;
if (d < 128)
buffer ~= d;
else
@@ -585,16 +581,17 @@
default:
if (*p & 128)
{
- char* begin = p;
+// char* begin = p;
dchar d = decodeUTF8();
+
if (d == LSd || d == PSd)
goto case '\n';
- if (d != 0xFFFF)
- {
- ++p;
- buffer ~= begin[0 .. p - begin];
- }
+ // We don't copy per pointer because we might include
+ // invalid, skipped utf-8 sequences. See decodeUTF8().
+// ++p;
+// buffer ~= begin[0 .. p - begin];
+ encodeUTF8(buffer, d);
continue;
}
buffer ~= *p++;
@@ -608,10 +605,19 @@
assert(*p == '\'');
MID id = MID.UnterminatedCharacterLiteral;
++p;
+ TOK type = TOK.CharLiteral;
switch (*p)
{
case '\\':
++p;
+ switch (*p)
+ {
+ case 'u':
+ type = TOK.WCharLiteral; break;
+ case 'U':
+ type = TOK.DCharLiteral; break;
+ default:
+ }
t.dchar_ = scanEscapeSequence();
break;
case '\'':
@@ -626,6 +632,10 @@
c = decodeUTF8();
if (c == LSd || c == PSd)
goto Lerr;
+ if (c <= 0xFFFF)
+ type = TOK.WCharLiteral;
+ else
+ type = TOK.DCharLiteral;
}
t.dchar_ = c;
++p;
@@ -634,9 +644,9 @@
if (*p == '\'')
++p;
else
- Lerr:
+ Lerr:
error(id);
- t.type = TOK.Character;
+ t.type = type;
t.end = p;
}
@@ -786,11 +796,11 @@
dchar scanEscapeSequence()
{
uint c = char2ev(*p);
- if (c) {
+ if (c)
+ {
++p;
return c;
}
- c = 0xFFFF;
uint digits = 2;
switch (*p)
@@ -809,7 +819,9 @@
c += *p - 'A' + 10;
else
c += *p - 'a' + 10;
- if (!--digits) {
+
+ if (!--digits)
+ {
++p;
break;
}
@@ -817,10 +829,11 @@
else
{
error(MID.InsufficientHexDigits);
- c = 0xFFFF;
break;
}
}
+ if (!isValidDchar(c))
+ error(MID.InvalidUnicodeCharacter);
break;
case 'u':
digits = 4;
@@ -1399,7 +1412,7 @@
{
assert(*p & 128, "check for ASCII char before calling decodeUTF8().");
size_t idx;
- uint d = 0xFFFF;
+ dchar d;
try
{
d = std.utf.decode(p[0 .. end-p], idx);
@@ -1410,6 +1423,7 @@
error(MID.InvalidUTF8Sequence);
// Skip to next valid utf-8 sequence
while (UTF8stride[*++p] != 0xFF) {}
+ --p;
}
return d;
}
diff -r aa1ea2548dd9 -r fc645fb2fe72 trunk/src/Token.d
--- a/trunk/src/Token.d Tue Jul 03 11:03:02 2007 +0000
+++ b/trunk/src/Token.d Wed Jul 04 22:19:01 2007 +0000
@@ -15,7 +15,7 @@
Identifier,
Comment,
String,
- Character,
+ CharLiteral, WCharLiteral, DCharLiteral,
// Numbers
Number,
diff -r aa1ea2548dd9 -r fc645fb2fe72 trunk/src/main.d
--- a/trunk/src/main.d Tue Jul 03 11:03:02 2007 +0000
+++ b/trunk/src/main.d Wed Jul 04 22:19:01 2007 +0000
@@ -65,7 +65,7 @@
case TOK.String:
writef("%s", srcText);
break;
- case TOK.Character:
+ case TOK.CharLiteral, TOK.WCharLiteral, TOK.DCharLiteral:
writef("%s", srcText);
break;
case TOK.Assign, TOK.Equal,