Mercurial > projects > dil
changeset 48:c2e0e0269c28
- Added code for scanning escape string literals.
- Added a custom UTF-8 encoding function. It issues an error when given an invalid Unicode character.
- Fixed the scanner for hex escape sequences. The digit conversion was faulty, and p wasn't incremented after the last hex digit had been scanned.
- Relocated the if-else construct into the default case.
author | aziz |
---|---|
date | Wed, 27 Jun 2007 10:22:03 +0000 |
parents | 8aa37a78937b |
children | 7f0fa15dcffc |
files | trunk/src/Lexer.d |
diffstat | 1 files changed, 115 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/trunk/src/Lexer.d Wed Jun 27 07:57:05 2007 +0000 +++ b/trunk/src/Lexer.d Wed Jun 27 10:22:03 2007 +0000 @@ -110,6 +110,8 @@ /// Index into table of error messages. enum MID { + InvalidUnicodeCharacter, + // '' UnterminatedCharacterLiteral, EmptyCharacterLiteral, // #line @@ -137,6 +139,8 @@ } string[] messages = [ + "invalid Unicode character.", + // '' "unterminated character literal.", "empty character literal.", // #line @@ -413,17 +417,30 @@ } } - if (c == '\'') - return scanCharacterLiteral(t); - - if (c == '`') - return scanRawStringLiteral(t); - - if (c == '"') - return scanNormalStringLiteral(t); - switch (c) { + case '\'': + return scanCharacterLiteral(t); + case '`': + return scanRawStringLiteral(t); + case '"': + return scanNormalStringLiteral(t); + case '\\': + char[] buffer; + do + { + ++p; + c = scanEscapeSequence(); + if (c < 128) + buffer ~= c; + else + encodeUTF8(buffer, c); + } while (*p == '\\') + buffer ~= 0; + t.type = TOK.String; + t.str = buffer; + t.end = p; + return; case '>': /* > >= >> >>= >>> >>>= */ c = *++p; switch (c) @@ -702,7 +719,7 @@ if (t.dchar_ < 128) buffer ~= t.dchar_; else - encode(buffer, t.dchar_); + encodeUTF8(buffer, t.dchar_); continue; case '\r': if (p[1] == '\n') @@ -932,13 +949,15 @@ { c *= 16; if (*p <= '9') - c = *p - '0'; + c += *p - '0'; else if (*p <= 'F') - c = *p - 'A' - 10; + c += *p - 'A' + 10; else - c = *p - 'a' - 10; - if (!--digits) + c += *p - 'a' + 10; + if (!--digits) { + ++p; break; + } } else { @@ -954,47 +973,47 @@ digits = 8; goto case 'x'; default: - } - if (isoctal(*p)) - { - c = 0; - c += *p - '0'; - ++p; - if (!isoctal(*p)) - return c; - c *= 8; - c += *p - '0'; - ++p; - if (!isoctal(*p)) - return c; - c *= 8; - c += *p - '0'; - ++p; - } - else if(*p == '&') - { - if (isalpha(*++p)) + if (isoctal(*p)) { - while (1) + c = 0; + c += *p - '0'; + ++p; + if (!isoctal(*p)) + return c; + c *= 8; + c += *p - '0'; + ++p; + if (!isoctal(*p)) + return c; + c *= 8; + c += *p - '0'; + ++p; + } + else 
if(*p == '&') + { + if (isalpha(*++p)) { - if (isalnum(*++p)) - continue; - if (*p == ';') { - // TODO: convert entity to unicode codepoint. - ++p; - break; - } - else { - error(MID.UnterminatedHTMLEntity); - break; + while (1) + { + if (isalnum(*++p)) + continue; + if (*p == ';') { + // TODO: convert entity to unicode codepoint. + ++p; + break; + } + else { + error(MID.UnterminatedHTMLEntity); + break; + } } } + else + error(MID.InvalidBeginHTMLEntity); } else - error(MID.InvalidBeginHTMLEntity); + error(MID.UndefinedEscapeSequence); } - else - error(MID.UndefinedEscapeSequence); return c; } @@ -1116,6 +1135,54 @@ tokens ~= this.token; return tokens; } + + private void encodeUTF8(inout char[] str, dchar d) + { + char[6] b; + assert(d > 0x7F, "check for ASCII char before calling encodeUTF8()."); + if (d < 0x800) + { + b[0] = 0xC0 | (d >> 6); + b[1] = 0x80 | (d & 0x3F); + str ~= b[0..2]; + } + else if (d < 0x10000) + { + b[0] = 0xE0 | (d >> 12); + b[1] = 0x80 | ((d >> 6) & 0x3F); + b[2] = 0x80 | (d & 0x3F); + str ~= b[0..3]; + } + else if (d < 0x200000) + { + b[0] = 0xF0 | (d >> 18); + b[1] = 0x80 | ((d >> 12) & 0x3F); + b[2] = 0x80 | ((d >> 6) & 0x3F); + b[3] = 0x80 | (d & 0x3F); + str ~= b[0..4]; + } + else if (d < 0x4000000) + { + b[0] = 0xF8 | (d >> 24); + b[1] = 0x80 | ((d >> 18) & 0x3F); + b[2] = 0x80 | ((d >> 12) & 0x3F); + b[3] = 0x80 | ((d >> 6) & 0x3F); + b[4] = 0x80 | (d & 0x3F); + str ~= b[0..5]; + } + else if (d < 0x80000000) + { + b[0] = 0xFC | (d >> 30); + b[1] = 0x80 | ((d >> 24) & 0x3F); + b[2] = 0x80 | ((d >> 18) & 0x3F); + b[3] = 0x80 | ((d >> 12) & 0x3F); + b[4] = 0x80 | ((d >> 6) & 0x3F); + b[5] = 0x80 | (d & 0x3F); + str ~= b[0..6]; + } + else + error(MID.InvalidUnicodeCharacter); + } } unittest