# HG changeset patch # User Aziz K?ksal # Date 1197567929 -3600 # Node ID dd3ce87b35690d06397e0b11cca5b4967587a50a # Parent baa7c4c0be7873290f7c22aa753a8fea1eee3d00 Added module dil.Unicode. Moved some functions from dil.Lexer to dil.Unicode. Added isIdentifierString() to dil.Lexer. Renamed isNonReservedIdentifier() to isReservedIdentifier(). diff -r baa7c4c0be78 -r dd3ce87b3569 trunk/src/dil/Lexer.d --- a/trunk/src/dil/Lexer.d Wed Dec 12 22:17:20 2007 +0200 +++ b/trunk/src/dil/Lexer.d Thu Dec 13 18:45:29 2007 +0100 @@ -11,19 +11,15 @@ import dil.HtmlEntities; import dil.CompilerInfo; import dil.IdTable; +import dil.Unicode; import tango.stdc.stdlib : strtof, strtod, strtold; import tango.stdc.errno : errno, ERANGE; import tango.stdc.time : time_t, time, ctime; import tango.stdc.string : strlen; -import std.utf; -import std.uni; import common; public import dil.LexerFuncs; -/// U+FFFD = �. Used to replace invalid Unicode characters. -const dchar REPLACEMENT_CHAR = '\uFFFD'; - /++ The Lexer analyzes the characters of a source text and produces a doubly-linked list of tokens. @@ -1698,7 +1694,7 @@ dchar scanEscapeSequence() out(result) - { assert(isEncodable(result)); } + { assert(isValidChar(result)); } body { assert(*p == '\\'); @@ -1735,7 +1731,7 @@ if (!--digits) { ++p; - if (isEncodable(c)) + if (isValidChar(c)) return c; // Return valid escape value. error(sequenceStart, MID.InvalidUnicodeEscapeSequence, sequenceStart[0..p-sequenceStart]); @@ -2418,80 +2414,35 @@ table[k.str] = k; } - static bool isNonReservedIdentifier(char[] ident) + /// Returns true if str is a valid D identifier. + static bool isIdentifierString(char[] str) { - if (ident.length == 0) + if (str.length == 0 || isdigit(str[0])) + return false; + size_t idx; + do + { + auto c = dil.Unicode.decode(str, idx); + if (c == ERROR_CHAR || !(isident(c) || !isascii(c) && isUniAlpha(c))) + return false; + } while (idx < str.length) + return true; + } + + /// Returns true if str is a keyword or a special token (__FILE__, __LINE__ etc.) + static bool isReservedIdentifier(char[] str) + { + if (str.length == 0) return false; static Identifier[string] reserved_ids_table; if (reserved_ids_table is null) Lexer.loadKeywords(reserved_ids_table); - size_t idx = 1; // Index to the 2nd character in ident. - dchar isFirstCharUniAlpha() - { - idx = 0; - // NB: decode() could throw an Exception which would be - // caught by the next try-catch-block. - return isUniAlpha(std.utf.decode(ident, idx)); - } - - try - { - if (isidbeg(ident[0]) || !isascii(ident[0]) && isFirstCharUniAlpha()) - { - foreach (dchar c; ident[idx..$]) - if (!isident(c) && !isUniAlpha(c)) - return false; - } - } - catch (Exception) + if (!isIdentifierString(str)) return false; - return !(ident in reserved_ids_table); - } - - /++ - Returns true if d can be encoded as a UTF-8 sequence. - +/ - bool isEncodable(dchar d) - { - return d < 0xD800 || - (d > 0xDFFF && d <= 0x10FFFF); - } - - /++ - There are a total of 66 noncharacters. - Returns true if this is one of them. - See_also: Chapter 16.7 Noncharacters in Unicode 5.0 - +/ - bool isNoncharacter(dchar d) - { - return 0xFDD0 <= d && d <= 0xFDEF || // 32 - d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE; // 34 - } - - /++ - Returns true if this character is not a noncharacter, not a surrogate - code point and not higher than 0x10FFFF. - +/ - bool isValidDecodedChar(dchar d) - { - return d < 0xD800 || - (d > 0xDFFF && d < 0xFDD0) || - (d > 0xFDEF && d <= 0x10FFFF && (d & 0xFFFF) < 0xFFFE); - } - - /// Is this a trail byte of a UTF-8 sequence? - bool isTrailByte(ubyte b) - { - return (b & 0xC0) == 0x80; // 10xx_xxxx - } - - /// Is this a lead byte of a UTF-8 sequence? - bool isLeadByte(ubyte b) - { - return (b & 0xC0) == 0xC0; // 11xx_xxxx + return (str in reserved_ids_table) !is null; } dchar decodeUTF8() @@ -2553,7 +2504,7 @@ assert(isTrailByte(*p)); - if (!isEncodable(d)) + if (!isValidChar(d)) { Lerr: // Three cases: @@ -2582,9 +2533,9 @@ private void encodeUTF8(ref char[] str, dchar d) { - char[6] b; + char[6] b = void; assert(!isascii(d), "check for ASCII char before calling encodeUTF8()."); - assert(isEncodable(d), "check that 'd' is encodable before calling encodeUTF8()."); + assert(isValidChar(d), "check if character is valid before calling encodeUTF8()."); if (d < 0x800) { diff -r baa7c4c0be78 -r dd3ce87b3569 trunk/src/dil/Module.d --- a/trunk/src/dil/Module.d Wed Dec 12 22:17:20 2007 +0200 +++ b/trunk/src/dil/Module.d Thu Dec 13 18:45:29 2007 +0100 @@ -58,7 +58,7 @@ { // Take base name of file path as module name. auto str = (new FilePath(filePath)).name(); - if (Lexer.isNonReservedIdentifier(str)) + if (!Lexer.isReservedIdentifier(str)) { this.moduleFQN = moduleName = str; } diff -r baa7c4c0be78 -r dd3ce87b3569 trunk/src/dil/Unicode.d --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trunk/src/dil/Unicode.d Thu Dec 13 18:45:29 2007 +0100 @@ -0,0 +1,233 @@ +/++ + Author: Aziz Köksal + License: GPL3 ++/ +module dil.Unicode; +public import std.uni : isUniAlpha; + +/// U+FFFD = �. Used to replace invalid Unicode characters. +const dchar REPLACEMENT_CHAR = '\uFFFD'; +/// Invalid character, returned on errors. +const dchar ERROR_CHAR = 0xD800; + +/++ + Returns true if this character is not a surrogate + code point and not higher than 0x10FFFF. ++/ +bool isValidChar(dchar d) +{ + return d < 0xD800 || d > 0xDFFF && d <= 0x10FFFF; +} + +/++ + Returns true if this is one of the + There are a total of 66 noncharacters. + See_also: Chapter 16.7 Noncharacters in Unicode 5.0 ++/ +bool isNoncharacter(dchar d) +{ + return 0xFDD0 <= d && d <= 0xFDEF || // 32 + d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE; // 34 +} + +/// Returns true if this is a trail byte of a UTF-8 sequence? +bool isTrailByte(ubyte b) +{ + return (b & 0xC0) == 0x80; // 10xx_xxxx +} + +/// Returns true if this is a lead byte of a UTF-8 sequence. +bool isLeadByte(ubyte b) +{ + return (b & 0xC0) == 0xC0; // 11xx_xxxx +} + +dchar decode(char[] str, ref size_t index) +in { assert(str.length); } +out(c) { assert(isValidChar(c)); } +body +{ + char* p = str.ptr + index; + char* end = str.ptr + str.length; + dchar c = *p; + + if (!(p < end)) + return ERROR_CHAR; + + if (c < 0x80) + { + ++index; + return c; + } + + ++p; // Move to second byte. + if (!(p < end)) + return ERROR_CHAR; + + // Error if second byte is not a trail byte. + if (!isTrailByte(*p)) + return ERROR_CHAR; + + // Check for overlong sequences. + switch (c) + { + case 0xE0, // 11100000 100xxxxx + 0xF0, // 11110000 1000xxxx + 0xF8, // 11111000 10000xxx + 0xFC: // 11111100 100000xx + if ((*p & c) == 0x80) + return ERROR_CHAR; + default: + if ((c & 0xFE) == 0xC0) // 1100000x + return ERROR_CHAR; + } + + const char[] checkNextByte = "if (++p < end && !isTrailByte(*p))" + " return ERROR_CHAR;"; + const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;"; + + auto next_index = index; + // Decode + if ((c & 0b1110_0000) == 0b1100_0000) + { + // 110xxxxx 10xxxxxx + c &= 0b0001_1111; + mixin(appendSixBits); + next_index += 2; + } + else if ((c & 0b1111_0000) == 0b1110_0000) + { + // 1110xxxx 10xxxxxx 10xxxxxx + c &= 0b0000_1111; + mixin(appendSixBits ~ + checkNextByte ~ appendSixBits); + next_index += 3; + } + else if ((c & 0b1111_1000) == 0b1111_0000) + { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + c &= 0b0000_0111; + mixin(appendSixBits ~ + checkNextByte ~ appendSixBits ~ + checkNextByte ~ appendSixBits); + next_index += 4; + } + else + // 5 and 6 byte UTF-8 sequences are not allowed yet. + // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + return ERROR_CHAR; + + assert(isTrailByte(*p)); + + if (!isValidChar(c)) + return ERROR_CHAR; + index = next_index; + return c; +} + +/// Encodes a character and appends it to str. +void encode(ref wchar[] str, dchar c) +in { assert(isValidChar(c)); } +body +{ + if (c < 0x10000) + str ~= cast(wchar)c; + else + { + // Encode with surrogate pair. + wchar[2] pair = void; + c -= 0x10000; // c' + // higher10bits(c') | 0b1101_10xx_xxxx_xxxx + pair[0] = (c >> 10) | 0xD800; + // lower10bits(c') | 0b1101_11yy_yyyy_yyyy + pair[1] = (c & 0x3FF) | 0xDC00; + str ~= pair; + } +} + +/++ + Returns a decoded character from a UTF-16 sequence. + In case of an error in the sequence 0xD800 is returned. + Params: + str = the UTF-16 sequence. + index = where to start from. ++/ +dchar decode(wchar[] str, ref size_t index) +{ + assert(str.length && index < str.length); + dchar c = str[index]; + if (0xD800 > c || c > 0xDFFF) + { + ++index; + return c; + } + if (c <= 0xDBFF && index+1 != str.length) + { + wchar c2 = str[index+1]; + if (0xDC00 <= c2 && c2 <= 0xDFFF) + { + // (c - 0xD800) << 10 + 0x10000 -> + // (c - 0xD800 + 0x40) << 10 -> + c = (c - 0xD7C0) << 10; + c |= (c2 & 0x3FF); + index += 2; + return c; + } + } + return ERROR_CHAR; +} + +/++ + Returns a decoded character from a UTF-16 sequence. + In case of an error in the sequence 0xD800 is returned. + Params: + p = start of the UTF-16 sequence. + end = one past the end of the sequence. ++/ +dchar decode(ref wchar* p, wchar* end) +{ + assert(p && p < end); + dchar c = *p; + if (0xD800 > c || c > 0xDFFF) + { + ++p; + return c; + } + if (c <= 0xDBFF && p+1 != end) + { + wchar c2 = p[1]; + if (0xDC00 <= c2 && c2 <= 0xDFFF) + { + c = (c - 0xD7C0) << 10; + c |= (c2 & 0x3FF); + p += 2; + return c; + } + } + return ERROR_CHAR; +} + +/// Decode a character from a zero-terminated string. +dchar decode(ref wchar* p) +{ + assert(p); + dchar c = *p; + if (0xD800 > c || c > 0xDFFF) + { + ++p; + return c; + } + if (c <= 0xDBFF) + { + wchar c2 = p[1]; + if (0xDC00 <= c2 && c2 <= 0xDFFF) + { + c = (c - 0xD7C0) << 10; + c |= (c2 & 0x3FF); + p += 2; + return c; + } + } + return ERROR_CHAR; +}