Mercurial > projects > dil
changeset 426:3f7790d3f9d6
Improved decodeUTF8().
Removed old decodeUTF8() method.
author | Aziz K?ksal <aziz.koeksal@gmail.com> |
---|---|
date | Mon, 01 Oct 2007 15:38:30 +0200 |
parents | 6bf936bf3356 |
children | e2bbc6406a14 |
files | trunk/src/dil/Lexer.d |
diffstat | 1 files changed, 41 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/trunk/src/dil/Lexer.d Sun Sep 30 23:42:01 2007 +0200 +++ b/trunk/src/dil/Lexer.d Mon Oct 01 15:38:30 2007 +0200 @@ -2564,33 +2564,17 @@ (d > 0xDFFF && d <= 0x10FFFF && d != 0xFFFF && d != 0xFFFE); } + /// Is this a trail byte of a UTF-8 sequence? bool isTrailByte(ubyte b) { - return (b & 0xC0) == 0x80; + return (b & 0xC0) == 0x80; // 10xx_xxxx } -/+ - dchar decodeUTF8() + /// Is this a lead byte of a UTF-8 sequence? + bool isLeadByte(ubyte b) { - assert(!isascii(*p), "check for ASCII char before calling decodeUTF8()."); - size_t idx; - dchar d = 0; - try - { - d = std.utf.decode(p[0 .. end-p], idx); - p += idx -1; - } - catch (UtfException e) - { - error(p, MID.InvalidUTF8Sequence); - // Move to next valid UTF-8 sequence or ASCII character. - while (++p < end && isTrailByte(*p)) {} - assert(p < end); - --p; - } - return d; + return (b & 0xC0) == 0xC0; // 11xx_xxxx } -+/ dchar decodeUTF8() { @@ -2598,54 +2582,50 @@ char* p = this.p; dchar d = *p; + ++p; // Move to second byte. + // Error if second byte is not a trail byte. + if (!isTrailByte(*p)) + goto Lerr2; + // Check for overlong sequences. switch (d) { - // 11100000 100xxxxx - // 11110000 1000xxxx - // 11111000 10000xxx - // 11111100 100000xx - case 0xE0, 0xF0, 0xF8, 0xFC: - if ((p[1] & d) == 0x80) + case 0xE0, // 11100000 100xxxxx + 0xF0, // 11110000 1000xxxx + 0xF8, // 11111000 10000xxx + 0xFC: // 11111100 100000xx + if ((*p & d) == 0x80) goto Lerr; default: if ((d & 0xFE) == 0xC0) // 1100000x goto Lerr; } + const char[] checkNextByte = "if (!isTrailByte(*++p))" + " goto Lerr2;"; + const char[] appendSixBits = "d = (d << 6) | *p & 0b0011_1111;"; + // Decode if ((d & 0b1110_0000) == 0b1100_0000) { // 110xxxxx 10xxxxxx d &= 0b0001_1111; - if (!isTrailByte(*++p)) - goto Lerr2; - d = (d << 6) | *p & 0b0011_1111; + mixin(appendSixBits); } else if ((d & 0b1111_0000) == 0b1110_0000) { // 1110xxxx 10xxxxxx 10xxxxxx d &= 0b0000_1111; - if (!isTrailByte(*++p)) - goto Lerr2; - d = (d << 6) | *p & 0b0011_1111; - if (!isTrailByte(*++p)) - goto Lerr2; - d = (d << 6) | *p & 0b0011_1111; + mixin(appendSixBits ~ + checkNextByte ~ appendSixBits); } else if ((d & 0b1111_1000) == 0b1111_0000) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx d &= 0b0000_0111; - if (!isTrailByte(*++p)) - goto Lerr2; - d = (d << 6) | *p & 0b0011_1111; - if (!isTrailByte(*++p)) - goto Lerr2; - d = (d << 6) | *p & 0b0011_1111; - if (!isTrailByte(*++p)) - goto Lerr2; - d = (d << 6) | *p & 0b0011_1111; + mixin(appendSixBits ~ + checkNextByte ~ appendSixBits ~ + checkNextByte ~ appendSixBits); } else // 5 and 6 byte UTF-8 sequences are not allowed yet. @@ -2653,13 +2633,26 @@ // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx goto Lerr; + assert(isTrailByte(*p)); + if (!isEncodable(d)) { Lerr: - // Move to next valid UTF-8 sequence or ASCII character. - while (++p < end && isTrailByte(*p)) {} - assert(p < end); + // Three cases: + // *) the UTF-8 sequence was successfully decoded but the resulting + // character is invalid. + // p points to last trail byte in the sequence. + // *) the UTF-8 sequence is overlong. + // p points to second byte in the sequence. + // *) the UTF-8 sequence has more than 4 bytes or starts with + // a trail byte. + // p points to second byte in the sequence. + assert(isTrailByte(*p)); + // Move to next ASCII character or lead byte of a UTF-8 sequence. + while (p < (end-1) && isTrailByte(*p)) + ++p; --p; + assert(!isTrailByte(p[1])); Lerr2: d = 0; error(this.p, MID.InvalidUTF8Sequence);