Mercurial > projects > dil
comparison trunk/src/dil/lexer/Lexer.d @ 789:c1d5cfd7aa44
Implemented string literal conversion.
Removed two MID messages.
Added MSG.InvalidUTF8SequenceInString.
Added toUTF16() and toUTF32().
Fixed escape sequences.
Added formatBytes() and findInvalidUTF8Sequence().
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Mon, 25 Feb 2008 02:56:22 +0100 |
parents | 580d4ca9f1ff |
children | cf2ad5df025c |
comparison
equal
deleted
inserted
replaced
788:139c9a6a39a8 | 789:c1d5cfd7aa44 |
---|---|
353 return scanNormalStringLiteral(t); | 353 return scanNormalStringLiteral(t); |
354 case '\\': | 354 case '\\': |
355 char[] buffer; | 355 char[] buffer; |
356 do | 356 do |
357 { | 357 { |
358 c = scanEscapeSequence(); | 358 bool isBinary; |
359 if (isascii(c)) | 359 c = scanEscapeSequence(isBinary); |
360 if (isascii(c) || isBinary) | |
360 buffer ~= c; | 361 buffer ~= c; |
361 else | 362 else |
362 encodeUTF8(buffer, c); | 363 encodeUTF8(buffer, c); |
363 } while (*p == '\\') | 364 } while (*p == '\\') |
364 buffer ~= 0; | 365 buffer ~= 0; |
921 return scanNormalStringLiteral(t); | 922 return scanNormalStringLiteral(t); |
922 case '\\': | 923 case '\\': |
923 char[] buffer; | 924 char[] buffer; |
924 do | 925 do |
925 { | 926 { |
926 c = scanEscapeSequence(); | 927 bool isBinary; |
927 if (isascii(c)) | 928 c = scanEscapeSequence(isBinary); |
929 if (isascii(c) || isBinary) | |
928 buffer ~= c; | 930 buffer ~= c; |
929 else | 931 else |
930 encodeUTF8(buffer, c); | 932 encodeUTF8(buffer, c); |
931 } while (*p == '\\') | 933 } while (*p == '\\') |
932 buffer ~= 0; | 934 buffer ~= 0; |
1222 Lreturn: | 1224 Lreturn: |
1223 t.str = buffer ~ '\0'; | 1225 t.str = buffer ~ '\0'; |
1224 t.end = p; | 1226 t.end = p; |
1225 return; | 1227 return; |
1226 case '\\': | 1228 case '\\': |
1227 c = scanEscapeSequence(); | 1229 bool isBinary; |
1230 c = scanEscapeSequence(isBinary); | |
1228 --p; | 1231 --p; |
1229 if (isascii(c)) | 1232 if (isascii(c) || isBinary) |
1230 break; | 1233 buffer ~= c; |
1231 encodeUTF8(buffer, c); | 1234 else |
1235 encodeUTF8(buffer, c); | |
1232 continue; | 1236 continue; |
1233 case '\r': | 1237 case '\r': |
1234 if (p[1] == '\n') | 1238 if (p[1] == '\n') |
1235 ++p; | 1239 ++p; |
1236 case '\n': | 1240 case '\n': |
1264 ++p; | 1268 ++p; |
1265 t.kind = TOK.CharLiteral; | 1269 t.kind = TOK.CharLiteral; |
1266 switch (*p) | 1270 switch (*p) |
1267 { | 1271 { |
1268 case '\\': | 1272 case '\\': |
1269 t.dchar_ = scanEscapeSequence(); | 1273 bool notused; |
1274 t.dchar_ = scanEscapeSequence(notused); | |
1270 break; | 1275 break; |
1271 case '\'': | 1276 case '\'': |
1272 error(t.start, MID.EmptyCharacterLiteral); | 1277 error(t.start, MID.EmptyCharacterLiteral); |
1273 break; | 1278 break; |
1274 default: | 1279 default: |
1706 | 1711 |
1707 --inTokenString; | 1712 --inTokenString; |
1708 } | 1713 } |
1709 } // version(D2) | 1714 } // version(D2) |
1710 | 1715 |
1711 dchar scanEscapeSequence() | 1716 dchar scanEscapeSequence(ref bool isBinary) |
1712 out(result) | 1717 out(result) |
1713 { assert(isValidChar(result)); } | 1718 { assert(isValidChar(result)); } |
1714 body | 1719 body |
1715 { | 1720 { |
1716 assert(*p == '\\'); | 1721 assert(*p == '\\'); |
1728 uint digits = 2; | 1733 uint digits = 2; |
1729 | 1734 |
1730 switch (*p) | 1735 switch (*p) |
1731 { | 1736 { |
1732 case 'x': | 1737 case 'x': |
1738 isBinary = true; | |
1739 case_Unicode: | |
1733 assert(c == 0); | 1740 assert(c == 0); |
1741 assert(digits == 2 || digits == 4 || digits == 8); | |
1734 while (1) | 1742 while (1) |
1735 { | 1743 { |
1736 ++p; | 1744 ++p; |
1737 if (ishexad(*p)) | 1745 if (ishexad(*p)) |
1738 { | 1746 { |
1742 else if (*p <= 'F') | 1750 else if (*p <= 'F') |
1743 c += *p - 'A' + 10; | 1751 c += *p - 'A' + 10; |
1744 else | 1752 else |
1745 c += *p - 'a' + 10; | 1753 c += *p - 'a' + 10; |
1746 | 1754 |
1747 if (!--digits) | 1755 if (--digits == 0) |
1748 { | 1756 { |
1749 ++p; | 1757 ++p; |
1750 if (isValidChar(c)) | 1758 if (isValidChar(c)) |
1751 return c; // Return valid escape value. | 1759 return c; // Return valid escape value. |
1752 | 1760 |
1753 error(sequenceStart, MID.InvalidUnicodeEscapeSequence, sequenceStart[0..p-sequenceStart]); | 1761 error(sequenceStart, MID.InvalidUnicodeEscapeSequence, |
1762 sequenceStart[0..p-sequenceStart]); | |
1754 break; | 1763 break; |
1755 } | 1764 } |
1756 continue; | 1765 continue; |
1757 } | 1766 } |
1758 | 1767 |
1759 error(sequenceStart, MID.InsufficientHexDigits); | 1768 error(sequenceStart, MID.InsufficientHexDigits, |
1769 sequenceStart[0..p-sequenceStart]); | |
1760 break; | 1770 break; |
1761 } | 1771 } |
1762 break; | 1772 break; |
1763 case 'u': | 1773 case 'u': |
1764 digits = 4; | 1774 digits = 4; |
1765 goto case 'x'; | 1775 goto case_Unicode; |
1766 case 'U': | 1776 case 'U': |
1767 digits = 8; | 1777 digits = 8; |
1768 goto case 'x'; | 1778 goto case_Unicode; |
1769 default: | 1779 default: |
1770 if (isoctal(*p)) | 1780 if (isoctal(*p)) |
1771 { | 1781 { |
1782 isBinary = true; | |
1772 assert(c == 0); | 1783 assert(c == 0); |
1773 c += *p - '0'; | 1784 c += *p - '0'; |
1774 ++p; | 1785 ++p; |
1775 if (!isoctal(*p)) | 1786 if (!isoctal(*p)) |
1776 return c; | 1787 return c; |
1780 if (!isoctal(*p)) | 1791 if (!isoctal(*p)) |
1781 return c; | 1792 return c; |
1782 c *= 8; | 1793 c *= 8; |
1783 c += *p - '0'; | 1794 c += *p - '0'; |
1784 ++p; | 1795 ++p; |
1785 return c; // Return valid escape value. | 1796 return c & 0xFF; // Return valid escape value. |
1786 } | 1797 } |
1787 else if(*p == '&') | 1798 else if(*p == '&') |
1788 { | 1799 { |
1789 if (isalpha(*++p)) | 1800 if (isalpha(*++p)) |
1790 { | 1801 { |
2608 ++p; | 2619 ++p; |
2609 --p; | 2620 --p; |
2610 assert(!isTrailByte(p[1])); | 2621 assert(!isTrailByte(p[1])); |
2611 Lerr2: | 2622 Lerr2: |
2612 d = REPLACEMENT_CHAR; | 2623 d = REPLACEMENT_CHAR; |
2613 error(this.p, MID.InvalidUTF8Sequence); | 2624 error(this.p, MID.InvalidUTF8Sequence, formatBytes(this.p, p)); |
2614 } | 2625 } |
2615 | 2626 |
2616 this.p = p; | 2627 this.p = p; |
2617 return d; | 2628 return d; |
2618 } | 2629 } |
2665 str ~= b[0..6]; | 2676 str ~= b[0..6]; |
2666 } | 2677 } |
2667 +/ | 2678 +/ |
2668 else | 2679 else |
2669 assert(0); | 2680 assert(0); |
2681 } | |
2682 | |
2683 /// Formats the bytes between start and end. | |
2684 /// Returns: e.g.: abc -> \x61\x62\x63 | |
2685 static char[] formatBytes(char* start, char* end) | |
2686 { | |
2687 auto strLen = end-start; | |
2688 const formatLen = `\xXX`.length; | |
2689 char[] result = new char[strLen*formatLen]; // Reserve space. | |
2690 result.length = 0; | |
2691 foreach (c; cast(ubyte[])start[0..strLen]) | |
2692 result ~= Format("\\x{:X}", c); | |
2693 return result; | |
2694 } | |
2695 | |
2696 /// Searches for an invalid UTF-8 sequence in str. | |
2697 /// Returns: a formatted string of the invalid sequence (e.g. \xC0\x80). | |
2698 static string findInvalidUTF8Sequence(string str) | |
2699 { | |
2700 char* p = str.ptr, end = p + str.length; | |
2701 while (p < end) | |
2702 { | |
2703 if (decode(p, end) == ERROR_CHAR) | |
2704 { | |
2705 auto begin = p; | |
2706 // Skip trail-bytes. | |
2707 while (++p < end && isTrailByte(*p)) | |
2708 {} | |
2709 return Lexer.formatBytes(begin, p); | |
2710 } | |
2711 } | |
2712 assert(p == end); | |
2713 return ""; | |
2670 } | 2714 } |
2671 } | 2715 } |
2672 | 2716 |
2673 unittest | 2717 unittest |
2674 { | 2718 { |