comparison trunk/src/dil/lexer/Lexer.d @ 789:c1d5cfd7aa44

Implemented string literal conversion. Removed two MID messages. Added MSG.InvalidUTF8SequenceInString. Added toUTF16() and toUTF32(). Fixed escape sequences. Added formatBytes() and findInvalidUTF8Sequence().
author Aziz Köksal <aziz.koeksal@gmail.com>
date Mon, 25 Feb 2008 02:56:22 +0100
parents 580d4ca9f1ff
children cf2ad5df025c
comparison
equal deleted inserted replaced
788:139c9a6a39a8 789:c1d5cfd7aa44
353 return scanNormalStringLiteral(t); 353 return scanNormalStringLiteral(t);
354 case '\\': 354 case '\\':
355 char[] buffer; 355 char[] buffer;
356 do 356 do
357 { 357 {
358 c = scanEscapeSequence(); 358 bool isBinary;
359 if (isascii(c)) 359 c = scanEscapeSequence(isBinary);
360 if (isascii(c) || isBinary)
360 buffer ~= c; 361 buffer ~= c;
361 else 362 else
362 encodeUTF8(buffer, c); 363 encodeUTF8(buffer, c);
363 } while (*p == '\\') 364 } while (*p == '\\')
364 buffer ~= 0; 365 buffer ~= 0;
921 return scanNormalStringLiteral(t); 922 return scanNormalStringLiteral(t);
922 case '\\': 923 case '\\':
923 char[] buffer; 924 char[] buffer;
924 do 925 do
925 { 926 {
926 c = scanEscapeSequence(); 927 bool isBinary;
927 if (isascii(c)) 928 c = scanEscapeSequence(isBinary);
929 if (isascii(c) || isBinary)
928 buffer ~= c; 930 buffer ~= c;
929 else 931 else
930 encodeUTF8(buffer, c); 932 encodeUTF8(buffer, c);
931 } while (*p == '\\') 933 } while (*p == '\\')
932 buffer ~= 0; 934 buffer ~= 0;
1222 Lreturn: 1224 Lreturn:
1223 t.str = buffer ~ '\0'; 1225 t.str = buffer ~ '\0';
1224 t.end = p; 1226 t.end = p;
1225 return; 1227 return;
1226 case '\\': 1228 case '\\':
1227 c = scanEscapeSequence(); 1229 bool isBinary;
1230 c = scanEscapeSequence(isBinary);
1228 --p; 1231 --p;
1229 if (isascii(c)) 1232 if (isascii(c) || isBinary)
1230 break; 1233 buffer ~= c;
1231 encodeUTF8(buffer, c); 1234 else
1235 encodeUTF8(buffer, c);
1232 continue; 1236 continue;
1233 case '\r': 1237 case '\r':
1234 if (p[1] == '\n') 1238 if (p[1] == '\n')
1235 ++p; 1239 ++p;
1236 case '\n': 1240 case '\n':
1264 ++p; 1268 ++p;
1265 t.kind = TOK.CharLiteral; 1269 t.kind = TOK.CharLiteral;
1266 switch (*p) 1270 switch (*p)
1267 { 1271 {
1268 case '\\': 1272 case '\\':
1269 t.dchar_ = scanEscapeSequence(); 1273 bool notused;
1274 t.dchar_ = scanEscapeSequence(notused);
1270 break; 1275 break;
1271 case '\'': 1276 case '\'':
1272 error(t.start, MID.EmptyCharacterLiteral); 1277 error(t.start, MID.EmptyCharacterLiteral);
1273 break; 1278 break;
1274 default: 1279 default:
1706 1711
1707 --inTokenString; 1712 --inTokenString;
1708 } 1713 }
1709 } // version(D2) 1714 } // version(D2)
1710 1715
1711 dchar scanEscapeSequence() 1716 dchar scanEscapeSequence(ref bool isBinary)
1712 out(result) 1717 out(result)
1713 { assert(isValidChar(result)); } 1718 { assert(isValidChar(result)); }
1714 body 1719 body
1715 { 1720 {
1716 assert(*p == '\\'); 1721 assert(*p == '\\');
1728 uint digits = 2; 1733 uint digits = 2;
1729 1734
1730 switch (*p) 1735 switch (*p)
1731 { 1736 {
1732 case 'x': 1737 case 'x':
1738 isBinary = true;
1739 case_Unicode:
1733 assert(c == 0); 1740 assert(c == 0);
1741 assert(digits == 2 || digits == 4 || digits == 8);
1734 while (1) 1742 while (1)
1735 { 1743 {
1736 ++p; 1744 ++p;
1737 if (ishexad(*p)) 1745 if (ishexad(*p))
1738 { 1746 {
1742 else if (*p <= 'F') 1750 else if (*p <= 'F')
1743 c += *p - 'A' + 10; 1751 c += *p - 'A' + 10;
1744 else 1752 else
1745 c += *p - 'a' + 10; 1753 c += *p - 'a' + 10;
1746 1754
1747 if (!--digits) 1755 if (--digits == 0)
1748 { 1756 {
1749 ++p; 1757 ++p;
1750 if (isValidChar(c)) 1758 if (isValidChar(c))
1751 return c; // Return valid escape value. 1759 return c; // Return valid escape value.
1752 1760
1753 error(sequenceStart, MID.InvalidUnicodeEscapeSequence, sequenceStart[0..p-sequenceStart]); 1761 error(sequenceStart, MID.InvalidUnicodeEscapeSequence,
1762 sequenceStart[0..p-sequenceStart]);
1754 break; 1763 break;
1755 } 1764 }
1756 continue; 1765 continue;
1757 } 1766 }
1758 1767
1759 error(sequenceStart, MID.InsufficientHexDigits); 1768 error(sequenceStart, MID.InsufficientHexDigits,
1769 sequenceStart[0..p-sequenceStart]);
1760 break; 1770 break;
1761 } 1771 }
1762 break; 1772 break;
1763 case 'u': 1773 case 'u':
1764 digits = 4; 1774 digits = 4;
1765 goto case 'x'; 1775 goto case_Unicode;
1766 case 'U': 1776 case 'U':
1767 digits = 8; 1777 digits = 8;
1768 goto case 'x'; 1778 goto case_Unicode;
1769 default: 1779 default:
1770 if (isoctal(*p)) 1780 if (isoctal(*p))
1771 { 1781 {
1782 isBinary = true;
1772 assert(c == 0); 1783 assert(c == 0);
1773 c += *p - '0'; 1784 c += *p - '0';
1774 ++p; 1785 ++p;
1775 if (!isoctal(*p)) 1786 if (!isoctal(*p))
1776 return c; 1787 return c;
1780 if (!isoctal(*p)) 1791 if (!isoctal(*p))
1781 return c; 1792 return c;
1782 c *= 8; 1793 c *= 8;
1783 c += *p - '0'; 1794 c += *p - '0';
1784 ++p; 1795 ++p;
1785 return c; // Return valid escape value. 1796 return c & 0xFF; // Return valid escape value.
1786 } 1797 }
1787 else if(*p == '&') 1798 else if(*p == '&')
1788 { 1799 {
1789 if (isalpha(*++p)) 1800 if (isalpha(*++p))
1790 { 1801 {
2608 ++p; 2619 ++p;
2609 --p; 2620 --p;
2610 assert(!isTrailByte(p[1])); 2621 assert(!isTrailByte(p[1]));
2611 Lerr2: 2622 Lerr2:
2612 d = REPLACEMENT_CHAR; 2623 d = REPLACEMENT_CHAR;
2613 error(this.p, MID.InvalidUTF8Sequence); 2624 error(this.p, MID.InvalidUTF8Sequence, formatBytes(this.p, p));
2614 } 2625 }
2615 2626
2616 this.p = p; 2627 this.p = p;
2617 return d; 2628 return d;
2618 } 2629 }
2665 str ~= b[0..6]; 2676 str ~= b[0..6];
2666 } 2677 }
2667 +/ 2678 +/
2668 else 2679 else
2669 assert(0); 2680 assert(0);
2681 }
2682
2683 /// Formats the bytes between start and end.
2684 /// Returns: e.g.: abc -> \x61\x62\x63
2685 static char[] formatBytes(char* start, char* end)
2686 {
2687 auto strLen = end-start;
2688 const formatLen = `\xXX`.length;
2689 char[] result = new char[strLen*formatLen]; // Reserve space.
2690 result.length = 0;
2691 foreach (c; cast(ubyte[])start[0..strLen])
2692 result ~= Format("\\x{:X}", c);
2693 return result;
2694 }
2695
2696 /// Searches for an invalid UTF-8 sequence in str.
2697 /// Returns: a formatted string of the invalid sequence (e.g. \xC0\x80).
2698 static string findInvalidUTF8Sequence(string str)
2699 {
2700 char* p = str.ptr, end = p + str.length;
2701 while (p < end)
2702 {
2703 if (decode(p, end) == ERROR_CHAR)
2704 {
2705 auto begin = p;
2706 // Skip trail-bytes.
2707 while (++p < end && isTrailByte(*p))
2708 {}
2709 return Lexer.formatBytes(begin, p);
2710 }
2711 }
2712 assert(p == end);
2713 return "";
2670 } 2714 }
2671 } 2715 }
2672 2716
2673 unittest 2717 unittest
2674 { 2718 {