comparison trunk/src/dil/Unicode.d @ 722:ceaac6a24258

Added isUnicodeAlpha() for DDocParser and MacroParser.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Fri, 01 Feb 2008 19:44:00 +0100
parents d050e211402b
children f88b5285b86b
comparison
equal deleted inserted replaced
721:8955296dd807 722:ceaac6a24258
40 bool isLeadByte(ubyte b) 40 bool isLeadByte(ubyte b)
41 { 41 {
42 return (b & 0xC0) == 0xC0; // 11xx_xxxx 42 return (b & 0xC0) == 0xC0; // 11xx_xxxx
43 } 43 }
44 44
45 /// Advances ref_p only if this is a valid Unicode alpha character.
46 bool isUnicodeAlpha(ref char* ref_p, char* end)
47 in { assert(ref_p && ref_p < end); }
48 body
49 {
50 if (*ref_p < 0x80)
51 return false;
52 auto p = ref_p;
53 auto c = decode(p, end);
54 if (!isUniAlpha(c))
55 return false;
56 ref_p = p;
57 return true;
58 }
59
60 /// index is set one past the last trail byte of the valid UTF-8 sequence.
45 dchar decode(char[] str, ref size_t index) 61 dchar decode(char[] str, ref size_t index)
46 in { assert(str.length); } 62 in { assert(str.length && index < str.length); }
47 out(c) { assert(isValidChar(c)); } 63 out(c) { assert(isValidChar(c)); }
48 body 64 body
49 { 65 {
50 char* p = str.ptr + index; 66 char* p = str.ptr + index;
51 char* end = str.ptr + str.length; 67 char* end = str.ptr + str.length;
68 dchar c = decode(p, end);
69 if (c != ERROR_CHAR)
70 index = p - str.ptr + 1;
71 return c;
72 }
73
74 /// ref_p is set to the last trail byte of the valid UTF-8 sequence.
75 dchar decode(ref char* ref_p, char* end)
76 in { assert(ref_p && ref_p < end); }
77 out(c) { assert(isValidChar(c)); }
78 body
79 {
80 char* p = ref_p;
52 dchar c = *p; 81 dchar c = *p;
53 82
54 if (!(p < end))
55 return ERROR_CHAR;
56
57 if (c < 0x80) 83 if (c < 0x80)
58 { 84 {
59 ++index; 85 ref_p++;
60 return c; 86 return c;
61 } 87 }
62 88
63 ++p; // Move to second byte. 89 p++; // Move to second byte.
64 if (!(p < end)) 90 if (!(p < end))
65 return ERROR_CHAR; 91 return ERROR_CHAR;
66 92
67 // Error if second byte is not a trail byte. 93 // Error if second byte is not a trail byte.
68 if (!isTrailByte(*p)) 94 if (!isTrailByte(*p))
80 default: 106 default:
81 if ((c & 0xFE) == 0xC0) // 1100000x 107 if ((c & 0xFE) == 0xC0) // 1100000x
82 return ERROR_CHAR; 108 return ERROR_CHAR;
83 } 109 }
84 110
85 const char[] checkNextByte = "if (++p < end && !isTrailByte(*p))" 111 const char[] checkNextByte = "if (!(++p < end && isTrailByte(*p)))"
86 " return ERROR_CHAR;"; 112 " return ERROR_CHAR;";
87 const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;"; 113 const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;";
88 114
89 auto next_index = index;
90 // Decode 115 // Decode
91 if ((c & 0b1110_0000) == 0b1100_0000) 116 if ((c & 0b1110_0000) == 0b1100_0000)
92 { 117 {
93 // 110xxxxx 10xxxxxx 118 // 110xxxxx 10xxxxxx
94 c &= 0b0001_1111; 119 c &= 0b0001_1111;
95 mixin(appendSixBits); 120 mixin(appendSixBits);
96 next_index += 2;
97 } 121 }
98 else if ((c & 0b1111_0000) == 0b1110_0000) 122 else if ((c & 0b1111_0000) == 0b1110_0000)
99 { 123 {
100 // 1110xxxx 10xxxxxx 10xxxxxx 124 // 1110xxxx 10xxxxxx 10xxxxxx
101 c &= 0b0000_1111; 125 c &= 0b0000_1111;
102 mixin(appendSixBits ~ 126 mixin(appendSixBits ~
103 checkNextByte ~ appendSixBits); 127 checkNextByte ~ appendSixBits);
104 next_index += 3;
105 } 128 }
106 else if ((c & 0b1111_1000) == 0b1111_0000) 129 else if ((c & 0b1111_1000) == 0b1111_0000)
107 { 130 {
108 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 131 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
109 c &= 0b0000_0111; 132 c &= 0b0000_0111;
110 mixin(appendSixBits ~ 133 mixin(appendSixBits ~
111 checkNextByte ~ appendSixBits ~ 134 checkNextByte ~ appendSixBits ~
112 checkNextByte ~ appendSixBits); 135 checkNextByte ~ appendSixBits);
113 next_index += 4;
114 } 136 }
115 else 137 else
116 // 5 and 6 byte UTF-8 sequences are not allowed yet. 138 // 5 and 6 byte UTF-8 sequences are not allowed yet.
117 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 139 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
118 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 140 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
120 142
121 assert(isTrailByte(*p)); 143 assert(isTrailByte(*p));
122 144
123 if (!isValidChar(c)) 145 if (!isValidChar(c))
124 return ERROR_CHAR; 146 return ERROR_CHAR;
125 index = next_index; 147 ref_p = p;
126 return c; 148 return c;
127 } 149 }
128 150
129 /// Encodes a character and appends it to str. 151 /// Encodes a character and appends it to str.
130 void encode(ref char[] str, dchar c) 152 void encode(ref char[] str, dchar c)