Mercurial > projects > dil
comparison src/dil/Unicode.d @ 806:bcb74c9b895c
Moved out files in the trunk folder to the root.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Sun, 09 Mar 2008 00:12:19 +0100 |
parents | trunk/src/dil/Unicode.d@c1d5cfd7aa44 |
children |
comparison
equal
deleted
inserted
replaced
805:a3fab8b74a7d | 806:bcb74c9b895c |
---|---|
1 /++ | |
2 Author: Aziz Köksal | |
3 License: GPL3 | |
4 +/ | |
5 module dil.Unicode; | |
6 public import util.uni : isUniAlpha; | |
7 | |
8 /// U+FFFD = �. Substituted for invalid Unicode characters or sequences. | |
9 const dchar REPLACEMENT_CHAR = '\uFFFD'; | |
10 const char[3] REPLACEMENT_STR = \uFFFD; /// Ditto, as a 3-byte UTF-8 string. | |
11 /// Invalid character (a surrogate code point), returned by the decode functions on errors. | |
12 const dchar ERROR_CHAR = 0xD800; | |
13 | |
14 /// Returns: true if d is not a surrogate code point (0xD800..0xDFFF) | |
15 /// and not higher than 0x10FFFF. | |
16 bool isValidChar(dchar d) | |
17 { | |
18 return d < 0xD800 || d > 0xDFFF && d <= 0x10FFFF; | |
19 } | |
20 | |
21 /// There are a total of 66 noncharacters. | |
22 /// Returns: true if d is one of them. | |
23 /// See_also: Chapter 16.7 Noncharacters in Unicode 5.0 | |
24 bool isNoncharacter(dchar d) | |
25 { | |
26 return 0xFDD0 <= d && d <= 0xFDEF || // 32: U+FDD0..U+FDEF | |
27 d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE; // 34: the last two code points of each of the 17 planes | |
28 } | |
29 | |
30 /// Returns: true if b is a trail (continuation) byte of a UTF-8 sequence. | |
31 bool isTrailByte(ubyte b) | |
32 { | |
33 return (b & 0xC0) == 0x80; // 10xx_xxxx | |
34 } | |
35 | |
36 /// Returns: true if b is a lead byte of a multi-byte UTF-8 sequence (false for ASCII bytes). | |
37 bool isLeadByte(ubyte b) | |
38 { | |
39 return (b & 0xC0) == 0xC0; // 11xx_xxxx | |
40 } | |
41 | |
42 /// Returns true and advances ref_p past the sequence only if the byte at ref_p is non-ASCII and isUniAlpha() accepts the decoded character. | |
43 bool isUnicodeAlpha(ref char* ref_p, char* end) | |
44 in { assert(ref_p && ref_p < end); } | |
45 body | |
46 { | |
47 if (*ref_p < 0x80) | |
48 return false; | |
49 auto p = ref_p; | |
50 auto c = decode(p, end); | |
51 if (!isUniAlpha(c)) | |
52 return false; | |
53 ref_p = p; | |
54 return true; | |
55 } | |
56 | |
57 /// Decodes a UTF-8 character from str at index. Returns ERROR_CHAR if the sequence is invalid. | |
58 /// Params: | |
59 /// index = set to one past the ASCII char or one past the last trail byte | |
60 /// of the valid UTF-8 sequence. Left unchanged on error. | |
61 dchar decode(char[] str, ref size_t index) | |
62 in { assert(str.length && index < str.length); } | |
63 out { assert(index <= str.length); } | |
64 body | |
65 { | |
66 char* p = str.ptr + index; | |
67 char* end = str.ptr + str.length; | |
68 dchar c = decode(p, end); | |
69 if (c != ERROR_CHAR) | |
70 index = p - str.ptr; | |
71 return c; | |
72 } | |
73 | |
74 /// Decodes a UTF-8 character starting at ref_p. Returns ERROR_CHAR on an invalid, overlong or truncated sequence. | |
75 /// Params: | |
76 /// ref_p = set to one past the ASCII char or one past the last trail byte | |
77 /// of the valid UTF-8 sequence. Left unchanged on error. | |
78 dchar decode(ref char* ref_p, char* end) | |
79 in { assert(ref_p && ref_p < end); } | |
80 out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); } | |
81 body | |
82 { | |
83 char* p = ref_p; | |
84 dchar c = *p; | |
85 | |
86 if (c < 0x80) | |
87 return ref_p++, c; | |
88 | |
89 p++; // Move to second byte. | |
90 if (!(p < end)) | |
91 return ERROR_CHAR; | |
92 | |
93 // Error if second byte is not a trail byte. | |
94 if (!isTrailByte(*p)) | |
95 return ERROR_CHAR; | |
96 | |
97 // Check for overlong sequences. The listed cases fall through to default. | |
98 switch (c) | |
99 { | |
100 case 0xE0, // 11100000 100xxxxx | |
101 0xF0, // 11110000 1000xxxx | |
102 0xF8, // 11111000 10000xxx | |
103 0xFC: // 11111100 100000xx | |
104 if ((*p & c) == 0x80) | |
105 return ERROR_CHAR; | |
106 default: | |
107 if ((c & 0xFE) == 0xC0) // 1100000x | |
108 return ERROR_CHAR; | |
109 } | |
110 | |
111 const char[] checkNextByte = "if (!(++p < end && isTrailByte(*p)))" | |
112 " return ERROR_CHAR;"; | |
113 const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;"; | |
114 | |
115 // Decode: mask the payload bits of the first byte, then append six bits per trail byte. | |
116 if ((c & 0b1110_0000) == 0b1100_0000) | |
117 { | |
118 // 110xxxxx 10xxxxxx | |
119 c &= 0b0001_1111; | |
120 mixin(appendSixBits); | |
121 } | |
122 else if ((c & 0b1111_0000) == 0b1110_0000) | |
123 { | |
124 // 1110xxxx 10xxxxxx 10xxxxxx | |
125 c &= 0b0000_1111; | |
126 mixin(appendSixBits ~ | |
127 checkNextByte ~ appendSixBits); | |
128 } | |
129 else if ((c & 0b1111_1000) == 0b1111_0000) | |
130 { | |
131 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
132 c &= 0b0000_0111; | |
133 mixin(appendSixBits ~ | |
134 checkNextByte ~ appendSixBits ~ | |
135 checkNextByte ~ appendSixBits); | |
136 } | |
137 else | |
138 // 5 and 6 byte UTF-8 sequences are not allowed yet. A first byte of the form 10xxxxxx or 0xFE/0xFF is rejected here too. | |
139 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | |
140 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | |
141 return ERROR_CHAR; | |
142 | |
143 assert(isTrailByte(*p)); | |
144 | |
145 if (!isValidChar(c)) | |
146 return ERROR_CHAR; | |
147 ref_p = p+1; | |
148 return c; | |
149 } | |
150 | |
151 /// Encodes c as UTF-8 (1 to 4 bytes) and appends it to str. c must satisfy isValidChar(). | |
152 void encode(ref char[] str, dchar c) | |
153 { | |
154 assert(isValidChar(c), "check if character is valid before calling encode()."); | |
155 | |
156 char[6] b = void; | |
157 if (c < 0x80) | |
158 str ~= c; | |
159 else if (c < 0x800) | |
160 { | |
161 b[0] = 0xC0 | (c >> 6); | |
162 b[1] = 0x80 | (c & 0x3F); | |
163 str ~= b[0..2]; | |
164 } | |
165 else if (c < 0x10000) | |
166 { | |
167 b[0] = 0xE0 | (c >> 12); | |
168 b[1] = 0x80 | ((c >> 6) & 0x3F); | |
169 b[2] = 0x80 | (c & 0x3F); | |
170 str ~= b[0..3]; | |
171 } | |
172 else if (c < 0x200000) | |
173 { | |
174 b[0] = 0xF0 | (c >> 18); | |
175 b[1] = 0x80 | ((c >> 12) & 0x3F); | |
176 b[2] = 0x80 | ((c >> 6) & 0x3F); | |
177 b[3] = 0x80 | (c & 0x3F); | |
178 str ~= b[0..4]; | |
179 } | |
180 /+ // There are no 5 and 6 byte UTF-8 sequences yet. | |
181 else if (c < 0x4000000) | |
182 { | |
183 b[0] = 0xF8 | (c >> 24); | |
184 b[1] = 0x80 | ((c >> 18) & 0x3F); | |
185 b[2] = 0x80 | ((c >> 12) & 0x3F); | |
186 b[3] = 0x80 | ((c >> 6) & 0x3F); | |
187 b[4] = 0x80 | (c & 0x3F); | |
188 str ~= b[0..5]; | |
189 } | |
190 else if (c < 0x80000000) | |
191 { | |
192 b[0] = 0xFC | (c >> 30); | |
193 b[1] = 0x80 | ((c >> 24) & 0x3F); | |
194 b[2] = 0x80 | ((c >> 18) & 0x3F); | |
195 b[3] = 0x80 | ((c >> 12) & 0x3F); | |
196 b[4] = 0x80 | ((c >> 6) & 0x3F); | |
197 b[5] = 0x80 | (c & 0x3F); | |
198 str ~= b[0..6]; | |
199 } | |
200 +/ | |
201 else | |
202 assert(0); | |
203 } | |
204 | |
205 /// Encodes c as UTF-16 (one code unit or a surrogate pair) and appends it to str. | |
206 void encode(ref wchar[] str, dchar c) | |
207 in { assert(isValidChar(c)); } | |
208 body | |
209 { | |
210 if (c < 0x10000) | |
211 str ~= cast(wchar)c; | |
212 else | |
213 { // Encode with surrogate pair. | |
214 wchar[2] pair = void; | |
215 c -= 0x10000; // c' | |
216 // higher10bits(c') | 0b1101_10xx_xxxx_xxxx | |
217 pair[0] = (c >> 10) | 0xD800; | |
218 // lower10bits(c') | 0b1101_11yy_yyyy_yyyy | |
219 pair[1] = (c & 0x3FF) | 0xDC00; | |
220 str ~= pair; | |
221 } | |
222 } | |
223 | |
224 /// Decodes a character from a UTF-16 sequence. | |
225 /// Params: | |
226 /// str = the UTF-16 sequence. | |
227 /// index = where to start from; advanced by 1 or 2 code units on success, unchanged on error. | |
228 /// Returns: the decoded character, or ERROR_CHAR in case of an error in the sequence. | |
229 dchar decode(wchar[] str, ref size_t index) | |
230 { | |
231 assert(str.length && index < str.length); | |
232 dchar c = str[index]; | |
233 if (0xD800 > c || c > 0xDFFF) | |
234 { | |
235 ++index; | |
236 return c; | |
237 } | |
238 if (c <= 0xDBFF && index+1 != str.length) | |
239 { | |
240 wchar c2 = str[index+1]; | |
241 if (0xDC00 <= c2 && c2 <= 0xDFFF) | |
242 { // Decode surrogate pair. | |
243 // (c - 0xD800) << 10 + 0x10000 -> | |
244 // (c - 0xD800 + 0x40) << 10 -> | |
245 c = (c - 0xD7C0) << 10; | |
246 c |= (c2 & 0x3FF); | |
247 index += 2; | |
248 return c; | |
249 } | |
250 } | |
251 return ERROR_CHAR; | |
252 } | |
253 | |
254 /// Decodes a character from a UTF-16 sequence. | |
255 /// Params: | |
256 /// p = start of the UTF-16 sequence; advanced by 1 or 2 code units on success, unchanged on error. | |
257 /// end = one past the end of the sequence. | |
258 /// Returns: the decoded character, or ERROR_CHAR in case of an error in the sequence. | |
259 dchar decode(ref wchar* p, wchar* end) | |
260 { | |
261 assert(p && p < end); | |
262 dchar c = *p; | |
263 if (0xD800 > c || c > 0xDFFF) | |
264 { | |
265 ++p; | |
266 return c; | |
267 } | |
268 if (c <= 0xDBFF && p+1 != end) | |
269 { | |
270 wchar c2 = p[1]; | |
271 if (0xDC00 <= c2 && c2 <= 0xDFFF) | |
272 { | |
273 c = (c - 0xD7C0) << 10; | |
274 c |= (c2 & 0x3FF); | |
275 p += 2; | |
276 return c; | |
277 } | |
278 } | |
279 return ERROR_CHAR; | |
280 } | |
281 | |
282 /// Decodes a character from a zero-terminated UTF-16 string. After a high surrogate, p[1] is read without a bounds check. | |
283 /// Params: | |
284 /// p = start of the UTF-16 sequence; advanced by 1 or 2 code units on success, unchanged on error. | |
285 /// Returns: the decoded character, or ERROR_CHAR in case of an error in the sequence. | |
286 dchar decode(ref wchar* p) | |
287 { | |
288 assert(p); | |
289 dchar c = *p; | |
290 if (0xD800 > c || c > 0xDFFF) | |
291 { | |
292 ++p; | |
293 return c; | |
294 } | |
295 if (c <= 0xDBFF) | |
296 { | |
297 wchar c2 = p[1]; | |
298 if (0xDC00 <= c2 && c2 <= 0xDFFF) | |
299 { | |
300 c = (c - 0xD7C0) << 10; | |
301 c |= (c2 & 0x3FF); | |
302 p += 2; | |
303 return c; | |
304 } | |
305 } | |
306 return ERROR_CHAR; | |
307 } | |
308 | |
309 /// Converts a UTF-8 string to a UTF-16 string. Each invalid sequence is replaced by REPLACEMENT_CHAR. | |
310 wchar[] toUTF16(char[] str) | |
311 { | |
312 wchar[] result; | |
313 size_t idx; | |
314 while (idx < str.length) | |
315 { | |
316 auto c = decode(str, idx); | |
317 if (c == ERROR_CHAR) | |
318 { // Skip the offending byte and any trail bytes following it. | |
319 while (++idx < str.length && isTrailByte(str[idx])) | |
320 {} | |
321 c = REPLACEMENT_CHAR; | |
322 } | |
323 encode(result, c); | |
324 } | |
325 return result; | |
326 } | |
327 | |
328 /// Converts a UTF-8 string to a UTF-32 string. Each invalid sequence is replaced by REPLACEMENT_CHAR. | |
329 dchar[] toUTF32(char[] str) | |
330 { | |
331 dchar[] result; | |
332 size_t idx; | |
333 while (idx < str.length) | |
334 { | |
335 auto c = decode(str, idx); | |
336 if (c == ERROR_CHAR) | |
337 { // Skip the offending byte and any trail bytes following it. | |
338 while (++idx < str.length && isTrailByte(str[idx])) | |
339 {} | |
340 c = REPLACEMENT_CHAR; | |
341 } | |
342 result ~= c; | |
343 } | |
344 return result; | |
345 } | |