Mercurial > projects > ldc
comparison tango/lib/compiler/llvmdc/util/utf.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!!
Initial commit after moving to Tango instead of Phobos.
Lots of bugfixes...
This build is not suitable for most things.
author | lindquist |
---|---|
date | Fri, 11 Jan 2008 17:57:40 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
131:5825d48b27d1 | 132:1700239cab2e |
---|---|
1 // utf.d | |
2 | |
3 /* | |
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com | |
5 * Written by Walter Bright | |
6 * | |
7 * This software is provided 'as-is', without any express or implied | |
8 * warranty. In no event will the authors be held liable for any damages | |
9 * arising from the use of this software. | |
10 * | |
11 * Permission is granted to anyone to use this software for any purpose, | |
12 * including commercial applications, and to alter it and redistribute it | |
13 * freely, subject to the following restrictions: | |
14 * | |
15 * o The origin of this software must not be misrepresented; you must not | |
16 * claim that you wrote the original software. If you use this software | |
17 * in a product, an acknowledgment in the product documentation would be | |
18 * appreciated but is not required. | |
19 * o Altered source versions must be plainly marked as such, and must not | |
20 * be misrepresented as being the original software. | |
21 * o This notice may not be removed or altered from any source | |
22 * distribution. | |
23 */ | |
24 | |
25 // Description of UTF-8 at: | |
26 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 | |
27 // http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335 | |
28 | |
29 | |
30 module util.utf; | |
31 | |
32 | |
33 extern (C) void onUnicodeError( char[] msg, size_t idx ); | |
34 | |
35 | |
/**
 * Tells whether c is a code point this module accepts: any value up to
 * 0x10FFFF that does not lie in the range 0xD800 .. 0xDFFF.
 *
 * Note: FFFE and FFFF are specifically permitted by the Unicode standard
 * for application internal use, but are not allowed for interchange, so
 * they are deliberately not rejected here.
 * (thanks to Arcane Jill)
 */
bool isValidDchar(dchar c)
{
    if (c < 0xD800)
        return true;            // below the excluded 0xD800 .. 0xDFFF range
    if (c <= 0xDFFF)
        return false;           // inside the excluded range
    return c <= 0x10FFFF;       // largest value accepted
}
47 | |
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ordinary ASCII letter is accepted ...
    assert(isValidDchar(cast(dchar)'a') == true);
    // ... while a value above 0x10FFFF is rejected.
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
}
54 | |
55 | |
/* Length in bytes of a UTF-8 sequence, indexed by the value of its
 * leading byte. An entry of 0xFF marks a byte that is not a legal
 * start of a UTF-8 sequence.
 * 0xFF is used instead of 0 so that loops which advance by the stride
 * cannot hang.
 */

ubyte[256] UTF8stride =
[
    // 0x00 .. 0x7F: one byte
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    // 0x80 .. 0xBF: illegal as a leading byte
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,

    // 0xC0 .. 0xDF: two bytes
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    // 0xE0 .. 0xEF: three bytes
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    // 0xF0 .. 0xF7: four, 0xF8 .. 0xFB: five, 0xFC .. 0xFD: six,
    // 0xFE and 0xFF: illegal
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
81 | |
/// Looks up the UTF8stride entry for the byte s[i]: the length of the
/// UTF-8 sequence it starts, or 0xFF when it is not a legal leading byte.
uint stride(char[] s, size_t i)
{
    size_t lead = s[i];
    return UTF8stride[lead];
}
86 | |
/// Returns 2 when s[i] lies in 0xD800 .. 0xDBFF (the first unit of a
/// UTF-16 surrogate pair) and 1 for every other value.
uint stride(wchar[] s, size_t i)
{
    wchar unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;
    return 1;
}
91 | |
/// Always 1; neither s nor i is examined.
uint stride(dchar[] s, size_t i)
{
    return 1;
}
96 | |
/*******************************************
 * Given an index i into an array of char's, and assuming that i is at
 * the start of a UTF character, determine the number of UCS characters
 * that precede that index.
 *
 * onUnicodeError is called with the offending position when a byte that
 * cannot start a sequence is met, or when stepping sequence by sequence
 * overshoots i (i.e. i was not at the start of a character).
 */

size_t toUCSindex(char[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        size_t step = UTF8stride[s[pos]];

        if (step == 0xFF)
        {
            // not a legal leading byte
            onUnicodeError("invalid UTF-8 sequence", pos);
            return count;
        }
        ++count;
        pos += step;
    }

    // Landing past i means i pointed into the middle of a sequence.
    if (pos > i)
        onUnicodeError("invalid UTF-8 sequence", pos);
    return count;
}
123 | |
/// UTF-16 counterpart: counts the characters that precede index i,
/// stepping two units for a value in 0xD800 .. 0xDBFF and one unit
/// otherwise. onUnicodeError is called if the walk overshoots i.
size_t toUCSindex(wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        wchar unit = s[pos];

        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
        ++count;
    }

    // Landing past i means i pointed at the second unit of a pair.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);
    return count;
}
142 | |
/// For dchar arrays the index is returned unchanged; s is not examined.
size_t toUCSindex(dchar[] s, size_t i)
{
    return i;
}
147 | |
/******************************************
 * Given a UCS index n into an array of characters, return the UTF index,
 * i.e. the position in s reached after skipping n sequences.
 *
 * For char[] each step is taken from UTF8stride; onUnicodeError is
 * called with the position of any byte that cannot start a sequence.
 */

size_t toUTFindex(char[] s, size_t n)
{
    size_t pos = 0;

    for (size_t left = n; left != 0; --left)
    {
        uint step = UTF8stride[s[pos]];

        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
165 | |
/// UTF-16 counterpart: skips n characters, advancing two units for a
/// value in 0xD800 .. 0xDBFF and one unit otherwise, and returns the
/// resulting index. No validation is performed.
size_t toUTFindex(wchar[] s, size_t n)
{
    size_t pos = 0;

    for (size_t left = n; left != 0; --left)
    {
        wchar unit = s[pos];
        bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;

        pos += startsPair ? 2 : 1;
    }
    return pos;
}
177 | |
/// For dchar arrays the index is returned unchanged; s is not examined.
size_t toUTFindex(dchar[] s, size_t n)
{
    return n;
}
182 | |
183 /* =================== Decode ======================= */ | |
184 | |
/**
 * Decodes the UTF-8 sequence that starts at s[idx].
 *
 * Params:
 *  s   = UTF-8 text
 *  idx = index of the leading byte; must be less than s.length. On
 *        success it is advanced past the decoded sequence; on error it
 *        is left unchanged.
 *
 * Returns: the decoded character.
 *
 * Errors: onUnicodeError("invalid UTF-8 sequence", idx) is called for a
 * stray continuation byte, a lead byte announcing more than 4 bytes, a
 * sequence that runs off the end of s, an overlong encoding, a trailing
 * byte that is not 10xxxxxx, or a result rejected by isValidDchar.
 */
dchar decode(char[] s, inout size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        size_t len = s.length;
        dchar V;
        size_t i = idx;
        char u = s[i];

        if (u & 0x80)
        {
            uint n;
            char u2;

            /* The following encodings are valid, except for the 5 and 6 byte
             * combinations:
             *  0xxxxxxx
             *  110xxxxx 10xxxxxx
             *  1110xxxx 10xxxxxx 10xxxxxx
             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             */
            // Count the leading 1 bits of u; that count is the sequence length n.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;          // only do the first 4 of 6 encodings
                if (((u << n) & 0x80) == 0)
                {
                    if (n == 1)
                        goto Lerr;      // 10xxxxxx cannot start a sequence
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            if (i + (n - 1) >= len)
                goto Lerr;              // off end of string

            /* The following combinations are overlong, and illegal:
             *  1100000x (10xxxxxx)
             *  11100000 100xxxxx (10xxxxxx)
             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
             * The 5 and 6 byte overlong forms (lead bytes 0xF8 and 0xFC)
             * need no test here: the loop above has already rejected
             * every lead byte with n > 4.
             */
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80))
                goto Lerr;              // overlong combination

            // Fold in 6 payload bits from each trailing byte.
            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;          // trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            if (!isValidDchar(V))
                goto Lerr;
            i += n;
        }
        else
        {
            V = cast(dchar) u;
            i++;
        }

        idx = i;
        return V;

      Lerr:
        onUnicodeError("invalid UTF-8 sequence", i);
        return V;   // dummy return
    }
271 | |
unittest
{
    debug(utf) printf("utf.decode.unittest\n");

    size_t pos;
    dchar ch;

    // Single-byte sequences advance the index by one each.
    static char[] s1 = "abcd";
    pos = 0;
    ch = decode(s1, pos);
    assert(ch == cast(dchar)'a');
    assert(pos == 1);
    ch = decode(s1, pos);
    assert(ch == cast(dchar)'b');
    assert(pos == 2);

    // Two-byte sequence.
    static char[] s2 = "\xC2\xA9";
    pos = 0;
    ch = decode(s2, pos);
    assert(ch == cast(dchar)'\u00A9');
    assert(pos == 2);

    // Three-byte sequence.
    static char[] s3 = "\xE2\x89\xA0";
    pos = 0;
    ch = decode(s3, pos);
    assert(ch == cast(dchar)'\u2260');
    assert(pos == 3);

    // Inputs for which decode must not return normally.
    static char[][] s4 =
    [   "\xE2\x89",             // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (char[] bad; s4)
    {
        try
        {
            pos = 0;
            ch = decode(bad, pos);
            assert(0);
        }
        catch (Object o)
        {
            pos = 23;           // marker: the catch clause was entered
        }
        assert(pos == 23);
    }
}
323 | |
324 /********************************************************/ | |
325 | |
/**
 * Decodes the UTF-16 unit, or surrogate pair, that starts at s[idx].
 *
 * Params:
 *  s   = UTF-16 text
 *  idx = index of the first unit; must be less than s.length. On
 *        success it is advanced by 1 or 2; on error it is left unchanged.
 *
 * Returns: the decoded character.
 *
 * Errors: onUnicodeError is called, with a message describing the
 * problem, for a high surrogate that is the last unit of s, a high
 * surrogate not followed by a low surrogate, a low surrogate on its
 * own, and for the values 0xFFFE and 0xFFFF.
 */
dchar decode(wchar[] s, inout size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    body
    {
        char[] msg;
        size_t i = idx;
        uint u = s[i];

        if (u < 0x80)
        {
            // ASCII: nothing to check
            i++;
        }
        else if (u >= 0xD800 && u <= 0xDBFF)
        {
            // High surrogate: a low surrogate has to follow.
            if (i + 1 == s.length)
            {
                msg = "surrogate UTF-16 high value past end of string";
                goto Lerr;
            }

            uint u2 = s[i + 1];

            if (u2 < 0xDC00 || u2 > 0xDFFF)
            {
                msg = "surrogate UTF-16 low value out of range";
                goto Lerr;
            }
            // 0xD7C0 == 0xD800 - (0x10000 >> 10), so this also adds the
            // 0x10000 offset of the supplementary planes.
            u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
            i += 2;
        }
        else if (u >= 0xDC00 && u <= 0xDFFF)
        {
            msg = "unpaired surrogate UTF-16 value";
            goto Lerr;
        }
        else if (u == 0xFFFE || u == 0xFFFF)
        {
            msg = "illegal UTF-16 value";
            goto Lerr;
        }
        else
        {
            i++;
        }

        idx = i;
        return cast(dchar)u;

      Lerr:
        onUnicodeError(msg, i);
        return cast(dchar)u;    // dummy return
    }
381 | |
382 /********************************************************/ | |
383 | |
/**
 * Fetches s[idx] and checks it with isValidDchar.
 *
 * On success idx is advanced by one and the value is returned. Otherwise
 * onUnicodeError("invalid UTF-32 value", idx) is called and idx is left
 * unchanged.
 */
dchar decode(dchar[] s, inout size_t idx)
    in
    {
        assert(idx >= 0 && idx < s.length);
    }
    body
    {
        size_t i = idx;
        dchar c = s[i];

        if (isValidDchar(c))
        {
            idx = i + 1;
            return c;
        }

        onUnicodeError("invalid UTF-32 value", i);
        return c;   // dummy return
    }
403 | |
404 | |
405 /* =================== Encode ======================= */ | |
406 | |
/**
 * Appends the UTF-8 encoding of c (1 to 4 bytes) to s.
 *
 * Params:
 *  s = array to append to; updated in place
 *  c = character to encode; must satisfy isValidDchar
 */
void encode(inout char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        char[4] buf;
        uint used;

        if (c <= 0x7F)
        {
            // 0xxxxxxx
            buf[0] = cast(char) c;
            used = 1;
        }
        else if (c <= 0x7FF)
        {
            // 110xxxxx 10xxxxxx
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            used = 2;
        }
        else if (c <= 0xFFFF)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            used = 3;
        }
        else if (c <= 0x10FFFF)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            used = 4;
        }
        else
        {
            assert(0);
        }

        s ~= buf[0 .. used];
    }
454 | |
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd";

    // One-byte encoding.
    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // Two-byte encoding.
    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // Three-byte encoding.
    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
}
473 | |
474 /********************************************************/ | |
475 | |
/**
 * Appends the UTF-16 encoding of c to s: a single unit when c fits in
 * 16 bits, otherwise a high/low surrogate pair.
 *
 * Params:
 *  s = array to append to; updated in place
 *  c = character to encode; must satisfy isValidDchar
 */
void encode(inout wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c <= 0xFFFF)
        {
            s ~= cast(wchar) c;
        }
        else
        {
            // Split the 20-bit offset above 0x10000 into two 10-bit halves.
            uint offset = cast(uint) c - 0x10000;
            wchar[2] pair;

            pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
            pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
            s ~= pair;
        }
    }
499 | |
/**
 * Appends c to s unchanged.
 *
 * Params:
 *  s = array to append to; updated in place
 *  c = character to append; must satisfy isValidDchar
 */
void encode(inout dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        s ~= c;
    }
509 | |
510 /* =================== Validation ======================= */ | |
511 | |
/// Runs decode over the whole of s, one sequence after another; the
/// decoded values are discarded. Any problem is reported by decode.
void validate(char[] s)
{
    size_t pos = 0;

    while (pos < s.length)
        decode(s, pos);     // advances pos
}
522 | |
/// Runs decode over the whole of s, one unit or surrogate pair after
/// another; the decoded values are discarded. Any problem is reported
/// by decode.
void validate(wchar[] s)
{
    size_t pos = 0;

    while (pos < s.length)
        decode(s, pos);     // advances pos
}
533 | |
/// Runs decode over every element of s; the decoded values are
/// discarded. Any problem is reported by decode.
void validate(dchar[] s)
{
    size_t pos = 0;

    while (pos < s.length)
        decode(s, pos);     // advances pos
}
544 | |
545 /* =================== Conversion to UTF8 ======================= */ | |
546 | |
/**
 * Encodes c as UTF-8 into the caller-supplied buffer.
 *
 * Params:
 *  buf = scratch space that receives the 1 to 4 encoded bytes
 *  c   = character to encode; must satisfy isValidDchar
 *
 * Returns: the slice buf[0 .. n] covering the n bytes that were written.
 */
char[] toUTF8(char[4] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        uint used;

        if (c <= 0x7F)
        {
            // 0xxxxxxx
            buf[0] = cast(char) c;
            used = 1;
        }
        else if (c <= 0x7FF)
        {
            // 110xxxxx 10xxxxxx
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            used = 2;
        }
        else if (c <= 0xFFFF)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            used = 3;
        }
        else if (c <= 0x10FFFF)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            used = 4;
        }
        else
        {
            assert(0);
        }

        return buf[0 .. used];
    }
582 | |
/// Identity conversion: returns s itself, not a copy. The in contract
/// runs validate over s.
char[] toUTF8(char[] s)
    in
    {
        validate(s);
    }
    body
    {
        return s;
    }
592 | |
/**
 * Converts UTF-16 text to a newly built UTF-8 array.
 *
 * Leading units that are <= 0x7F are copied straight across. From the
 * first unit above 0x7F onwards, the remainder of s is walked as dchar
 * values and each one is appended with encode.
 */
char[] toUTF8(wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;     // fast path for ascii
        else
        {
            // Keep only what has been copied so far, then append the rest.
            r.length = i;

            // The loop variable is named d, as in the dchar[] overload,
            // so that it does not shadow the wchar c declared above.
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return r;
}
618 | |
/**
 * Converts UTF-32 text to a newly built UTF-8 array.
 *
 * Leading elements that are <= 0x7F are copied straight across. From
 * the first element above 0x7F onwards, every remaining element is
 * appended with encode.
 */
char[] toUTF8(dchar[] s)
{
    size_t total = s.length;
    char[] result;

    result.length = total;

    // Fast path: copy the leading run of ASCII.
    size_t k = 0;
    while (k < total && s[k] <= 0x7F)
    {
        result[k] = cast(char) s[k];
        ++k;
    }

    if (k < total)
    {
        // Keep only what has been copied so far, then append the rest.
        result.length = k;
        foreach (dchar d; s[k .. total])
        {
            encode(result, d);
        }
    }
    return result;
}
644 | |
645 /* =================== Conversion to UTF16 ======================= */ | |
646 | |
/**
 * Encodes c as UTF-16 into the caller-supplied buffer.
 *
 * Params:
 *  buf = scratch space that receives one or two units
 *  c   = character to encode; must satisfy isValidDchar
 *
 * Returns: the slice buf[0 .. n] covering the n units that were written.
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    body
    {
        if (c <= 0xFFFF)
        {
            buf[0] = cast(wchar) c;
            return buf[0 .. 1];
        }

        // Split the 20-bit offset above 0x10000 into two 10-bit halves.
        uint offset = cast(uint) c - 0x10000;

        buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }
666 | |
/**
 * Converts UTF-8 text to a newly built UTF-16 array.
 *
 * Bytes <= 0x7F are appended directly; every other sequence goes
 * through decode and encode.
 */
wchar[] toUTF16(char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Grow, then reset to empty before appending.
    // NOTE(review): presumably meant to reserve capacity up front - confirm.
    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        char lead = s[pos];

        if (lead <= 0x7F)
        {
            result ~= cast(wchar) lead;
            ++pos;
        }
        else
        {
            dchar c = decode(s, pos);   // advances pos
            encode(result, c);
        }
    }
    return result;
}
690 | |
/**
 * Converts UTF-8 text to UTF-16, appends a terminating zero unit and
 * returns a pointer to the first unit of the result.
 *
 * Bytes <= 0x7F are appended directly; every other sequence goes
 * through decode and encode.
 */
wchar* toUTF16z(char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Grow (one extra for the terminator), then reset to empty.
    // NOTE(review): presumably meant to reserve capacity up front - confirm.
    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        char lead = s[pos];

        if (lead <= 0x7F)
        {
            result ~= cast(wchar) lead;
            ++pos;
        }
        else
        {
            dchar c = decode(s, pos);   // advances pos
            encode(result, c);
        }
    }

    result ~= "\000";
    return result.ptr;
}
715 | |
/// Identity conversion: returns s itself, not a copy. The in contract
/// runs validate over s.
wchar[] toUTF16(wchar[] s)
    in
    {
        validate(s);
    }
    body
    {
        return s;
    }
725 | |
/// Converts UTF-32 text to a newly built UTF-16 array by appending
/// every element of s with encode.
wchar[] toUTF16(dchar[] s)
{
    wchar[] result;

    // Grow, then reset to empty before appending.
    // NOTE(review): presumably meant to reserve capacity up front - confirm.
    result.length = s.length;
    result.length = 0;

    foreach (dchar d; s)
    {
        encode(result, d);
    }
    return result;
}
739 | |
740 /* =================== Conversion to UTF32 ======================= */ | |
741 | |
/**
 * Converts UTF-8 text to UTF-32.
 *
 * The result is a slice of a buffer allocated with s.length elements,
 * trimmed to the number of characters actually produced.
 */
dchar[] toUTF32(char[] s)
{
    size_t total = s.length;
    dchar[] result;

    result.length = total;      // result will never be longer than s

    size_t count = 0;
    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            ++pos;              // ascii, no need for decode
        else
            c = decode(s, pos); // advances pos

        result[count] = c;
        ++count;
    }
    return result[0 .. count];
}
760 | |
/**
 * Converts UTF-16 text to UTF-32.
 *
 * The result is a slice of a buffer allocated with s.length elements,
 * trimmed to the number of characters actually produced.
 */
dchar[] toUTF32(wchar[] s)
{
    size_t total = s.length;
    dchar[] result;

    result.length = total;      // result will never be longer than s

    size_t count = 0;
    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            ++pos;              // ascii, no need for decode
        else
            c = decode(s, pos); // advances pos

        result[count] = c;
        ++count;
    }
    return result[0 .. count];
}
779 | |
/// Identity conversion: returns s itself, not a copy. The in contract
/// runs validate over s.
dchar[] toUTF32(dchar[] s)
    in
    {
        validate(s);
    }
    body
    {
        return s;
    }
789 | |
790 /* ================================ tests ================================== */ | |
791 | |
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    char[]  u8;
    wchar[] u16;
    dchar[] u32;

    // ---- ASCII only ----
    u8 = "hello";
    u16 = toUTF16(u8);
    assert(u16 == "hello");
    u32 = toUTF32(u8);
    assert(u32 == "hello");

    u8 = toUTF8(u16);
    assert(u8 == "hello");
    u32 = toUTF32(u16);
    assert(u32 == "hello");

    u8 = toUTF8(u32);
    assert(u8 == "hello");
    u16 = toUTF16(u32);
    assert(u16 == "hello");

    // ---- a character that needs three UTF-8 bytes ----
    u8 = "hel\u1234o";
    u16 = toUTF16(u8);
    assert(u16 == "hel\u1234o");
    u32 = toUTF32(u8);
    assert(u32 == "hel\u1234o");

    u8 = toUTF8(u16);
    assert(u8 == "hel\u1234o");
    u32 = toUTF32(u16);
    assert(u32 == "hel\u1234o");

    u8 = toUTF8(u32);
    assert(u8 == "hel\u1234o");
    u16 = toUTF16(u32);
    assert(u16 == "hel\u1234o");

    // ---- a character above 0xFFFF (surrogate pair in UTF-16) ----
    u8 = "he\U0010AAAAllo";
    u16 = toUTF16(u8);
    //foreach (wchar x; u16) printf("c = x%x\n", x);
    //foreach (wchar x; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", x);
    assert(u16 == "he\U0010AAAAllo");
    u32 = toUTF32(u8);
    assert(u32 == "he\U0010AAAAllo");

    u8 = toUTF8(u16);
    assert(u8 == "he\U0010AAAAllo");
    u32 = toUTF32(u16);
    assert(u32 == "he\U0010AAAAllo");

    u8 = toUTF8(u32);
    assert(u8 == "he\U0010AAAAllo");
    u16 = toUTF16(u32);
    assert(u16 == "he\U0010AAAAllo");
}