comparison druntime/src/compiler/ldc/util/utf.d @ 1458:e0b2d67cfe7c

Added druntime (this should be removed once it works).
author Robert Clipsham <robert@octarineparrot.com>
date Tue, 02 Jun 2009 17:43:06 +0100
parents
children
comparison
equal deleted inserted replaced
1456:7b218ec1044f 1458:e0b2d67cfe7c
1 /********************************************
2 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
3 *
4 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
5 * wchar type.
6 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
7 * the D utf.dchar type.
8 *
9 * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
10 *
11 * See_Also:
12 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
13 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
14 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
15 * Macros:
16 * WIKI = Phobos/StdUtf
17 *
18 * Copyright: Copyright Digital Mars 2003 - 2009.
19 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
20 * Authors: Walter Bright, Sean Kelly
21 *
22 * Copyright Digital Mars 2003 - 2009.
23 * Distributed under the Boost Software License, Version 1.0.
24 * (See accompanying file LICENSE_1_0.txt or copy at
25 * http://www.boost.org/LICENSE_1_0.txt)
26 */
27 module rt.util.utf;
28
29
30 extern (C) void onUnicodeError( string msg, size_t idx );
31
/*******************************
 * Reports whether c is a code point this module accepts as a UTF-32
 * character.
 *
 * Accepted values are everything below the surrogate block (c < 0xD800)
 * and everything from 0xE000 up to and including 0x10FFFF.
 *
 * \uFFFE and \uFFFF are considered valid by this function: they are
 * permitted by the Unicode standard for application internal use, but are
 * not allowed for interchange. (thanks to Arcane Jill)
 *
 * Returns: true if c is valid, false if not.
 */
bool isValidDchar(dchar c)
{
    // Below the surrogate block: always acceptable.
    if (c < 0xD800)
        return true;

    // Otherwise c must lie above the surrogates and inside the Unicode
    // range.  FFFE and FFFF are deliberately not excluded here.
    return c > 0xDFFF && c <= 0x10FFFF;
}
53
// Sanity checks for isValidDchar: one accepted and one rejected value.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    dchar ascii    = cast(dchar)'a';
    dchar tooLarge = cast(dchar)0x1FFFFF;   // above 0x10FFFF

    assert(isValidDchar(ascii));
    assert(!isValidDchar(tooLarge));
}
60
61
62
// Lookup table indexed by the first byte of a UTF-8 sequence.  Each entry is
// the sequence length in bytes, or 0xFF when the byte cannot start a
// sequence.  The cast on the first element is kept from the original
// literal.
immutable UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: single-byte (ASCII) sequences
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: continuation bytes, never the start of a sequence
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte sequences
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte sequences
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six;
    // 0xFE and 0xFF: not a start byte
    4,4,4,4,4,4,4,4, 5,5,5,5, 6,6, 0xFF,0xFF,
];
83
/**
 * stride() returns the length of the UTF-8 sequence starting at index i
 * in string s, as recorded in the UTF8stride table.
 * Returns:
 *  The number of bytes in the UTF-8 sequence, or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    char lead = s[i];
    uint len = UTF8stride[lead];
    return len;
}
95
/**
 * stride() returns the length of the UTF-16 sequence starting at index i
 * in string s.
 * Returns:
 *  2 when s[i] is in the range 0xD800 .. 0xDBFF (the first unit of a
 *  surrogate pair), otherwise 1.
 */
uint stride(in wchar[] s, size_t i)
{
    wchar unit = s[i];
    bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
104
105 /**
106 * stride() returns the length of a UTF-32 sequence starting at index i
107 * in string s.
108 * Returns: The return value will always be 1.
109 */
110 uint stride(in dchar[] s, size_t i)
111 {
112 return 1;
113 }
114
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Errors are reported through onUnicodeError with the message
 * "invalid UTF-8 sequence" when a byte that cannot start a sequence is
 * met (stride reports 0xFF), or when walking whole sequences steps past i,
 * i.e. i was not on a sequence boundary.
 *
 * Params:
 *  s = UTF-8 text
 *  i = code-unit index into s
 * Returns: the number of characters that precede index i.
 */
size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;   // characters counted so far
    size_t j;   // current code-unit position

    while (j < i)
    {
        uint len = stride(s, j);

        // 0xFF is stride's "not a start byte" marker, not a length.  Adding
        // it to j would skip 255 bytes and could index past the array.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);

        j += len;
        n++;
    }
    if (j > i)
    {
        // The last sequence straddles i.
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}
137
/**
 * UTF-16 overload: counts the characters that precede code-unit index i
 * in s, assuming i is at the start of a character.
 *
 * If stepping by whole sequences overshoots i, onUnicodeError is called
 * with "invalid UTF-16 sequence" and the overshot position.
 */
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
155
/**
 * UTF-32 overload: every code unit is one character, so the character
 * index equals the code-unit index.  s is not inspected.
 */
size_t toUCSindex(in dchar[] s, size_t i)
{
    size_t characterIndex = i;
    return characterIndex;
}
161
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * Walks n sequences from the start of s using the UTF8stride table.  When
 * a byte that cannot start a sequence is found (table value 0xFF),
 * onUnicodeError is called with "invalid UTF-8 sequence" and that byte's
 * index.
 */
size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; remaining--)
    {
        uint len = UTF8stride[s[pos]];
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += len;
    }
    return pos;
}
179
/**
 * UTF-16 overload: returns the code-unit index of the n-th character of s.
 * A unit in 0xD800 .. 0xDBFF is taken as the start of a two-unit sequence;
 * every other unit counts as a one-unit sequence.
 */
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; remaining--)
    {
        wchar unit = s[pos];
        if (unit >= 0xD800 && unit <= 0xDBFF)
            pos += 2;
        else
            pos += 1;
    }
    return pos;
}
192
/**
 * UTF-32 overload: the code-unit index equals the character index.
 * s is not inspected.
 */
size_t toUTFindex(in dchar[] s, size_t n)
{
    size_t unitIndex = n;
    return unitIndex;
}
198
199 /* =================== Decode ======================= */
200
/***************
 * Decodes and returns the character starting at s[idx]. On success idx is
 * advanced past the decoded character.
 *
 * If the sequence is not well formed, onUnicodeError is called with
 * "invalid UTF-8 sequence" and the index of the sequence's first byte, and
 * idx is not written on that path.  Rejected input: a stray continuation
 * byte, the 5- and 6-byte forms, a sequence running off the end of s, an
 * overlong encoding, a trailing byte not of the form 10xxxxxx, and any
 * result for which isValidDchar is false.
 *
 * Params:
 *  s   = UTF-8 text
 *  idx = index of the first byte to decode; must be less than s.length
 * Returns: the decoded character.
 */
dchar decode(in char[] s, ref size_t idx)
in
{
    // idx is unsigned, so only the upper bound needs checking.
    assert(idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t len = s.length;
    dchar V;
    size_t i = idx;
    char u = s[i];

    if (u & 0x80)
    {
        uint n;
        char u2;

        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the first byte; that count is the
        // sequence length n.
        for (n = 1; ; n++)
        {
            if (n > 4)
                goto Lerr;          // only do the first 4 of 6 encodings
            if (((u << n) & 0x80) == 0)
            {
                if (n == 1)
                    goto Lerr;      // 10xxxxxx: continuation byte at start
                break;
            }
        }

        // Pick off (7 - n) significant bits of B from first byte of octet
        V = cast(dchar)(u & ((1 << (7 - n)) - 1));

        if (i + (n - 1) >= len)
            goto Lerr;              // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
         *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
         */
        u2 = s[i + 1];              // in range: n >= 2 and the check above
        if ((u & 0xFE) == 0xC0 ||
            (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
            (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
            (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
            (u == 0xFC && (u2 & 0xFC) == 0x80))
            goto Lerr;              // overlong combination

        for (uint j = 1; j != n; j++)
        {
            u = s[i + j];
            if ((u & 0xC0) != 0x80)
                goto Lerr;          // trailing bytes are 10xxxxxx
            V = (V << 6) | (u & 0x3F);
        }
        if (!isValidDchar(V))
            goto Lerr;
        i += n;
    }
    else
    {
        // Plain ASCII byte.
        V = cast(dchar) u;
        i++;
    }

    idx = i;
    return V;

  Lerr:
    // i still equals the caller's idx here: it is advanced only on success.
    onUnicodeError("invalid UTF-8 sequence", i);
    return V;   // dummy return
}
292
// Tests for decode(in char[], ...): well-formed 1-, 2- and 3-byte
// sequences, then a list of malformed inputs that must all be rejected.
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [   "\xE2\x89"c[],              // too short
        "\xC0\x8A",                 // overlong 2-byte form
        "\xE0\x80\x8A",             // overlong 3-byte form
        "\xF0\x80\x80\x8A",         // overlong 4-byte form
        "\xF8\x80\x80\x80\x8A",     // 5-byte form
        "\xFC\x80\x80\x80\x80\x8A", // 6-byte form
    ];

    for (size_t j = 0; j < s4.length; j++)
    {
        // The check lives outside the try block: an assert placed inside
        // it would be caught by the catch-all handler and never fail.
        bool rejected = false;
        i = 0;
        try
        {
            c = decode(s4[j], i);
        }
        catch (Object o)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);     // idx is left untouched on failure
    }
}
344
/**
 * UTF-16 overload: decodes and returns the character starting at s[idx].
 * On success idx is advanced by one unit, or by two for a surrogate pair.
 *
 * Malformed input is reported through onUnicodeError with a message naming
 * the problem and the index of the first unit; idx is not written on that
 * path.  Rejected input: a high surrogate that is the last unit of s, a
 * high surrogate not followed by a unit in 0xDC00 .. 0xDFFF, a low
 * surrogate on its own, and the values 0xFFFE and 0xFFFF.
 *
 * Params:
 *  s   = UTF-16 text
 *  idx = index of the first unit to decode; must be less than s.length
 * Returns: the decoded character.
 */
dchar decode(in wchar[] s, ref size_t idx)
in
{
    // idx is unsigned, so only the upper bound needs checking.
    assert(idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    string msg;
    size_t i = idx;
    uint u = s[i];

    if (u & ~0x7F)
    {
        if (u >= 0xD800 && u <= 0xDBFF)
        {
            // High surrogate: a low surrogate must follow.
            uint u2;

            if (i + 1 == s.length)
            {
                msg = "surrogate UTF-16 high value past end of string";
                goto Lerr;
            }
            u2 = s[i + 1];
            if (u2 < 0xDC00 || u2 > 0xDFFF)
            {
                msg = "surrogate UTF-16 low value out of range";
                goto Lerr;
            }
            // 0xD7C0 == 0xD800 - (0x10000 >> 10): removes the surrogate
            // base and adds the 0x10000 offset in one step.
            u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
            i += 2;
        }
        else if (u >= 0xDC00 && u <= 0xDFFF)
        {
            msg = "unpaired surrogate UTF-16 value";
            goto Lerr;
        }
        else if (u == 0xFFFE || u == 0xFFFF)
        {
            msg = "illegal UTF-16 value";
            goto Lerr;
        }
        else
            i++;
    }
    else
    {
        // ASCII unit.
        i++;
    }

    idx = i;
    return cast(dchar)u;

  Lerr:
    // i still equals the caller's idx here.
    onUnicodeError(msg, i);
    return cast(dchar)u;    // dummy return
}
402
/**
 * UTF-32 overload: returns s[idx] and advances idx by one.
 *
 * If the value fails isValidDchar, onUnicodeError is called with
 * "invalid UTF-32 value" and the index; idx is not written on that path.
 *
 * Params:
 *  s   = UTF-32 text
 *  idx = index of the unit to decode; must be less than s.length
 * Returns: the character at s[idx].
 */
dchar decode(in dchar[] s, ref size_t idx)
in
{
    // idx is unsigned, so only the upper bound needs checking.
    assert(idx < s.length);
}
body
{
    size_t i = idx;
    dchar c = s[i];

    if (!isValidDchar(c))
    {
        onUnicodeError("invalid UTF-32 value", i);
        return c;   // dummy return
    }

    idx = i + 1;
    return c;
}
424
425
426 /* =================== Encode ======================= */
427
/*******************************
 * Encodes character c as UTF-8 and appends it to array s[].
 *
 * Params:
 *  s = array to append to; updated in place through the reference
 *  c = character to encode; must satisfy isValidDchar
 */
void encode(ref char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[] r = s;

    if (c <= 0x7F)
    {
        // One byte: 0xxxxxxx
        r ~= cast(char) c;
    }
    else
    {
        char[4] buf;
        uint L;     // number of bytes written to buf

        if (c <= 0x7FF)
        {
            // Two bytes: 110xxxxx 10xxxxxx
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            L = 2;
        }
        else if (c <= 0xFFFF)
        {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            L = 3;
        }
        else if (c <= 0x10FFFF)
        {
            // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            L = 4;
        }
        else
        {
            // Excluded by the precondition.
            assert(0);
        }
        r ~= buf[0 .. L];
    }
    s = r;
}
478
// Tests for encode(char[], ...): appends a 1-, 2- and 3-byte character in
// turn and checks the accumulated bytes after each step.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd".dup;

    // One-byte character.
    encode(text, cast(dchar)'a');
    assert(text == "abcda");
    assert(text.length == 5);

    // Two-byte character.
    encode(text, cast(dchar)'\u00A9');
    assert(text == "abcda\xC2\xA9");
    assert(text.length == 7);
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // Three-byte character.
    encode(text, cast(dchar)'\u2260');
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    assert(text.length == 10);
}
497
/**
 * Encodes character c as UTF-16 and appends it to array s[].
 * Values up to 0xFFFF are appended as a single unit; larger values are
 * appended as a high/low surrogate pair.
 *
 * Params:
 *  s = array to append to; updated in place through the reference
 *  c = character to encode; must satisfy isValidDchar
 */
void encode(ref wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    wchar[] r = s;

    if (c <= 0xFFFF)
    {
        r ~= cast(wchar) c;
    }
    else
    {
        wchar[2] buf;

        // Upper 10 bits of (c - 0x10000) go in the high surrogate,
        // lower 10 bits in the low surrogate.
        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        r ~= buf;
    }
    s = r;
}
523
/**
 * Appends character c to the UTF-32 array s[]; no transformation is needed.
 *
 * Params:
 *  s = array to append to; updated in place through the reference
 *  c = character to append; must satisfy isValidDchar
 */
void encode(ref dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    s ~= c;
}
534
/**
Returns the number of code units needed to encode $(D c) when $(D C) is
the code unit type. The result is a count of code units, not bytes.

The encoding is chosen from $(D C.sizeof): 1 selects UTF-8, 2 selects
UTF-16 and 4 selects UTF-32; any other size fails to compile. For UTF-8
a value above 0x10FFFF trips $(D assert(false)).
 */
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        if (c <= 0xFFFF)
            return 1;
        return 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
563
564 /* =================== Validation ======================= */
565
/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
of $(D char), $(D wchar), or $(D dchar). The string is walked from start
to end with $(D decode), so any malformed sequence is reported the way
$(D decode) reports it. Use to check all untrusted input for correctness.
 */
void validate(S)(in S s)
{
    auto total = s.length;
    size_t pos = 0;

    // decode advances pos past each well-formed character.
    while (pos < total)
        decode(s, pos);
}
579
580 /* =================== Conversion to UTF8 ======================= */
581
/*******************
 * Encodes character c as UTF-8 into buf and returns the slice of buf that
 * holds the encoded bytes (1 to 4 of them).
 *
 * NOTE(review): buf is declared as a by-value char[4]. If the compiler in
 * use passes static arrays by value, the returned slice refers to this
 * function's own copy - confirm against the targeted compiler version.
 *
 * Params:
 *  buf = scratch storage for the encoded bytes
 *  c   = character to encode; must satisfy isValidDchar
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;    // number of bytes written to buf

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        // Excluded by the precondition.
        assert(0);
    }
    return buf[0 .. used];
}
617
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * The input is already UTF-8, so it is returned as is. The precondition
 * runs validate on it.
 */
string toUTF8(string s)
in
{
    validate(s);
}
body
{
    string result = s;
    return result;
}
630
/**
 * Encodes the UTF-16 string s into UTF-8 and returns the encoded string.
 *
 * Leading units up to 0x7F are copied directly. From the first larger
 * unit onward, the remainder of s is iterated as dchar and each character
 * is appended with encode.
 */
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;     // fast path for ascii
        else
        {
            // Drop the unused tail, then encode everything from i on.
            // The loop variable is named d, as in the dchar[] overload,
            // so that it does not shadow the enclosing c.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}
657
/**
 * Encodes the UTF-32 string s into UTF-8 and returns the encoded string.
 *
 * Leading characters up to 0x7F are copied directly; from the first
 * larger character onward each character is appended with encode.
 */
string toUTF8(in dchar[] s)
{
    size_t total = s.length;
    char[] result;
    result.length = total;

    // Copy the ASCII prefix unit for unit.
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char) s[pos];
        pos++;
    }

    // Anything left starts with a non-ASCII character: cut the buffer back
    // to the prefix and encode the rest.
    if (pos < total)
    {
        result.length = pos;
        foreach (dchar d; s[pos .. total])
            encode(result, d);
    }
    return cast(string) result;
}
684
685 /* =================== Conversion to UTF16 ======================= */
686
/****************
 * Encodes character c as UTF-16 into buf and returns the slice of buf that
 * holds the result: one unit for values up to 0xFFFF, otherwise a
 * surrogate pair.
 *
 * NOTE(review): buf is declared as a by-value wchar[2]. If the compiler in
 * use passes static arrays by value, the returned slice refers to this
 * function's own copy - confirm against the targeted compiler version.
 *
 * Params:
 *  buf = scratch storage for the encoded units
 *  c   = character to encode; must satisfy isValidDchar
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Split (c - 0x10000) into two 10-bit halves.
        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
706
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API
 * that take an LPWSTR or LPCWSTR argument.
 *
 * Bytes up to 0x7F are appended directly; any other byte starts a
 * sequence that goes through decode and encode.
 */
wstring toUTF16(in char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Length is set and then reset to zero, exactly as before
    // (presumably to pre-size the buffer - confirm).
    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        dchar ch = s[pos];
        if (ch > 0x7F)
        {
            ch = decode(s, pos);    // advances pos
            encode(result, ch);
            continue;
        }
        pos++;
        result ~= cast(wchar) ch;
    }
    return cast(wstring) result;
}
735
alias const(wchar)* wptr;

/**
 * Encodes the UTF-8 string s into UTF-16, appends a terminating zero unit,
 * and returns a pointer to the first unit of the result.
 */
wptr toUTF16z(in char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Length is set (with room for the terminator) and then reset to zero,
    // exactly as before (presumably to pre-size the buffer - confirm).
    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        dchar ch = s[pos];
        if (ch > 0x7F)
        {
            ch = decode(s, pos);    // advances pos
            encode(result, ch);
            continue;
        }
        pos++;
        result ~= cast(wchar) ch;
    }

    result ~= "\000";
    return result.ptr;
}
762
/**
 * UTF-16 input is already in the target encoding, so s is returned as is.
 * The precondition runs validate on it.
 */
wstring toUTF16(wstring s)
in
{
    validate(s);
}
body
{
    wstring result = s;
    return result;
}
773
/**
 * Encodes the UTF-32 string s into UTF-16 by appending each character
 * with encode, and returns the encoded string.
 */
wstring toUTF16(in dchar[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Length is set and then reset to zero, exactly as before
    // (presumably to pre-size the buffer - confirm).
    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        encode(result, s[pos]);
        pos++;
    }
    return cast(wstring) result;
}
788
789 /* =================== Conversion to UTF32 ======================= */
790
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * Bytes below 0x80 are copied directly; any other byte starts a sequence
 * that is decoded with decode.
 */
dstring toUTF32(in char[] s)
{
    size_t total = s.length;
    dchar[] result;
    result.length = total;      // result will never be longer than s[]

    size_t produced = 0;        // characters written to result
    size_t pos = 0;             // read position in s
    while (pos < total)
    {
        dchar ch = s[pos];
        if (ch < 0x80)
            pos++;                  // ascii, no need for decode
        else
            ch = decode(s, pos);    // advances pos
        result[produced] = ch;
        produced++;
    }
    return cast(dstring) result[0 .. produced];
}
812
/**
 * Encodes the UTF-16 string s into UTF-32 and returns the encoded string.
 *
 * Units below 0x80 are copied directly; any other unit starts a sequence
 * that is decoded with decode.
 */
dstring toUTF32(in wchar[] s)
{
    size_t total = s.length;
    dchar[] result;
    result.length = total;      // result will never be longer than s[]

    size_t produced = 0;        // characters written to result
    size_t pos = 0;             // read position in s
    while (pos < total)
    {
        dchar ch = s[pos];
        if (ch < 0x80)
            pos++;                  // ascii, no need for decode
        else
            ch = decode(s, pos);    // advances pos
        result[produced] = ch;
        produced++;
    }
    return cast(dstring) result[0 .. produced];
}
832
/**
 * UTF-32 input is already in the target encoding, so s is returned as is.
 * The precondition runs validate on it.
 */
dstring toUTF32(dstring s)
in
{
    validate(s);
}
body
{
    dstring result = s;
    return result;
}
843
844 /* ================================ tests ================================== */
845
// Round-trip tests for toUTF8 / toUTF16 / toUTF32 on three texts: pure
// ASCII, one containing a BMP character, and one containing a character
// outside the BMP.
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Runs the same six conversions, in the same order, for one text given
    // in all three encodings.
    void roundTrip(string text8, wstring text16, dstring text32)
    {
        auto c = text8[];

        // From UTF-8.
        auto w = toUTF16(c);
        assert(w == text16);
        auto d = toUTF32(c);
        assert(d == text32);

        // From UTF-16.
        c = toUTF8(w);
        assert(c == text8);
        d = toUTF32(w);
        assert(d == text32);

        // From UTF-32.
        c = toUTF8(d);
        assert(c == text8);
        w = toUTF16(d);
        assert(w == text16);
    }

    roundTrip("hello", "hello", "hello");
    roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");
    roundTrip("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}