comparison druntime/src/compiler/dmd/util/utf.d @ 759:d3eb054172f9

Added copy of druntime from DMD 2.020 modified for LDC.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Tue, 11 Nov 2008 01:52:37 +0100
parents
children
comparison
equal deleted inserted replaced
758:f04dde6e882c 759:d3eb054172f9
1 // Written in the D programming language
2
3 /*
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
5 * Written by Walter Bright
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * o The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * o Altered source versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
21 * o This notice may not be removed or altered from any source
22 * distribution.
23 */
24
25 /********************************************
26 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
27 *
28 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
29 * wchar type.
30 * For linux systems, the C wchar_t type is UTF-32 and corresponds to
31 * the D utf.dchar type.
32 *
33 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
34 *
35 * See_Also:
36 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
37 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
38 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
39 * Macros:
40 * WIKI = Phobos/StdUtf
41 */
42
43 module rt.util.utf;
44
45
46 extern (C) void onUnicodeError( string msg, size_t idx );
47
/*******************************
 * Reports whether c is a valid UTF-32 character.
 *
 * Values in the surrogate range 0xD800 .. 0xDFFF and values above
 * 0x10FFFF are rejected. \uFFFE and \uFFFF are accepted: they are
 * permitted for internal use by an application, although they are not
 * allowed for interchange by the Unicode standard.
 *
 * Returns: true if c is valid, false if not.
 */

bool isValidDchar(dchar c)
{
    // Everything below the surrogate block is valid.
    if (c < 0xD800)
        return true;

    // Above the surrogate block, up to the last Unicode code point.
    // FFFE and FFFF are deliberately not excluded (thanks to Arcane Jill).
    return c > 0xDFFF && c <= 0x10FFFF;
}
69
// Checks isValidDchar on ordinary input and on every boundary of the
// ranges it accepts (surrogate block edges and the 0x10FFFF limit).
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);

    // Edges of the surrogate block.
    assert(isValidDchar(cast(dchar)0xD7FF) == true);
    assert(isValidDchar(cast(dchar)0xD800) == false);
    assert(isValidDchar(cast(dchar)0xDFFF) == false);
    assert(isValidDchar(cast(dchar)0xE000) == true);

    // FFFE / FFFF are accepted for application-internal use.
    assert(isValidDchar(cast(dchar)0xFFFE) == true);
    assert(isValidDchar(cast(dchar)0xFFFF) == true);

    // Upper limit of the accepted range.
    assert(isValidDchar(cast(dchar)0x10FFFF) == true);
    assert(isValidDchar(cast(dchar)0x110000) == false);
}
76
77
78
// Lookup table indexed by the first byte of a UTF-8 sequence. Each entry
// is the sequence length in bytes, or 0xFF when the byte cannot start a
// sequence.
auto UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: single-byte sequences
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x80 .. 0xBF: not the start of a sequence
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    // 0xC0 .. 0xDF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0 .. 0xEF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0xF0 .. 0xFF: four-, five- and six-byte forms, then two invalid bytes
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0xFF, 0xFF,
];
99
/**
 * stride() returns the length of the UTF-8 sequence that starts at
 * index i in string s.
 * Returns:
 *  The number of bytes in the UTF-8 sequence, or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    // The table is indexed by the lead byte.
    char lead = s[i];
    return UTF8stride[lead];
}
111
/**
 * stride() returns the length of the UTF-16 sequence that starts at
 * index i in string s: 2 when s[i] lies in 0xD800 .. 0xDBFF, otherwise 1.
 */
uint stride(in wchar[] s, size_t i)
{
    uint unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;
    return 1;
}
120
/**
 * stride() returns the length of the UTF-32 sequence that starts at
 * index i in string s.
 * Returns: Always 1; neither s nor i is inspected.
 */
uint stride(in dchar[] s, size_t i)
{
    return 1u;
}
130
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * onUnicodeError is called when a byte that cannot start a UTF-8
 * sequence is met while walking the string, or when the walk steps
 * past i (i.e. i was not at the start of a sequence).
 */

size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;
    size_t j;

    while (j < i)
    {
        uint len = stride(s, j);

        // 0xFF is stride()'s "not a lead byte" marker, not a length;
        // adding it to j would silently skip 255 bytes.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);
        j += len;
        n++;
    }
    if (j > i)
    {
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}
153
/** ditto */
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t count;       // characters seen so far
    size_t pos;         // current code-unit index

    while (pos < i)
    {
        pos += stride(s, pos);
        count++;
    }

    // Stepping past i means i was inside a surrogate pair.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);
    return count;
}
171
/** ditto */
size_t toUCSindex(in dchar[] s, size_t i)
{
    // For UTF-32 the index is returned unchanged; s is not inspected.
    return i;
}
177
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * onUnicodeError is called when a byte that cannot start a UTF-8
 * sequence is found at the position of an expected lead byte.
 */

size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos;

    for (; n != 0; n--)
    {
        uint len = UTF8stride[s[pos]];

        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += len;
    }
    return pos;
}
195
/** ditto */
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t pos;

    for (; n != 0; n--)
    {
        wchar unit = s[pos];

        // A unit in 0xD800 .. 0xDBFF advances by two, anything else by one.
        if (unit >= 0xD800 && unit <= 0xDBFF)
            pos += 2;
        else
            pos += 1;
    }
    return pos;
}
208
/** ditto */
size_t toUTFindex(in dchar[] s, size_t n)
{
    // For UTF-32 the index is returned unchanged; s is not inspected.
    return n;
}
214
215 /* =================== Decode ======================= */
216
/***************
 * Decodes and returns the character starting at s[idx]. On success idx is
 * advanced past the decoded character. If the sequence is not well formed,
 * onUnicodeError is called with the index of the offending sequence and
 * idx is left unchanged.
 */
dchar decode(in char[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t total = s.length;
    size_t pos = idx;
    char lead = s[pos];
    dchar V;
    uint n;
    char second;

    // Single-byte (ASCII) sequence.
    if ((lead & 0x80) == 0)
    {
        idx = pos + 1;
        return cast(dchar) lead;
    }

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */
    // Count the leading 1 bits of the first byte; that count is the
    // sequence length n.
    n = 1;
    while (n <= 4 && ((lead << n) & 0x80) != 0)
        n++;

    // n == 1: a bare continuation byte; n > 4: a 5 or 6 byte form.
    if (n == 1 || n > 4)
        goto Lerr;

    // Pick off the (7 - n) significant bits from the first byte.
    V = cast(dchar)(lead & ((1 << (7 - n)) - 1));

    if (pos + (n - 1) >= total)
        goto Lerr;                      // off end of string

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     */
    second = s[pos + 1];
    if ((lead & 0xFE) == 0xC0 ||
        (lead == 0xE0 && (second & 0xE0) == 0x80) ||
        (lead == 0xF0 && (second & 0xF0) == 0x80) ||
        (lead == 0xF8 && (second & 0xF8) == 0x80) ||
        (lead == 0xFC && (second & 0xFC) == 0x80))
        goto Lerr;                      // overlong combination

    // Fold in six payload bits from each trailing byte.
    for (uint k = 1; k < n; k++)
    {
        char cont = s[pos + k];

        if ((cont & 0xC0) != 0x80)
            goto Lerr;                  // trailing bytes are 10xxxxxx
        V = (V << 6) | (cont & 0x3F);
    }
    if (!isValidDchar(V))
        goto Lerr;

    idx = pos + n;
    return V;

  Lerr:
    onUnicodeError("invalid UTF-8 sequence", pos);
    return V;   // dummy return
}
308
// Exercises decode(char[]) on well-formed 1, 2 and 3 byte sequences and on
// a set of truncated / overlong / 5 and 6 byte sequences that must be
// rejected.
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [   "\xE2\x89"c,            // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (int j = 0; j < s4.length; j++)
    {
        // The check lives outside the try block: an assert placed inside
        // it would be swallowed by the catch-all handler below and the
        // test could never fail.
        // NOTE(review): assumes onUnicodeError reports by throwing.
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(s4[j], i);
        }
        catch (Object o)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);         // idx is only written on success
    }
}
360
/** ditto */

dchar decode(in wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    string msg;
    size_t pos = idx;
    uint u = s[pos];
    uint low;

    if (u < 0x80)
    {
        // ASCII fast path
        pos++;
    }
    else if (u >= 0xD800 && u <= 0xDBFF)
    {
        // High surrogate: a low surrogate must follow.
        if (pos + 1 == s.length)
        {
            msg = "surrogate UTF-16 high value past end of string";
            goto Lerr;
        }
        low = s[pos + 1];
        if (low < 0xDC00 || low > 0xDFFF)
        {
            msg = "surrogate UTF-16 low value out of range";
            goto Lerr;
        }
        // Combine the pair; 0xD7C0 == 0xD800 - (0x10000 >> 10).
        u = ((u - 0xD7C0) << 10) + (low - 0xDC00);
        pos += 2;
    }
    else if (u >= 0xDC00 && u <= 0xDFFF)
    {
        msg = "unpaired surrogate UTF-16 value";
        goto Lerr;
    }
    else if (u == 0xFFFE || u == 0xFFFF)
    {
        msg = "illegal UTF-16 value";
        goto Lerr;
    }
    else
    {
        pos++;
    }

    idx = pos;
    return cast(dchar)u;

  Lerr:
    // pos still equals the incoming idx on every error path.
    onUnicodeError(msg, pos);
    return cast(dchar)u;    // dummy return
}
418
/** ditto */

dchar decode(in dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    size_t pos = idx;
    dchar c = s[pos];

    if (isValidDchar(c))
    {
        idx = pos + 1;
        return c;
    }

    // Invalid value: report it and leave idx untouched.
    onUnicodeError("invalid UTF-32 value", pos);
    return c;   // dummy return
}
440
441
442 /* =================== Encode ======================= */
443
/*******************************
 * Encodes character c as UTF-8 and appends the result to array s[].
 */
void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[4] buf;
    size_t count;

    // Pick the shortest form that can hold c.
    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        count = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        assert(0);
    }

    s ~= buf[0 .. count];
}
494
// Appends a 1, 2 and 3 byte character in turn and checks the grown buffer.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd".dup;

    // one byte
    encode(text, 'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // two bytes
    encode(text, '\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // three bytes
    encode(text, '\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
}
513
/** ditto */

void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // Values up to 0xFFFF fit in a single UTF-16 code unit.
    if (c <= 0xFFFF)
    {
        s ~= cast(wchar) c;
        return;
    }

    // Otherwise split the 20-bit offset into a surrogate pair.
    uint offset = c - 0x10000;
    wchar[2] pair;

    pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
    pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
    s ~= pair;
}
539
/** ditto */
void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // UTF-32 stores the character as-is.
    s ~= c;
}
550
/**
Returns the number of code units of type $(D C) needed to encode
$(D c). The length is a count of $(D C) elements, not of bytes.
 */

ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        // UTF-8
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: one unit, or two for values above 0xFFFF
        if (c <= 0xFFFF)
            return 1;
        return 2;
    }
    else
    {
        // UTF-32
        static assert(C.sizeof == 4);
        return 1;
    }
}
579
580 /* =================== Validation ======================= */
581
/***********************************
Checks whether a string is well formed by decoding it from start to end.
$(D S) can be an array of $(D char), $(D wchar), or $(D dchar). Malformed
input is reported by $(D decode). Use to check all untrusted input for
correctness.
 */
void validate(S)(in S s)
{
    size_t pos = 0;
    auto total = s.length;

    // decode() advances pos past each character it accepts.
    while (pos < total)
        decode(s, pos);
}
595
596 /* =================== Conversion to UTF8 ======================= */
597
// Encodes c as UTF-8 into buf and returns the slice of buf that was
// written (1 to 4 bytes).
// NOTE(review): the result aliases buf; whether the caller's array is
// written depends on how char[4] parameters are passed - confirm.
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t count;

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        count = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        assert(0);
    }
    return buf[0 .. count];
}
633
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 */
string toUTF8(string s)
in
{
    // Already UTF-8: only checked for well-formedness.
    validate(s);
}
body
{
    // No conversion needed; the input itself is returned.
    return s;
}
646
/** ditto */
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    // Start with one byte per code unit, which is exact for pure ASCII.
    r.length = slen;

    for (i = 0; i < slen; i++)
    {   wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // First non-ASCII unit: keep the ASCII prefix copied so far
            // and append the rest character by character. The loop
            // variable is named d so it does not shadow c above.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}
673
/** ditto */
string toUTF8(in dchar[] s)
{
    size_t slen = s.length;
    char[] r;
    size_t i = 0;

    // Start with one byte per character, which is exact for pure ASCII.
    r.length = slen;

    // Copy the leading ASCII run directly.
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char)s[i];
        i++;
    }

    // Anything left starts with a non-ASCII character: trim to the
    // copied prefix and append the remainder via encode().
    if (i < slen)
    {
        r.length = i;
        foreach (dchar d; s[i .. slen])
        {
            encode(r, d);
        }
    }
    return cast(string)r;
}
700
701 /* =================== Conversion to UTF16 ======================= */
702
// Encodes c as UTF-16 into buf and returns the slice of buf that was
// written (one unit, or a surrogate pair for values above 0xFFFF).
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }

    // Split the 20-bit offset into a surrogate pair.
    uint offset = c - 0x10000;

    buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
    buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
    return buf[0 .. 2];
}
722
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 */
wstring toUTF16(in char[] s)
{
    wchar[] r;
    size_t slen = s.length;
    size_t i = 0;

    // Grow then truncate before appending.
    // NOTE(review): presumably this pre-allocates storage - confirm.
    r.length = slen;
    r.length = 0;

    while (i < slen)
    {
        char b = s[i];

        if (b <= 0x7F)
        {
            // ASCII converts one-to-one.
            i++;
            r ~= cast(wchar)b;
        }
        else
        {
            // decode() advances i past the whole sequence.
            encode(r, decode(s, i));
        }
    }
    return cast(wstring)r;
}
751
alias const(wchar)* wptr;
/** ditto */
wptr toUTF16z(in char[] s)
{
    wchar[] r;
    size_t slen = s.length;
    size_t i = 0;

    // Grow (with room for the terminator) then truncate before appending.
    // NOTE(review): presumably this pre-allocates storage - confirm.
    r.length = slen + 1;
    r.length = 0;

    while (i < slen)
    {
        char b = s[i];

        if (b <= 0x7F)
        {
            // ASCII converts one-to-one.
            i++;
            r ~= cast(wchar)b;
        }
        else
        {
            // decode() advances i past the whole sequence.
            encode(r, decode(s, i));
        }
    }

    // Zero terminator.
    r ~= "\000";
    return r.ptr;
}
778
/** ditto */
wstring toUTF16(wstring s)
in
{
    // Already UTF-16: only checked for well-formedness.
    validate(s);
}
body
{
    // No conversion needed; the input itself is returned.
    return s;
}
789
/** ditto */
wstring toUTF16(in dchar[] s)
{
    wchar[] r;

    // Grow then truncate before appending.
    // NOTE(review): presumably this pre-allocates storage - confirm.
    r.length = s.length;
    r.length = 0;

    foreach (dchar c; s)
    {
        encode(r, c);
    }
    return cast(wstring)r;
}
804
805 /* =================== Conversion to UTF32 ======================= */
806
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 */
dstring toUTF32(in char[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t count = 0;       // characters written to r
    size_t i = 0;           // read position in s

    r.length = slen;        // r[] will never be longer than s[]
    while (i < slen)
    {
        dchar c;

        if (s[i] < 0x80)
        {
            c = s[i];       // ascii, no need for decode
            i++;
        }
        else
        {
            c = decode(s, i);
        }
        r[count++] = c;
    }
    return cast(dstring)r[0 .. count];
}
828
/** ditto */
dstring toUTF32(in wchar[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t count = 0;       // characters written to r
    size_t i = 0;           // read position in s

    r.length = slen;        // r[] will never be longer than s[]
    while (i < slen)
    {
        dchar c;

        if (s[i] < 0x80)
        {
            c = s[i];       // ascii, no need for decode
            i++;
        }
        else
        {
            c = decode(s, i);
        }
        r[count++] = c;
    }
    return cast(dstring)r[0 .. count];
}
848
/** ditto */
dstring toUTF32(dstring s)
in
{
    // Already UTF-32: only checked for well-formedness.
    validate(s);
}
body
{
    // No conversion needed; the input itself is returned.
    return s;
}
859
860 /* ================================ tests ================================== */
861
// Round-trips three sample strings (ASCII only, one BMP character, one
// character above 0xFFFF) through every toUTF8 / toUTF16 / toUTF32
// conversion.
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Runs the same conversion sequence for one sample, given in all
    // three encodings.
    void roundTrip(string s8, wstring s16, dstring s32)
    {
        // from UTF-8
        auto c = s8;
        auto w = toUTF16(c);
        assert(w == s16);
        auto d = toUTF32(c);
        assert(d == s32);

        // from UTF-16
        c = toUTF8(w);
        assert(c == s8);
        d = toUTF32(w);
        assert(d == s32);

        // from UTF-32
        c = toUTF8(d);
        assert(c == s8);
        w = toUTF16(d);
        assert(w == s16);
    }

    roundTrip("hello", "hello", "hello");
    roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");
    roundTrip("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}