comparison trunk/src/util/utf.d @ 629:d050e211402b

Moved files in src/std/ to src/util/.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Fri, 11 Jan 2008 20:03:46 +0100
parents trunk/src/std/utf.d@33b566df6af4
children
comparison
equal deleted inserted replaced
628:08681b93c3b3 629:d050e211402b
1 // utf.d
2
3 /*
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
5 * Written by Walter Bright
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * o The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * o Altered source versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
21 * o This notice may not be removed or altered from any source
22 * distribution.
23 */
24
25 /********************************************
26 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
27 *
28 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
29 * wchar type.
30 * For linux systems, the C wchar_t type is UTF-32 and corresponds to
31 * the D utf.dchar type.
32 *
33 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
34 *
35 * See_Also:
36 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
37 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
38 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
39 * Macros:
40 * WIKI = Phobos/StdUtf
41 */
42
43 /*
44 Note: this is not the original file!
45 Modified by Aziz Köksal:
46 Only commented out deprecated class UtfError.
47 */
48
49 module util.utf;
50
51 // private import std.stdio;
52
53 //debug=utf; // uncomment to turn on debugging printf's
54 /+
55 deprecated class UtfError : Error
56 {
57 size_t idx; // index in string of where error occurred
58
59 this(char[] s, size_t i)
60 {
61 idx = i;
62 super(s);
63 }
64 }
65 +/
/**********************************
 * Exception type thrown by the functions in this module when they detect
 * an error in UTF data.
 */

class UtfException : Exception
{
    /// Index into the string at which the error was detected.
    size_t idx;

    /**
     * Params:
     *  s = error message, forwarded to the Exception base class
     *  i = index into the offending string, stored in idx
     */
    this(char[] s, size_t i)
    {
        super(s);
        idx = i;
    }
}
80
/*******************************
 * Tests whether c is a valid UTF-32 character.
 *
 * Valid values are everything up to \U0010FFFF except the surrogate range
 * \uD800 .. \uDFFF. \uFFFE and \uFFFF are accepted: the Unicode standard
 * permits them for application-internal use although they are not allowed
 * for interchange (thanks to Arcane Jill).
 *
 * Returns: true if c is valid, false if not.
 */

bool isValidDchar(dchar c)
{
    // Anything beyond the last Unicode code point is rejected outright.
    if (c > 0x10FFFF)
        return false;

    // Within range, only the surrogate block is excluded.
    // (0xFFFE and 0xFFFF are deliberately not excluded here.)
    return c < 0xD800 || c > 0xDFFF;
}
102
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ASCII letter is accepted ...
    assert(isValidDchar(cast(dchar)'a'));
    // ... a value above \U0010FFFF is rejected.
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
109
110
/* Lookup table indexed by a byte value. The entry is the length in bytes of
 * the UTF-8 sequence introduced by that byte, or 0xFF if the byte cannot
 * start a sequence.
 */
ubyte[256] UTF8stride =
[
    // 0x00 .. 0x7F: 0xxxxxxx, one byte
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    // 0x80 .. 0xBF: 10xxxxxx, continuation bytes, never a sequence start
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,

    // 0xC0 .. 0xDF: 110xxxxx, two bytes
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    // 0xE0 .. 0xEF: 1110xxxx, three bytes
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC, 0xFD: six;
    // 0xFE, 0xFF: not a sequence start
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
130
/**
 * stride() returns the length of the UTF-8 sequence starting at index i
 * in string s.
 * Returns:
 *  The number of bytes in the UTF-8 sequence, or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */

uint stride(char[] s, size_t i)
{
    // The length is determined by the first byte alone; look it up.
    char first = s[i];
    return UTF8stride[first];
}
143
/**
 * stride() returns the length of the UTF-16 sequence starting at index i
 * in string s.
 * Returns:
 *  2 if s[i] lies in the range 0xD800 .. 0xDBFF (first half of a surrogate
 *  pair), otherwise 1.
 */

uint stride(wchar[] s, size_t i)
{
    wchar unit = s[i];
    bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;

    return startsPair ? 2 : 1;
}
153
/**
 * stride() returns the length of the UTF-32 sequence starting at index i
 * in string s.
 * Returns: Always 1; neither s nor i is examined.
 */

uint stride(dchar[] s, size_t i)
{
    return 1;
}
164
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Throws: UtfException if a byte that cannot start a UTF-8 sequence is met
 * while walking s, or if i falls inside a sequence rather than at its start.
 */

size_t toUCSindex(char[] s, size_t i)
{
    size_t count = 0;   // characters seen so far
    size_t pos = 0;     // byte offset of the next sequence

    while (pos < i)
    {
        size_t step = UTF8stride[s[pos]];
        if (step == 0xFF)
            throw new UtfException("1invalid UTF-8 sequence", pos);
        pos += step;
        count++;
    }

    // Stepping past i means i was not on a sequence boundary.
    if (pos > i)
        throw new UtfException("1invalid UTF-8 sequence", pos);

    return count;
}
191
192 /** ditto */
193
size_t toUCSindex(wchar[] s, size_t i)
{
    size_t count = 0;   // characters seen so far
    size_t pos = 0;     // offset of the next UTF-16 sequence

    while (pos < i)
    {
        wchar unit = s[pos];

        // A unit in 0xD800 .. 0xDBFF opens a surrogate pair: two units.
        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
        count++;
    }

    // Stepping past i means i pointed at the second half of a pair.
    if (pos > i)
        throw new UtfException("2invalid UTF-16 sequence", pos);

    return count;
}
212
213 /** ditto */
214
size_t toUCSindex(dchar[] s, size_t i)
{
    // Each UTF-32 element is one character, so the index is returned as is.
    return i;
}
219
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * Throws: UtfException if a byte that cannot start a UTF-8 sequence is met
 * while skipping the first n characters.
 */

size_t toUTFindex(char[] s, size_t n)
{
    size_t pos = 0;

    // Skip n characters, advancing by the length of each sequence.
    for (size_t k = 0; k < n; k++)
    {
        uint step = UTF8stride[s[pos]];
        if (step == 0xFF)
            throw new UtfException("3invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
237
238 /** ditto */
239
size_t toUTFindex(wchar[] s, size_t n)
{
    size_t pos = 0;

    // Skip n characters; a unit in 0xD800 .. 0xDBFF counts as two units.
    for (size_t k = 0; k < n; k++)
    {
        wchar unit = s[pos];
        bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;

        pos += startsPair ? 2 : 1;
    }
    return pos;
}
251
252 /** ditto */
253
size_t toUTFindex(dchar[] s, size_t n)
{
    // Each UTF-32 element is one character, so the index is returned as is.
    return n;
}
258
259 /* =================== Decode ======================= */
260
/***************
 * Decodes and returns the character starting at s[idx]. idx is advanced past
 * the decoded character. If the character is not well formed, a UtfException
 * is thrown and idx remains unchanged.
 *
 * Only the 1- to 4-byte encodings are accepted:
 * ---
 *  0xxxxxxx
 *  110xxxxx 10xxxxxx
 *  1110xxxx 10xxxxxx 10xxxxxx
 *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * ---
 * Lead bytes announcing a 5- or 6-byte sequence, stray continuation bytes,
 * truncated sequences, overlong forms and results rejected by isValidDchar()
 * all raise the exception.
 *
 * Params:
 *  s   = UTF-8 string to read from
 *  idx = index of the first byte of the sequence; must be less than s.length
 * Throws: UtfException carrying the starting index of the bad sequence.
 */

dchar decode(char[] s, inout size_t idx)
in
{
    // idx is unsigned, so only the upper bound needs checking.
    assert(idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t i = idx;
    char u = s[i];
    dchar V;
    uint n;         // total number of bytes in the sequence

    if ((u & 0x80) == 0)
    {
        // 0xxxxxxx: plain ASCII
        idx = i + 1;
        return cast(dchar) u;
    }

    // Classify the lead byte. Continuation bytes (10xxxxxx) and lead bytes
    // of 5/6-byte forms (11111xxx) match none of the patterns.
    if ((u & 0xE0) == 0xC0)
        n = 2;
    else if ((u & 0xF0) == 0xE0)
        n = 3;
    else if ((u & 0xF8) == 0xF0)
        n = 4;
    else
        goto Lerr;

    if (i + (n - 1) >= s.length)
        goto Lerr;                      // off end of string

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     */
    if ((u & 0xFE) == 0xC0 ||
        (u == 0xE0 && (s[i + 1] & 0xE0) == 0x80) ||
        (u == 0xF0 && (s[i + 1] & 0xF0) == 0x80))
        goto Lerr;

    // Pick off the (7 - n) payload bits of the lead byte ...
    V = cast(dchar)(u & ((1 << (7 - n)) - 1));

    // ... then append six bits from each trailing byte.
    for (uint j = 1; j != n; j++)
    {
        char t = s[i + j];
        if ((t & 0xC0) != 0x80)
            goto Lerr;                  // trailing bytes are 10xxxxxx
        V = (V << 6) | (t & 0x3F);
    }
    if (!isValidDchar(V))
        goto Lerr;

    idx = i + n;
    return V;

  Lerr:
    // i still holds the starting index; idx has not been modified.
    throw new UtfException("4invalid UTF-8 sequence", i);
}
353
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Single-byte characters: idx advances by one per call.
    static char[] s1 = "abcd";
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two-byte sequence.
    static char[] s2 = "\xC2\xA9";
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three-byte sequence.
    static char[] s3 = "\xE2\x89\xA0";
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Inputs that decode() must reject.
    static char[][] s4 =
    [   "\xE2\x89",                     // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (char[] bad; s4)
    {
        try
        {
            i = 0;
            c = decode(bad, i);
            assert(0);                  // decode() should have thrown
        }
        catch (UtfException u)
        {
            i = 23;                     // marks that the handler ran
            delete u;
        }
        assert(i == 23);
    }
}
406
407 /** ditto */
408
dchar decode(wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // Every failure below reports the index of the first unit, and idx is
    // only written on success.
    size_t i = idx;
    uint u = s[i];

    if (u >= 0xD800 && u <= 0xDBFF)
    {
        // First half of a surrogate pair: a second half must follow.
        if (i + 1 == s.length)
            throw new UtfException("surrogate UTF-16 high value past end of string", i);

        uint u2 = s[i + 1];
        if (u2 < 0xDC00 || u2 > 0xDFFF)
            throw new UtfException("surrogate UTF-16 low value out of range", i);

        // Combine both halves into one code point.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        idx = i + 2;
        return cast(dchar) u;
    }

    // A second-half surrogate without a preceding first half.
    if (u >= 0xDC00 && u <= 0xDFFF)
        throw new UtfException("unpaired surrogate UTF-16 value", i);

    if (u == 0xFFFE || u == 0xFFFF)
        throw new UtfException("illegal UTF-16 value", i);

    // Any other value is a complete character in a single unit.
    idx = i + 1;
    return cast(dchar) u;
}
463
464 /** ditto */
465
dchar decode(dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    // A UTF-32 element is already a code point; it only needs validating.
    dchar c = s[idx];

    if (!isValidDchar(c))
        throw new UtfException("5invalid UTF-32 value", idx);

    // idx is advanced only after the value has been accepted.
    idx++;
    return c;
}
484
485
486 /* =================== Encode ======================= */
487
/*******************************
 * Encodes character c as UTF-8 and appends the resulting bytes to array s[].
 *
 * Params:
 *  s = array to append to; updated in place
 *  c = character to encode; must satisfy isValidDchar()
 */

void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // Work on a local slice and store it back once at the end.
    char[] result = s;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        result ~= cast(char) c;
        s = result;
        return;
    }

    char[4] bytes;
    uint count;

    if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        bytes[0] = cast(char)(0xC0 | (c >> 6));
        bytes[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        bytes[0] = cast(char)(0xE0 | (c >> 12));
        bytes[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        bytes[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes[0] = cast(char)(0xF0 | (c >> 18));
        bytes[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        bytes[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        bytes[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        // Excluded by the in-contract.
        assert(0);
    }

    result ~= bytes[0 .. count];
    s = result;
}
539
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd";

    // One-byte character.
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // Two-byte character.
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    // Three-byte character.
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}
558
559 /** ditto */
560
void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // Work on a local slice and store it back once at the end.
    wchar[] result = s;

    if (c > 0xFFFF)
    {
        // Does not fit in one unit: emit a surrogate pair carrying
        // (c - 0x10000), upper ten bits first.
        uint offset = c - 0x10000;
        wchar[2] pair;

        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        result ~= pair;
    }
    else
    {
        result ~= cast(wchar) c;
    }
    s = result;
}
584
585 /** ditto */
586
void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // No transformation is applied for UTF-32; c is appended unchanged.
    s ~= c;
}
596
597 /* =================== Validation ======================= */
598
/***********************************
 * Checks whether string s is well formed by decoding it from start to end.
 * Throws a UtfException (raised by decode()) if it is not. Use to check all
 * untrusted input for correctness.
 */

void validate(char[] s)
{
    size_t pos = 0;

    // decode() either advances pos past one character or throws.
    while (pos < s.length)
        decode(s, pos);
}
614
615 /** ditto */
616
void validate(wchar[] s)
{
    size_t pos = 0;

    // decode() either advances pos past one character or throws.
    while (pos < s.length)
        decode(s, pos);
}
627
628 /** ditto */
629
void validate(dchar[] s)
{
    size_t pos = 0;

    // decode() either advances pos past one character or throws.
    while (pos < s.length)
        decode(s, pos);
}
640
641 /* =================== Conversion to UTF8 ======================= */
642
/*******************
 * Encodes character c as UTF-8 into buf and returns the slice of buf that
 * holds the encoded bytes (1 to 4 of them).
 *
 * Params:
 *  buf = storage for the encoded bytes
 *  c   = character to encode; must satisfy isValidDchar()
 *
 * NOTE(review): the result is a slice of buf, so its lifetime is tied to
 * however the compiler passes fixed-size array parameters - confirm.
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0x7F)
    {
        // 0xxxxxxx
        buf[0] = cast(char) c;
        return buf[0 .. 1];
    }

    if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return buf[0 .. 2];
    }

    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return buf[0 .. 3];
    }

    if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        return buf[0 .. 4];
    }

    // Excluded by the in-contract.
    assert(0);
}
678
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * For input that is already UTF-8, s itself is returned (no copy is made);
 * the in-contract runs validate() on it.
 */

char[] toUTF8(char[] s)
in
{
    validate(s);
}
body
{
    return s;
}
692
693 /** ditto */
694
char[] toUTF8(wchar[] s)
{
    size_t slen = s.length;
    char[] r;
    size_t i = 0;

    r.length = slen;

    // Fast path: copy the leading run of ASCII units one-to-one.
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        i++;
    }

    if (i < slen)
    {
        // First non-ASCII unit found: keep what was copied so far and
        // encode the remainder character by character.
        r.length = i;
        foreach (dchar d; s[i .. slen])
        {
            encode(r, d);
        }
    }
    return r;
}
720
721 /** ditto */
722
char[] toUTF8(dchar[] s)
{
    size_t slen = s.length;
    char[] r;
    size_t i = 0;

    r.length = slen;

    // Fast path: copy the leading run of ASCII characters one-to-one.
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        i++;
    }

    if (i < slen)
    {
        // First non-ASCII character found: keep what was copied so far and
        // encode the remainder character by character.
        r.length = i;
        foreach (dchar d; s[i .. slen])
        {
            encode(r, d);
        }
    }
    return r;
}
748
749 /* =================== Conversion to UTF16 ======================= */
750
/*******************
 * Encodes character c as UTF-16 into buf and returns the slice of buf that
 * holds the encoded units (1 or 2 of them).
 *
 * Params:
 *  buf = storage for the encoded units
 *  c   = character to encode; must satisfy isValidDchar()
 *
 * NOTE(review): the result is a slice of buf, so its lifetime is tied to
 * however the compiler passes fixed-size array parameters - confirm.
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Surrogate pair carrying (c - 0x10000), upper ten bits first.
        uint offset = c - 0x10000;

        buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
770
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that
 * take an LPWSTR or LPCWSTR argument.
 *
 * Throws: UtfException (from decode()) if s contains an invalid sequence.
 */

wchar[] toUTF16(char[] s)
{
    size_t slen = s.length;
    wchar[] r;

    // Size the result to slen, then reset the length before appending.
    r.length = slen;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c > 0x7F)
        {
            // Multi-byte sequence: decode() advances i itself.
            c = decode(s, i);
            encode(r, c);
        }
        else
        {
            // ASCII maps to a single unit directly.
            i++;
            r ~= cast(wchar)c;
        }
    }
    return r;
}
800
801 /** ditto */
802
wchar* toUTF16z(char[] s)
{
    size_t slen = s.length;
    wchar[] r;

    // Size the result to slen plus the terminator, then reset the length
    // before appending.
    r.length = slen + 1;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c > 0x7F)
        {
            // Multi-byte sequence: decode() advances i itself.
            c = decode(s, i);
            encode(r, c);
        }
        else
        {
            // ASCII maps to a single unit directly.
            i++;
            r ~= cast(wchar)c;
        }
    }

    // Append the terminating zero unit and hand back the raw pointer.
    r ~= "\000";
    return r.ptr;
}
827
828 /** ditto */
829
wchar[] toUTF16(wchar[] s)
in
{
    validate(s);
}
body
{
    // Already UTF-16: s itself is returned, no copy is made.
    return s;
}
839
840 /** ditto */
841
wchar[] toUTF16(dchar[] s)
{
    wchar[] r;

    // Size the result to s.length, then reset the length before appending.
    r.length = s.length;
    r.length = 0;

    // Each character becomes one unit or a surrogate pair.
    foreach (dchar c; s)
    {
        encode(r, c);
    }
    return r;
}
855
856 /* =================== Conversion to UTF32 ======================= */
857
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * Throws: UtfException (from decode()) if s contains an invalid sequence.
 */

dchar[] toUTF32(char[] s)
{
    size_t slen = s.length;
    dchar[] r;
    size_t count = 0;       // number of characters written to r

    r.length = slen;        // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // decode() advances i itself

        r[count] = c;
        count++;
    }
    return r[0 .. count];
}
880
881 /** ditto */
882
dchar[] toUTF32(wchar[] s)
{
    size_t slen = s.length;
    dchar[] r;
    size_t count = 0;       // number of characters written to r

    r.length = slen;        // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // decode() advances i itself

        r[count] = c;
        count++;
    }
    return r[0 .. count];
}
901
902 /** ditto */
903
dchar[] toUTF32(dchar[] s)
in
{
    validate(s);
}
body
{
    // Already UTF-32: s itself is returned, no copy is made.
    return s;
}
913
914 /* ================================ tests ================================== */
915
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    char[] c;
    wchar[] w;
    dchar[] d;

    // For each sample, convert from every encoding to the other two and
    // compare against the same literal.

    // --- ASCII only ---
    c = "hello";
    w = toUTF16(c);
    assert(w == "hello");
    d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");

    // --- a character that needs three bytes in UTF-8 ---
    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");

    // --- a character above \uFFFF (surrogate pair in UTF-16) ---
    c = "he\U0010AAAAllo";
    w = toUTF16(c);
    //foreach (wchar c; w) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c);
    assert(w == "he\U0010AAAAllo");
    d = toUTF32(c);
    assert(d == "he\U0010AAAAllo");

    c = toUTF8(w);
    assert(c == "he\U0010AAAAllo");
    d = toUTF32(w);
    assert(d == "he\U0010AAAAllo");

    c = toUTF8(d);
    assert(c == "he\U0010AAAAllo");
    w = toUTF16(d);
    assert(w == "he\U0010AAAAllo");
}