comparison tango/lib/compiler/llvmdc/util/utf.d @ 132:1700239cab2e trunk

[svn r136] MAJOR UNSTABLE UPDATE!!! Initial commit after moving to Tango instead of Phobos. Lots of bugfixes... This build is not suitable for most things.
author lindquist
date Fri, 11 Jan 2008 17:57:40 +0100
parents
children
comparison
equal deleted inserted replaced
131:5825d48b27d1 132:1700239cab2e
1 // utf.d
2
3 /*
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
5 * Written by Walter Bright
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * o The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * o Altered source versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
21 * o This notice may not be removed or altered from any source
22 * distribution.
23 */
24
25 // Description of UTF-8 at:
26 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
27 // http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
28
29
30 module util.utf;
31
32
33 extern (C) void onUnicodeError( char[] msg, size_t idx );
34
35
/* Tells whether c is a legal Unicode code point: any value up to
 * 0x10FFFF that does not lie in the surrogate range 0xD800 .. 0xDFFF.
 *
 * 0xFFFE and 0xFFFF are deliberately accepted: the Unicode standard
 * permits them for application internal use, although they are not
 * allowed for interchange.  (thanks to Arcane Jill)
 */
bool isValidDchar(dchar c)
{
    if (c > 0x10FFFF)
        return false;                   // beyond the last code point
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;                   // surrogate range
    return true;
}
47
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ordinary ASCII letter is a valid code point ...
    assert(isValidDchar(cast(dchar)'a'));
    // ... whereas a value above 0x10FFFF is not.
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
54
55
56 /* This array gives the length of a UTF-8 sequence indexed by the value
57 * of the leading byte. An FF represents an illegal starting value of
58 * a UTF-8 sequence.
59 * FF is used instead of 0 to avoid having loops hang.
60 */
61
ubyte[256] UTF8stride =
[
    // 0x00 .. 0x7F: single-byte sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    // 0x80 .. 0xBF: illegal as the first byte of a sequence
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,

    // 0xC0 .. 0xDF: two-byte sequences
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    // 0xE0 .. 0xEF: three-byte sequences
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC, 0xFD: six;
    // 0xFE, 0xFF: illegal
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
81
/* Length of the UTF-8 sequence whose first byte is s[i], as listed in
 * UTF8stride.  The result is 0xFF when s[i] cannot start a sequence.
 */
uint stride(char[] s, size_t i)
{
    char lead = s[i];

    return UTF8stride[lead];
}
86
/* Number of UTF-16 code units in the character starting at s[i]:
 * 2 when s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
uint stride(wchar[] s, size_t i)
{
    wchar unit = s[i];
    bool isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;

    return isHighSurrogate ? 2 : 1;
}
91
/* UTF-32 counterpart of the other stride overloads: the answer is
 * always 1, and neither s nor i is inspected.
 */
uint stride(dchar[] s, size_t i)
{
    return 1;
}
96
97 /*******************************************
98 * Given an index into an array of char's,
99 * and assuming that index is at the start of a UTF character,
100 * determine the number of UCS characters up to that index.
101 */
102
/* UTF-8 version: counts the characters that precede code unit index i by
 * hopping from lead byte to lead byte using UTF8stride.
 *
 * onUnicodeError is called with the current position when a byte that
 * cannot start a sequence is met, or when the hops overshoot i (i.e. i
 * was not at the start of a character).
 */
size_t toUCSindex(char[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        size_t step = UTF8stride[s[pos]];

        if (step == 0xFF)
        {
            // Illegal lead byte: report it and stop counting.
            onUnicodeError("invalid UTF-8 sequence", pos);
            return count;
        }
        count++;
        pos += step;
    }

    if (pos > i)
        onUnicodeError("invalid UTF-8 sequence", pos);
    return count;
}
123
/* UTF-16 version: counts the characters that precede code unit index i,
 * stepping over two units whenever a high surrogate is met.
 *
 * onUnicodeError is called with the current position when the steps
 * overshoot i (i.e. i pointed into the middle of a surrogate pair).
 */
size_t toUCSindex(wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        uint unit = s[pos];

        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
        count++;
    }

    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);
    return count;
}
142
/* UTF-32 version: code unit index and character index coincide, so i is
 * returned unchanged and s is not inspected.
 */
size_t toUCSindex(dchar[] s, size_t i)
{
    return i;
}
147
148 /******************************************
149 * Given a UCS index into an array of characters, return the UTF index.
150 */
151
/* UTF-8 version: returns the code unit index at which character number n
 * starts, found by skipping n sequences from the beginning of s.
 *
 * onUnicodeError is called with the current position whenever a byte
 * that cannot start a sequence is met.
 */
size_t toUTFindex(char[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; remaining--)
    {
        uint step = UTF8stride[s[pos]];

        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
165
/* UTF-16 version: returns the code unit index at which character number n
 * starts.  A high surrogate (0xD800 .. 0xDBFF) advances the index by two
 * units, anything else by one; no validation is performed.
 */
size_t toUTFindex(wchar[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; remaining--)
    {
        wchar unit = s[pos];

        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
    }
    return pos;
}
177
/* UTF-32 version: character index and code unit index coincide, so n is
 * returned unchanged and s is not inspected.
 */
size_t toUTFindex(dchar[] s, size_t n)
{
    return n;
}
182
183 /* =================== Decode ======================= */
184
/* Decodes the UTF-8 sequence that starts at s[idx].
 *
 * On success the decoded code point is returned and idx is moved past
 * the sequence.  On malformed input onUnicodeError is called with the
 * index of the lead byte and idx is left untouched.
 *
 * Rejected as malformed: a continuation byte (10xxxxxx) in lead position,
 * the 5 and 6 byte forms, a sequence running off the end of s, overlong
 * encodings, trailing bytes that are not 10xxxxxx, and any result that
 * fails isValidDchar.
 */
dchar decode(char[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t total = s.length;
    size_t start = idx;
    char lead = s[start];
    char second;
    dchar value;
    uint n;

    // A byte with the top bit clear stands for itself.
    if ((lead & 0x80) == 0)
    {
        idx = start + 1;
        return cast(dchar) lead;
    }

    /* Sequence length from the lead byte:
     *  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4
     * 10xxxxxx (a stray continuation byte) and the 111110xx / 1111110x
     * forms (5 and 6 bytes) are not accepted.
     */
    if ((lead & 0xE0) == 0xC0)
        n = 2;
    else if ((lead & 0xF0) == 0xE0)
        n = 3;
    else if ((lead & 0xF8) == 0xF0)
        n = 4;
    else
        goto Lerr;

    // The lead byte contributes its low (7 - n) bits.
    value = cast(dchar)(lead & ((1 << (7 - n)) - 1));

    if (start + (n - 1) >= total)
        goto Lerr;                      // off end of string

    /* Overlong (and therefore illegal) combinations:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     * The 5 and 6 byte overlong forms need no test here because those
     * lead bytes were already rejected above.
     */
    second = s[start + 1];
    if ((lead & 0xFE) == 0xC0 ||
        (lead == 0xE0 && (second & 0xE0) == 0x80) ||
        (lead == 0xF0 && (second & 0xF0) == 0x80))
        goto Lerr;

    // Each trailing byte must be 10xxxxxx and supplies six more bits.
    for (uint k = 1; k != n; k++)
    {
        char trail = s[start + k];

        if ((trail & 0xC0) != 0x80)
            goto Lerr;
        value = (value << 6) | (trail & 0x3F);
    }

    if (!isValidDchar(value))
        goto Lerr;

    idx = start + n;
    return value;

  Lerr:
    onUnicodeError("invalid UTF-8 sequence", start);
    return value;                       // dummy return
}
271
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // One-byte sequences: idx advances by one per character.
    static char[] s1 = "abcd";
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two-byte sequence.
    static char[] s2 = "\xC2\xA9";
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three-byte sequence.
    static char[] s3 = "\xE2\x89\xA0";
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed inputs, each of which decode must refuse.
    static char[][] s4 =
    [   "\xE2\x89",                     // too short
        "\xC0\x8A",                     // overlong, 2 bytes
        "\xE0\x80\x8A",                 // overlong, 3 bytes
        "\xF0\x80\x80\x8A",             // overlong, 4 bytes
        "\xF8\x80\x80\x80\x8A",         // 5 byte form
        "\xFC\x80\x80\x80\x80\x8A",     // 6 byte form
    ];

    foreach (char[] bad; s4)
    {
        /* Record the rejection in a flag and assert on it outside the
         * try block.  Putting assert(0) inside the try would not work:
         * the catch (Object) below would swallow the assertion failure
         * too, and the test could never fail.
         */
        bool rejected = false;

        try
        {
            i = 0;
            c = decode(bad, i);
        }
        catch (Object o)
        {
            rejected = true;
        }
        assert(rejected);
    }
}
323
324 /********************************************************/
325
/* Decodes the UTF-16 character that starts at s[idx].
 *
 * On success the code point is returned and idx is moved past the one or
 * two code units consumed.  On malformed input onUnicodeError is called
 * with a descriptive message and the index of the first unit, and idx is
 * left untouched.
 *
 * Rejected as malformed: a high surrogate at the very end of s, a high
 * surrogate not followed by a low surrogate, a low surrogate on its own,
 * and the values 0xFFFE and 0xFFFF.
 */
dchar decode(wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t start = idx;
    uint unit = s[start];
    size_t consumed = 1;
    char[] problem;                     // stays empty unless an error is found

    if (unit >= 0xD800 && unit <= 0xDBFF)
    {
        // High surrogate: a low surrogate has to follow.
        if (start + 1 == s.length)
        {
            problem = "surrogate UTF-16 high value past end of string";
        }
        else
        {
            uint low = s[start + 1];

            if (low < 0xDC00 || low > 0xDFFF)
            {
                problem = "surrogate UTF-16 low value out of range";
            }
            else
            {
                // (unit - 0xD7C0) << 10 equals
                // ((unit - 0xD800) << 10) + 0x10000.
                unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
                consumed = 2;
            }
        }
    }
    else if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        problem = "unpaired surrogate UTF-16 value";
    }
    else if (unit == 0xFFFE || unit == 0xFFFF)
    {
        problem = "illegal UTF-16 value";
    }

    if (problem.length != 0)
    {
        onUnicodeError(problem, start);
        return cast(dchar)unit;         // dummy return
    }

    idx = start + consumed;
    return cast(dchar)unit;
}
381
382 /********************************************************/
383
/* "Decodes" the UTF-32 unit at s[idx]: the unit is returned as it is and
 * idx is advanced by one, provided isValidDchar accepts it.  Otherwise
 * onUnicodeError is called with the index and idx is left untouched.
 */
dchar decode(dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    size_t pos = idx;
    dchar c = s[pos];

    if (isValidDchar(c))
    {
        idx = pos + 1;
        return c;
    }

    onUnicodeError("invalid UTF-32 value", pos);
    return c;                           // dummy return
}
403
404
405 /* =================== Encode ======================= */
406
/* Appends the UTF-8 encoding of c (one to four bytes) to s.
 * c must satisfy isValidDchar; this is checked by the in contract.
 */
void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[4] bytes;
    size_t used;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        bytes[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        bytes[0] = cast(char)(0xC0 | (c >> 6));
        bytes[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        bytes[0] = cast(char)(0xE0 | (c >> 12));
        bytes[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        bytes[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes[0] = cast(char)(0xF0 | (c >> 18));
        bytes[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        bytes[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        bytes[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        assert(0);
    }

    s ~= bytes[0 .. used];
}
454
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd";

    // A one-byte character adds a single code unit.
    encode(text, cast(dchar)'a');
    assert(text == "abcda");
    assert(text.length == 5);

    // U+00A9 takes two code units.
    encode(text, cast(dchar)'\u00A9');
    assert(text == "abcda\xC2\xA9");
    assert(text.length == 7);
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // U+2260 takes three code units.
    encode(text, cast(dchar)'\u2260');
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    assert(text.length == 10);
}
473
474 /********************************************************/
475
/* Appends the UTF-16 encoding of c to s: a single code unit for values
 * up to 0xFFFF, a high/low surrogate pair for anything above.
 * c must satisfy isValidDchar; this is checked by the in contract.
 */
void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        s ~= cast(wchar) c;
        return;
    }

    // Split the 20 bits above 0x10000 into two 10-bit halves.
    uint offset = c - 0x10000;
    wchar[2] pair;

    pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
    pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
    s ~= pair;
}
499
/* Appends c to s unchanged.
 * c must satisfy isValidDchar; this is checked by the in contract.
 */
void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    s ~= c;
}
509
510 /* =================== Validation ======================= */
511
/* Checks that s is well-formed UTF-8 by decoding it from start to end.
 * Nothing is returned; decode reports any malformed sequence through
 * onUnicodeError.
 */
void validate(char[] s)
{
    size_t total = s.length;
    size_t pos = 0;

    while (pos < total)
        decode(s, pos);                 // decode advances pos
}
522
/* Checks that s is well-formed UTF-16 by decoding it from start to end.
 * Nothing is returned; decode reports any malformed sequence through
 * onUnicodeError.
 */
void validate(wchar[] s)
{
    size_t total = s.length;
    size_t pos = 0;

    while (pos < total)
        decode(s, pos);                 // decode advances pos
}
533
/* Checks that every element of s is a valid code point by running decode
 * over it from start to end.  Nothing is returned; decode reports any
 * invalid value through onUnicodeError.
 */
void validate(dchar[] s)
{
    size_t total = s.length;
    size_t pos = 0;

    while (pos < total)
        decode(s, pos);                 // decode advances pos
}
544
545 /* =================== Conversion to UTF8 ======================= */
546
/* Encodes the single code point c as UTF-8 into the caller's buffer and
 * returns the slice of buf (one to four bytes) that holds the result.
 * c must satisfy isValidDchar; this is checked by the in contract.
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        assert(0);
    }

    return buf[0 .. used];
}
582
/* UTF-8 to UTF-8: the input is already in the target encoding, so s is
 * handed back as it is (no copy is made).  The in contract runs validate
 * over the input.
 */
char[] toUTF8(char[] s)
in
{
    validate(s);
}
body
{
    return s;
}
592
/* Converts a UTF-16 string to a newly allocated UTF-8 string.
 *
 * Leading ASCII units are copied one for one.  At the first non-ASCII
 * unit the result is cut back to what has been copied so far and the
 * rest of s is converted one code point at a time with encode.
 */
char[] toUTF8(wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            r.length = i;
            // Iterating a wchar[] with a dchar loop variable makes
            // foreach decode the UTF-16 into whole code points.  The
            // variable is named d, as in the dchar[] overload, so that
            // it does not shadow the local c declared above.
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return r;
}
618
/* Converts a UTF-32 string to a newly allocated UTF-8 string.
 *
 * Leading ASCII values are copied one for one.  At the first non-ASCII
 * value the result is cut back to what has been copied so far and the
 * rest of s is appended through encode.
 */
char[] toUTF8(dchar[] s)
{
    size_t total = s.length;
    size_t pos = 0;
    char[] result;

    result.length = total;

    // Fast path: copy the ASCII prefix directly.
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char)s[pos];
        pos++;
    }

    // Slow path for whatever follows the first non-ASCII value.
    if (pos < total)
    {
        result.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(result, d);
        }
    }
    return result;
}
644
645 /* =================== Conversion to UTF16 ======================= */
646
/* Encodes the single code point c as UTF-16 into the caller's buffer and
 * returns the slice of buf (one or two code units) that holds the result.
 * c must satisfy isValidDchar; this is checked by the in contract.
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Split the 20 bits above 0x10000 into a surrogate pair.
        uint offset = c - 0x10000;

        buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
666
/* Converts a UTF-8 string to a newly allocated UTF-16 string.
 * ASCII bytes are appended directly; everything else goes through
 * decode followed by encode.
 */
wchar[] toUTF16(char[] s)
{
    size_t total = s.length;
    size_t pos = 0;
    wchar[] result;

    // Grow, then truncate again -- presumably so that storage is set
    // aside up front for the appends below.
    result.length = total;
    result.length = 0;

    while (pos < total)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            c = decode(s, pos);         // decode advances pos
            encode(result, c);
        }
        else
        {
            pos++;
            result ~= cast(wchar)c;
        }
    }
    return result;
}
690
/* Converts a UTF-8 string to UTF-16, appends a terminating zero unit and
 * returns a pointer to the first unit of the result.
 * ASCII bytes are appended directly; everything else goes through
 * decode followed by encode.
 */
wchar* toUTF16z(char[] s)
{
    size_t total = s.length;
    size_t pos = 0;
    wchar[] result;

    // Grow (with room for the terminator), then truncate again --
    // presumably so that storage is set aside up front for the appends
    // below.
    result.length = total + 1;
    result.length = 0;

    while (pos < total)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            c = decode(s, pos);         // decode advances pos
            encode(result, c);
        }
        else
        {
            pos++;
            result ~= cast(wchar)c;
        }
    }

    result ~= "\000";
    return result.ptr;
}
715
/* UTF-16 to UTF-16: the input is already in the target encoding, so s is
 * handed back as it is (no copy is made).  The in contract runs validate
 * over the input.
 */
wchar[] toUTF16(wchar[] s)
in
{
    validate(s);
}
body
{
    return s;
}
725
/* Converts a UTF-32 string to a newly allocated UTF-16 string by passing
 * every element of s through encode.
 */
wchar[] toUTF16(dchar[] s)
{
    wchar[] result;

    // Grow, then truncate again -- presumably so that storage is set
    // aside up front for the appends below.
    result.length = s.length;
    result.length = 0;

    foreach (dchar c; s)
    {
        encode(result, c);
    }
    return result;
}
739
740 /* =================== Conversion to UTF32 ======================= */
741
/* Converts a UTF-8 string to UTF-32.  The returned array is a slice of a
 * buffer of s.length elements, cut down to the number of characters
 * actually produced.
 */
dchar[] toUTF32(char[] s)
{
    size_t total = s.length;
    size_t inPos = 0;
    size_t outPos = 0;
    dchar[] result;

    result.length = total;              // the result is never longer than s[]

    while (inPos < total)
    {
        dchar c = s[inPos];

        if (c < 0x80)
            inPos++;                    // ascii, no need for decode
        else
            c = decode(s, inPos);       // decode advances inPos

        result[outPos] = c;
        outPos++;
    }
    return result[0 .. outPos];
}
760
/* Converts a UTF-16 string to UTF-32.  The returned array is a slice of
 * a buffer of s.length elements, cut down to the number of characters
 * actually produced.
 */
dchar[] toUTF32(wchar[] s)
{
    size_t total = s.length;
    size_t inPos = 0;
    size_t outPos = 0;
    dchar[] result;

    result.length = total;              // the result is never longer than s[]

    while (inPos < total)
    {
        dchar c = s[inPos];

        if (c < 0x80)
            inPos++;                    // ascii, no need for decode
        else
            c = decode(s, inPos);       // decode advances inPos

        result[outPos] = c;
        outPos++;
    }
    return result[0 .. outPos];
}
779
/* UTF-32 to UTF-32: the input is already in the target encoding, so s is
 * handed back as it is (no copy is made).  The in contract runs validate
 * over the input.
 */
dchar[] toUTF32(dchar[] s)
in
{
    validate(s);
}
body
{
    return s;
}
789
790 /* ================================ tests ================================== */
791
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    char[] u8;
    wchar[] u16;
    dchar[] u32;

    /* Each group below starts from a UTF-8 literal, converts it to the
     * other two encodings, and then converts each of those to the two
     * remaining encodings, checking every result against the literal.
     */

    // ASCII only.
    u8 = "hello";
    u16 = toUTF16(u8);
    assert(u16 == "hello");
    u32 = toUTF32(u8);
    assert(u32 == "hello");

    u8 = toUTF8(u16);
    assert(u8 == "hello");
    u32 = toUTF32(u16);
    assert(u32 == "hello");

    u8 = toUTF8(u32);
    assert(u8 == "hello");
    u16 = toUTF16(u32);
    assert(u16 == "hello");

    // A character that needs three bytes in UTF-8 and one unit in UTF-16.
    u8 = "hel\u1234o";
    u16 = toUTF16(u8);
    assert(u16 == "hel\u1234o");
    u32 = toUTF32(u8);
    assert(u32 == "hel\u1234o");

    u8 = toUTF8(u16);
    assert(u8 == "hel\u1234o");
    u32 = toUTF32(u16);
    assert(u32 == "hel\u1234o");

    u8 = toUTF8(u32);
    assert(u8 == "hel\u1234o");
    u16 = toUTF16(u32);
    assert(u16 == "hel\u1234o");

    // A character above 0xFFFF: four bytes in UTF-8, a surrogate pair
    // in UTF-16.
    u8 = "he\U0010AAAAllo";
    u16 = toUTF16(u8);
    assert(u16 == "he\U0010AAAAllo");
    u32 = toUTF32(u8);
    assert(u32 == "he\U0010AAAAllo");

    u8 = toUTF8(u16);
    assert(u8 == "he\U0010AAAAllo");
    u32 = toUTF32(u16);
    assert(u32 == "he\U0010AAAAllo");

    u8 = toUTF8(u32);
    assert(u8 == "he\U0010AAAAllo");
    u16 = toUTF16(u32);
    assert(u16 == "he\U0010AAAAllo");
}