comparison runtime/internal/util/utf.d @ 443:44f08170f4ef

Removed tango from the repository and instead added a runtime dir with the files needed to patch and build tango from svn. Reworked the LLVMDC specific pragmas.
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Fri, 01 Aug 2008 00:32:06 +0200
parents
children
comparison
equal deleted inserted replaced
442:76078c8ab5b9 443:44f08170f4ef
1 // utf.d
2
3 /*
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
5 * Written by Walter Bright
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * o The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * o Altered source versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
21 * o This notice may not be removed or altered from any source
22 * distribution.
23 */
24
25 // Description of UTF-8 at:
26 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
27 // http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
28
29
30 module util.utf;
31
32
33 extern (C) void onUnicodeError( char[] msg, size_t idx );
34
35
/* Tells whether c is a code point this module accepts.
 *
 * Values in the surrogate range 0xD800 .. 0xDFFF and values above 0x10FFFF
 * are rejected. 0xFFFE and 0xFFFF are accepted on purpose: the Unicode
 * standard permits them for application internal use, although they are
 * not allowed for interchange (thanks to Arcane Jill).
 */
bool isValidDchar(dchar c)
{
    if (c < 0xD800)
        return true;        // everything below the surrogate block
    if (c <= 0xDFFF)
        return false;       // 0xD800 .. 0xDFFF: surrogate code units
    return c <= 0x10FFFF;   // upper bound of the accepted range
}
47
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ordinary ASCII letter is accepted ...
    assert(isValidDchar(cast(dchar)'a'));
    // ... while a value beyond 0x10FFFF is not.
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
54
55
56 /* This array gives the length of a UTF-8 sequence indexed by the value
57 * of the leading byte. An FF represents an illegal starting value of
58 * a UTF-8 sequence.
59 * FF is used instead of 0 to avoid having loops hang.
60 */
61
ubyte[256] UTF8stride =
[
    // 0x00 .. 0x7F: single byte sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: illegal as a starting byte
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two byte sequences
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three byte sequences
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four, 0xF8 .. 0xFB: five, 0xFC .. 0xFD: six,
    // 0xFE and 0xFF: illegal as a starting byte
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
81
/* Looks up s[i] in UTF8stride and returns the entry: the length of the
 * UTF-8 sequence that starts with that byte, or 0xFF when the byte is not
 * a legal starting byte.
 */
uint stride(char[] s, size_t i)
{
    char lead = s[i];

    return UTF8stride[lead];
}
86
/* Returns the number of UTF-16 code units occupied by the character that
 * starts at s[i]: 2 when s[i] lies in 0xD800 .. 0xDBFF, otherwise 1.
 * The unit that follows is not examined.
 */
uint stride(wchar[] s, size_t i)
{
    uint unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;
    return 1;
}
91
/* UTF-32 overload: always returns 1; neither s nor i is consulted.
 */
uint stride(dchar[] s, size_t i)
{
    uint width = 1;

    return width;
}
96
97 /*******************************************
98 * Given an index into an array of char's,
99 * and assuming that index is at the start of a UTF character,
100 * determine the number of UCS characters up to that index.
101 */
102
/* UTF-8 overload. Walks s from the start in steps taken from UTF8stride
 * and counts the sequences that begin before index i.
 *
 * onUnicodeError is called with the current position when a byte that
 * cannot start a sequence is met, or when the walk steps past i (i.e. i
 * was not at the start of a sequence).
 */
size_t toUCSindex(char[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;
    bool bad = false;

    while (pos < i)
    {
        size_t step = UTF8stride[s[pos]];

        if (step == 0xFF)
        {
            bad = true;     // illegal starting byte: stop without counting it
            break;
        }
        count++;
        pos += step;
    }

    if (bad || pos > i)
        onUnicodeError("invalid UTF-8 sequence", pos);
    return count;
}
123
/* UTF-16 overload. Walks s from the start, advancing two code units for
 * a unit in 0xD800 .. 0xDBFF and one otherwise, and counts the characters
 * that begin before index i.
 *
 * onUnicodeError is called with the current position when the walk steps
 * past i (i.e. i pointed into the middle of a two unit character).
 */
size_t toUCSindex(wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        uint unit = s[pos];

        if (unit >= 0xD800 && unit <= 0xDBFF)
            pos += 2;
        else
            pos += 1;
        count++;
    }

    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);
    return count;
}
142
/* UTF-32 overload: the array index already is the character index, so i
 * is returned unchanged and s is not consulted.
 */
size_t toUCSindex(dchar[] s, size_t i)
{
    size_t ucsIndex = i;

    return ucsIndex;
}
147
148 /******************************************
149 * Given a UCS index into an array of characters, return the UTF index.
150 */
151
/* UTF-8 overload. Skips n sequences from the start of s, using UTF8stride
 * for the length of each, and returns the index reached.
 *
 * onUnicodeError is called with the current index when a byte that cannot
 * start a sequence is met.
 */
size_t toUTFindex(char[] s, size_t n)
{
    size_t pos = 0;

    for (size_t left = n; left != 0; left--)
    {
        uint step = UTF8stride[s[pos]];

        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
165
/* UTF-16 overload. Skips n characters from the start of s, advancing two
 * code units for a unit in 0xD800 .. 0xDBFF and one otherwise, and returns
 * the index reached. No validation is performed.
 */
size_t toUTFindex(wchar[] s, size_t n)
{
    size_t pos = 0;

    for (size_t left = n; left != 0; left--)
    {
        wchar unit = s[pos];

        if (unit >= 0xD800 && unit <= 0xDBFF)
            pos += 2;
        else
            pos += 1;
    }
    return pos;
}
177
/* UTF-32 overload: the character index already is the array index, so n
 * is returned unchanged and s is not consulted.
 */
size_t toUTFindex(dchar[] s, size_t n)
{
    size_t utfIndex = n;

    return utfIndex;
}
182
183 /* =================== Decode ======================= */
184
/* Decodes the UTF-8 sequence that starts at s[idx].
 *
 * On success idx is advanced past the sequence and the decoded character
 * is returned. On any malformed input (bad lead byte, sequence running off
 * the end of s, overlong form, bad trailing byte, or a result rejected by
 * isValidDchar) onUnicodeError is called with the index of the lead byte
 * and idx is left unchanged.
 */
dchar decode(char[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t start = idx;
    size_t total = s.length;
    dchar value;            // deliberately left at its default until assigned
    char lead = s[start];
    char second;
    uint count;

    if ((lead & 0x80) == 0)
    {
        // Plain ASCII: one byte, no further checks.
        value = cast(dchar) lead;
        idx = start + 1;
        return value;
    }

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *
     * count ends up as the number of leading 1 bits of the lead byte,
     * which is the length of the sequence.
     */
    count = 1;
    while (((lead << count) & 0x80) != 0)
    {
        count++;
        if (count > 4)
            goto Lerr;      // only do the first 4 of 6 encodings
    }
    if (count == 1)
        goto Lerr;          // 10xxxxxx is a trailing byte, not a lead byte

    // Pick off the (7 - count) significant bits from the lead byte.
    value = cast(dchar)(lead & ((1 << (7 - count)) - 1));

    if (start + (count - 1) >= total)
        goto Lerr;          // off end of string

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     */
    second = s[start + 1];
    if ((lead & 0xFE) == 0xC0 ||
        (lead == 0xE0 && (second & 0xE0) == 0x80) ||
        (lead == 0xF0 && (second & 0xF0) == 0x80) ||
        (lead == 0xF8 && (second & 0xF8) == 0x80) ||
        (lead == 0xFC && (second & 0xFC) == 0x80))
        goto Lerr;          // overlong combination

    for (uint k = 1; k != count; k++)
    {
        char cont = s[start + k];

        if ((cont & 0xC0) != 0x80)
            goto Lerr;      // trailing bytes are 10xxxxxx
        value = (value << 6) | (cont & 0x3F);
    }
    if (!isValidDchar(value))
        goto Lerr;

    idx = start + count;
    return value;

  Lerr:
    onUnicodeError("invalid UTF-8 sequence", start);
    return value;           // dummy return
}
271
/* Exercises decode(char[]) on 1, 2 and 3 byte sequences and on a set of
 * malformed inputs that must all be rejected.
 */
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static char[] s1 = "abcd";
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static char[] s2 = "\xC2\xA9";
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static char[] s3 = "\xE2\x89\xA0";
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static char[][] s4 =
    [   "\xE2\x89",             // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (int j = 0; j < s4.length; j++)
    {
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(s4[j], i);
        }
        catch (Object o)
        {
            rejected = true;
        }
        // Checked outside the try block: an assert placed inside it would
        // itself be caught by catch (Object) and the test could never fail.
        assert(rejected);
        // decode only writes idx back on success.
        assert(i == 0);
    }
}
323
324 /********************************************************/
325
/* Decodes the UTF-16 character that starts at s[idx].
 *
 * On success idx is advanced by one or two code units and the decoded
 * character is returned. onUnicodeError is called with a descriptive
 * message and the starting index, and idx is left unchanged, when
 *  - a unit in 0xD800 .. 0xDBFF is the last unit of s,
 *  - such a unit is not followed by a unit in 0xDC00 .. 0xDFFF,
 *  - the unit at idx is itself in 0xDC00 .. 0xDFFF,
 *  - the unit is 0xFFFE or 0xFFFF.
 */
dchar decode(wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    char[] why;
    size_t pos = idx;
    uint unit = s[pos];
    uint low;

    if (unit >= 0xD800 && unit <= 0xDBFF)
    {
        if (pos + 1 == s.length)
        {
            why = "surrogate UTF-16 high value past end of string";
            goto Lerr;
        }
        low = s[pos + 1];
        if (low < 0xDC00 || low > 0xDFFF)
        {
            why = "surrogate UTF-16 low value out of range";
            goto Lerr;
        }
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 offset
        // into the subtraction applied to the first unit.
        unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
        pos += 2;
    }
    else if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        why = "unpaired surrogate UTF-16 value";
        goto Lerr;
    }
    else if (unit == 0xFFFE || unit == 0xFFFF)
    {
        why = "illegal UTF-16 value";
        goto Lerr;
    }
    else
    {
        // Single code unit character (this includes plain ASCII).
        pos++;
    }

    idx = pos;
    return cast(dchar) unit;

  Lerr:
    onUnicodeError(why, pos);
    return cast(dchar) unit;    // dummy return
}
381
382 /********************************************************/
383
/* Reads the UTF-32 character s[idx].
 *
 * When isValidDchar accepts it, idx is advanced by one and the character
 * is returned; otherwise onUnicodeError is called with the index and idx
 * is left unchanged.
 */
dchar decode(dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    size_t pos = idx;
    dchar c = s[pos];

    if (isValidDchar(c))
    {
        idx = pos + 1;
        return c;
    }

    onUnicodeError("invalid UTF-32 value", pos);
    return c;       // dummy return
}
403
404
405 /* =================== Encode ======================= */
406
/* Appends the UTF-8 encoding of c (1 to 4 bytes) to s.
 *
 * The in contract requires isValidDchar(c). A value above 0x10FFFF that
 * reaches the body anyway trips assert(0).
 */
void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[] dst = s;

    if (c > 0x7F)
    {
        char[4] bytes;
        uint used;

        if (c <= 0x7FF)
        {
            used = 2;
            bytes[0] = cast(char)(0xC0 | (c >> 6));
            bytes[1] = cast(char)(0x80 | (c & 0x3F));
        }
        else if (c <= 0xFFFF)
        {
            used = 3;
            bytes[0] = cast(char)(0xE0 | (c >> 12));
            bytes[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            bytes[2] = cast(char)(0x80 | (c & 0x3F));
        }
        else if (c <= 0x10FFFF)
        {
            used = 4;
            bytes[0] = cast(char)(0xF0 | (c >> 18));
            bytes[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            bytes[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            bytes[3] = cast(char)(0x80 | (c & 0x3F));
        }
        else
        {
            assert(0);
        }
        dst ~= bytes[0 .. used];
    }
    else
    {
        // ASCII goes in as a single byte.
        dst ~= cast(char) c;
    }

    s = dst;
}
454
/* Appends a 1, a 2 and a 3 byte character to a string with
 * encode(char[]) and checks length and content after each step.
 */
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd";

    // one byte
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // two bytes
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    // three bytes
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}
473
474 /********************************************************/
475
/* Appends the UTF-16 encoding of c to s: a single code unit for values up
 * to 0xFFFF, otherwise a pair of units built from (c - 0x10000).
 *
 * The in contract requires isValidDchar(c).
 */
void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    wchar[] dst = s;

    if (c > 0xFFFF)
    {
        wchar[2] pair;

        // upper 10 bits of the offset go into the first unit, lower 10
        // bits into the second
        pair[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        dst ~= pair;
    }
    else
    {
        dst ~= cast(wchar) c;
    }

    s = dst;
}
499
/* Appends c to s as a single UTF-32 code unit.
 *
 * The in contract requires isValidDchar(c).
 */
void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    dchar[] dst = s;

    dst ~= c;
    s = dst;
}
509
510 /* =================== Validation ======================= */
511
/* Runs decode over the whole of s. Nothing is returned; any error is
 * reported by decode itself.
 */
void validate(char[] s)
{
    size_t pos = 0;

    while (pos < s.length)
        decode(s, pos);     // decode advances pos
}
522
/* Runs decode over the whole of s. Nothing is returned; any error is
 * reported by decode itself.
 */
void validate(wchar[] s)
{
    size_t pos = 0;

    while (pos < s.length)
        decode(s, pos);     // decode advances pos
}
533
/* Runs decode over the whole of s. Nothing is returned; any error is
 * reported by decode itself.
 */
void validate(dchar[] s)
{
    size_t pos = 0;

    while (pos < s.length)
        decode(s, pos);     // decode advances pos
}
544
545 /* =================== Conversion to UTF8 ======================= */
546
/* Writes the UTF-8 encoding of c into buf and returns the slice of buf
 * that was filled (1 to 4 bytes).
 *
 * The in contract requires isValidDchar(c). A value above 0x10FFFF that
 * reaches the body anyway trips assert(0).
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        assert(0);
    }
    return buf[0 .. used];
}
582
/* UTF-8 to UTF-8: returns s itself, without copying. The in contract runs
 * validate(s).
 */
char[] toUTF8(char[] s)
in
{
    validate(s);
}
body
{
    char[] same = s;

    return same;
}
592
/* Converts UTF-16 to a newly built UTF-8 array.
 *
 * Leading code units up to 0x7F are copied one to one. From the first
 * unit above 0x7F onwards, the rest of s is iterated as dchar and each
 * character is appended with encode.
 */
char[] toUTF8(wchar[] s)
{
    char[] r;
    size_t slen = s.length;

    r.length = slen;

    for (size_t i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
        {
            r[i] = cast(char) c;    // fast path for ascii
        }
        else
        {
            // Keep only the ASCII prefix written so far, then append the
            // remainder character by character.
            r.length = i;
            // Named d, as in toUTF8(dchar[]), so that it does not hide the
            // local c declared above.
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return r;
}
618
/* Converts UTF-32 to a newly built UTF-8 array.
 *
 * Leading values up to 0x7F are copied one to one. From the first value
 * above 0x7F onwards, each remaining character is appended with encode.
 */
char[] toUTF8(dchar[] s)
{
    char[] result;
    size_t total = s.length;
    size_t pos = 0;

    result.length = total;

    // fast path: copy the ASCII prefix
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char) s[pos];
        pos++;
    }

    if (pos < total)
    {
        // Keep only the prefix written so far, then append the remainder.
        result.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(result, d);
        }
    }
    return result;
}
644
645 /* =================== Conversion to UTF16 ======================= */
646
/* Writes the UTF-16 encoding of c into buf and returns the slice of buf
 * that was filled: one code unit for values up to 0xFFFF, otherwise two
 * units built from (c - 0x10000).
 *
 * The in contract requires isValidDchar(c).
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // upper 10 bits of the offset go into the first unit, lower 10
        // bits into the second
        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
666
/* Converts UTF-8 to a newly built UTF-16 array.
 *
 * Bytes up to 0x7F are appended directly; anything else goes through
 * decode (which advances the index and reports malformed input) and
 * encode.
 */
wchar[] toUTF16(char[] s)
{
    wchar[] result;
    size_t total = s.length;
    size_t pos = 0;

    // Grow, then reset the length before appending -- presumably to get
    // the storage allocated up front.
    result.length = total;
    result.length = 0;

    while (pos < total)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            c = decode(s, pos);
            encode(result, c);
        }
        else
        {
            pos++;
            result ~= cast(wchar) c;
        }
    }
    return result;
}
690
/* Converts UTF-8 to UTF-16, appends a terminating zero code unit and
 * returns a pointer to the first element of the resulting array.
 *
 * Bytes up to 0x7F are appended directly; anything else goes through
 * decode (which advances the index and reports malformed input) and
 * encode.
 */
wchar* toUTF16z(char[] s)
{
    wchar[] result;
    size_t total = s.length;
    size_t pos = 0;

    // Grow (with room for the terminator), then reset the length before
    // appending -- presumably to get the storage allocated up front.
    result.length = total + 1;
    result.length = 0;

    while (pos < total)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            c = decode(s, pos);
            encode(result, c);
        }
        else
        {
            pos++;
            result ~= cast(wchar) c;
        }
    }

    result ~= "\000";
    return result.ptr;
}
715
/* UTF-16 to UTF-16: returns s itself, without copying. The in contract
 * runs validate(s).
 */
wchar[] toUTF16(wchar[] s)
in
{
    validate(s);
}
body
{
    wchar[] same = s;

    return same;
}
725
/* Converts UTF-32 to a newly built UTF-16 array by appending every
 * element of s with encode.
 */
wchar[] toUTF16(dchar[] s)
{
    wchar[] result;

    // Grow, then reset the length before appending -- presumably to get
    // the storage allocated up front.
    result.length = s.length;
    result.length = 0;

    foreach (dchar c; s)
    {
        encode(result, c);
    }
    return result;
}
739
740 /* =================== Conversion to UTF32 ======================= */
741
/* Converts UTF-8 to UTF-32.
 *
 * The result is a slice of an array allocated with s.length elements;
 * only the decoded characters are included in the slice. Bytes below 0x80
 * are taken as they are, everything else goes through decode.
 */
dchar[] toUTF32(char[] s)
{
    size_t total = s.length;
    size_t filled = 0;
    dchar[] result;

    result.length = total;      // result will never be longer than s
    for (size_t pos = 0; pos < total; )
    {
        dchar c = s[pos];

        if (c < 0x80)
            pos++;              // c is ascii, no need for decode
        else
            c = decode(s, pos);
        result[filled] = c;
        filled++;
    }
    return result[0 .. filled];
}
760
/* Converts UTF-16 to UTF-32.
 *
 * The result is a slice of an array allocated with s.length elements;
 * only the decoded characters are included in the slice. Code units below
 * 0x80 are taken as they are, everything else goes through decode.
 */
dchar[] toUTF32(wchar[] s)
{
    size_t total = s.length;
    size_t filled = 0;
    dchar[] result;

    result.length = total;      // result will never be longer than s
    for (size_t pos = 0; pos < total; )
    {
        dchar c = s[pos];

        if (c < 0x80)
            pos++;              // c is ascii, no need for decode
        else
            c = decode(s, pos);
        result[filled] = c;
        filled++;
    }
    return result[0 .. filled];
}
779
/* UTF-32 to UTF-32: returns s itself, without copying. The in contract
 * runs validate(s).
 */
dchar[] toUTF32(dchar[] s)
in
{
    validate(s);
}
body
{
    dchar[] same = s;

    return same;
}
789
790 /* ================================ tests ================================== */
791
/* Round-trips three strings (pure ASCII, one containing \u1234 and one
 * containing \U0010AAAA) through all toUTF8 / toUTF16 / toUTF32
 * conversions.
 */
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Converts text8 to the other two encodings, then converts those
    // results onwards, checking every intermediate value against the
    // expected string in the matching encoding.
    void roundTrip(char[] text8, wchar[] text16, dchar[] text32)
    {
        char[] c = text8;
        wchar[] w;
        dchar[] d;

        w = toUTF16(c);
        assert(w == text16);
        d = toUTF32(c);
        assert(d == text32);

        c = toUTF8(w);
        assert(c == text8);
        d = toUTF32(w);
        assert(d == text32);

        c = toUTF8(d);
        assert(c == text8);
        w = toUTF16(d);
        assert(w == text16);
    }

    roundTrip("hello", "hello", "hello");
    roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");
    roundTrip("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}