Mercurial > projects > dil
comparison trunk/src/util/utf.d @ 629:d050e211402b
Moved files in src/std/ to src/util/.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Fri, 11 Jan 2008 20:03:46 +0100 |
parents | trunk/src/std/utf.d@33b566df6af4 |
children |
comparison
equal
deleted
inserted
replaced
628:08681b93c3b3 | 629:d050e211402b |
---|---|
1 // utf.d | |
2 | |
3 /* | |
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com | |
5 * Written by Walter Bright | |
6 * | |
7 * This software is provided 'as-is', without any express or implied | |
8 * warranty. In no event will the authors be held liable for any damages | |
9 * arising from the use of this software. | |
10 * | |
11 * Permission is granted to anyone to use this software for any purpose, | |
12 * including commercial applications, and to alter it and redistribute it | |
13 * freely, subject to the following restrictions: | |
14 * | |
15 * o The origin of this software must not be misrepresented; you must not | |
16 * claim that you wrote the original software. If you use this software | |
17 * in a product, an acknowledgment in the product documentation would be | |
18 * appreciated but is not required. | |
19 * o Altered source versions must be plainly marked as such, and must not | |
20 * be misrepresented as being the original software. | |
21 * o This notice may not be removed or altered from any source | |
22 * distribution. | |
23 */ | |
24 | |
25 /******************************************** | |
26 * Encode and decode UTF-8, UTF-16 and UTF-32 strings. | |
27 * | |
28 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D | |
29 * wchar type. | |
30 * For linux systems, the C wchar_t type is UTF-32 and corresponds to | |
31 * the D utf.dchar type. | |
32 * | |
33 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). | |
34 * | |
35 * See_Also: | |
36 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> | |
37 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> | |
38 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) | |
39 * Macros: | |
40 * WIKI = Phobos/StdUtf | |
41 */ | |
42 | |
43 /* | |
44 Note: this is not the original file! | |
45 Modified by Aziz Köksal: | |
46 Only commented out deprecated class UtfError. | |
47 */ | |
48 | |
49 module util.utf; | |
50 | |
51 // private import std.stdio; | |
52 | |
53 //debug=utf; // uncomment to turn on debugging printf's | |
54 /+ | |
55 deprecated class UtfError : Error | |
56 { | |
57 size_t idx; // index in string of where error occurred | |
58 | |
59 this(char[] s, size_t i) | |
60 { | |
61 idx = i; | |
62 super(s); | |
63 } | |
64 } | |
65 +/ | |
/**********************************
 * Exception type used by this module to report malformed UTF input.
 *
 * Besides the message it records the index in the offending string
 * at which the problem was detected.
 */

class UtfException : Exception
{
    /// index in string of where error occurred
    size_t idx;

    /// Params:
    ///   s = error message, forwarded to Exception
    ///   i = index in the string being processed where the error occurred
    this(char[] s, size_t i)
    {
        super(s);
        idx = i;
    }
}
80 | |
/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * A value is accepted when it is at most 0x10FFFF and does not lie in the
 * surrogate range 0xD800 .. 0xDFFF.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 * (thanks to Arcane Jill)
 *
 * Returns: true if it is, false if not.
 */

bool isValidDchar(dchar c)
{
    // Beyond the last code point supported by this module.
    if (c > 0x10FFFF)
        return false;

    // Surrogate values are reserved for UTF-16 and are not characters.
    // Note that 0xFFFE and 0xFFFF are deliberately NOT excluded here.
    return !(c >= 0xD800 && c <= 0xDFFF);
}
102 | |
// Sanity checks for isValidDchar: one ordinary character and one value
// above the supported range.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    assert(isValidDchar(cast(dchar)'a'));
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
109 | |
110 | |
// Lookup table indexed by the value of a UTF-8 code unit.
// Each entry is the sequence length announced by that byte when it is
// used as a lead byte, or 0xFF when the byte cannot start a sequence.
ubyte[256] UTF8stride =
[
    // 0x00 .. 0x7F: single byte (ASCII)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    // 0x80 .. 0xBF: trailing bytes (10xxxxxx), never a sequence start
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,

    // 0xC0 .. 0xDF: two-byte lead (110xxxxx)
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    // 0xE0 .. 0xEF: three-byte lead (1110xxxx)
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    // 0xF0 .. 0xF7: four-byte lead; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six;
    // 0xFE and 0xFF: not a sequence start
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
130 | |
/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 *
 * The length is read from the UTF8stride table using only the byte s[i];
 * the following bytes are not inspected.
 *
 * Returns:
 *  The number of bytes in the UTF-8 sequence or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */

uint stride(char[] s, size_t i)
{
    char lead = s[i];
    return UTF8stride[lead];
}
143 | |
/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 *
 * Returns: 2 if s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 *  The code unit following a high surrogate is not checked.
 */

uint stride(wchar[] s, size_t i)
{
    wchar unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;   // high surrogate: a pair is expected
    return 1;
}
153 | |
/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 *
 * Neither s nor i is examined; every UTF-32 character occupies exactly
 * one array element.
 *
 * Returns: The return value will always be 1.
 */

uint stride(dchar[] s, size_t i)
{
    return 1;
}
164 | |
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Throws: UtfException if a byte that cannot start a UTF-8 sequence is
 *  met before i, or if stepping by whole sequences overshoots i
 *  (i.e. i is not at a sequence boundary).
 */

size_t toUCSindex(char[] s, size_t i)
{
    size_t count = 0;   // characters seen so far
    size_t pos = 0;     // byte index of the sequence being inspected

    while (pos < i)
    {
        size_t step = UTF8stride[s[pos]];

        if (step == 0xFF)   // s[pos] is not a lead byte
            throw new UtfException("1invalid UTF-8 sequence", pos);
        count++;
        pos += step;
    }

    // Landing past i means i pointed into the middle of a sequence.
    if (pos > i)
        throw new UtfException("1invalid UTF-8 sequence", pos);

    return count;
}
191 | |
/** ditto */

size_t toUCSindex(wchar[] s, size_t i)
{
    // UTF-16 flavour: a high surrogate (0xD800 .. 0xDBFF) accounts for two
    // code units, everything else for one.
    size_t count = 0;   // characters seen so far
    size_t pos = 0;     // index of the code unit being inspected

    while (pos < i)
    {
        uint unit = s[pos];

        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
        count++;
    }

    // Landing past i means i pointed at the second half of a pair.
    if (pos > i)
        throw new UtfException("2invalid UTF-16 sequence", pos);

    return count;
}
212 | |
/** ditto */

size_t toUCSindex(dchar[] s, size_t i)
{
    // UTF-32 flavour: array index and character index coincide, so i is
    // returned unchanged and s is not examined.
    return i;
}
219 | |
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * Throws: UtfException if a byte that cannot start a UTF-8 sequence is
 *  met while skipping the first n characters.
 */

size_t toUTFindex(char[] s, size_t n)
{
    size_t pos = 0;     // byte index reached so far

    for (size_t skipped = 0; skipped < n; skipped++)
    {
        uint step = UTF8stride[s[pos]];

        if (step == 0xFF)   // s[pos] is not a lead byte
            throw new UtfException("3invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
237 | |
/** ditto */

size_t toUTFindex(wchar[] s, size_t n)
{
    // UTF-16 flavour: skip n characters, counting two code units for each
    // high surrogate (0xD800 .. 0xDBFF) and one for anything else.
    size_t pos = 0;

    for (size_t skipped = 0; skipped < n; skipped++)
    {
        wchar unit = s[pos];

        pos += (unit >= 0xD800 && unit <= 0xDBFF) ? 2 : 1;
    }
    return pos;
}
251 | |
/** ditto */

size_t toUTFindex(dchar[] s, size_t n)
{
    // UTF-32 flavour: character index and array index coincide, so n is
    // returned unchanged and s is not examined.
    return n;
}
258 | |
259 /* =================== Decode ======================= */ | |
260 | |
/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character. If the character is not well formed, a UtfException is
 * thrown and idx remains unchanged.
 *
 * Params:
 *  s   = UTF-8 string to read from
 *  idx = index of the first byte of the sequence; updated on success only
 *
 * Throws: UtfException, carrying the original value of idx, when the lead
 *  byte is not a valid start, the sequence is longer than 4 bytes,
 *  runs off the end of s, is overlong, has a malformed trailing byte,
 *  or decodes to a value rejected by isValidDchar.
 */

dchar decode(char[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t start = idx;
    char lead = s[start];

    // Plain ASCII: one byte, nothing to validate.
    if ((lead & 0x80) == 0)
    {
        idx = start + 1;
        return cast(dchar) lead;
    }

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    // Count the leading 1 bits of the lead byte; that is the sequence length.
    uint n = 1;
    while (((lead << n) & 0x80) != 0)
    {
        n++;
        if (n > 4)  // only the first 4 of the 6 encodings are accepted
            throw new UtfException("4invalid UTF-8 sequence", start);
    }
    if (n == 1)     // 10xxxxxx is a trailing byte, not a lead byte
        throw new UtfException("4invalid UTF-8 sequence", start);

    // Pick off the (7 - n) significant bits from the lead byte.
    dchar V = cast(dchar)(lead & ((1 << (7 - n)) - 1));

    if (start + (n - 1) >= s.length)    // off end of string
        throw new UtfException("4invalid UTF-8 sequence", start);

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     * The last two lead bytes have already been rejected by the length
     * check above; their tests are retained unchanged.
     */
    char second = s[start + 1];
    bool overlong =
        (lead & 0xFE) == 0xC0 ||
        (lead == 0xE0 && (second & 0xE0) == 0x80) ||
        (lead == 0xF0 && (second & 0xF0) == 0x80) ||
        (lead == 0xF8 && (second & 0xF8) == 0x80) ||
        (lead == 0xFC && (second & 0xFC) == 0x80);
    if (overlong)
        throw new UtfException("4invalid UTF-8 sequence", start);

    // Fold in 6 payload bits from each trailing byte.
    for (uint k = 1; k != n; k++)
    {
        char trail = s[start + k];

        if ((trail & 0xC0) != 0x80)     // trailing bytes are 10xxxxxx
            throw new UtfException("4invalid UTF-8 sequence", start);
        V = (V << 6) | (trail & 0x3F);
    }

    if (!isValidDchar(V))
        throw new UtfException("4invalid UTF-8 sequence", start);

    idx = start + n;
    return V;
}
353 | |
// Tests for decode(char[]): well-formed 1, 2 and 3 byte sequences, then a
// list of truncated / overlong sequences that must all be rejected.
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // ASCII, decoded one character at a time
    static char[] s1 = "abcd";
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // two-byte sequence
    static char[] s2 = "\xC2\xA9";
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // three-byte sequence
    static char[] s3 = "\xE2\x89\xA0";
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // malformed input
    static char[][] s4 =
    [
        "\xE2\x89",     // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (char[] bad; s4)
    {
        try
        {
            i = 0;
            c = decode(bad, i);
            assert(0);          // decode must not return normally
        }
        catch (UtfException u)
        {
            i = 23;             // marker: the exception path was taken
            delete u;
        }
        assert(i == 23);
    }
}
406 | |
/** ditto */

dchar decode(wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    // UTF-16 flavour. Every error is reported with the original idx.
    size_t i = idx;
    uint u = s[i];

    if (u >= 0xD800 && u <= 0xDBFF)
    {
        // High surrogate: a low surrogate must follow.
        if (i + 1 == s.length)
            throw new UtfException("surrogate UTF-16 high value past end of string", i);

        uint u2 = s[i + 1];
        if (u2 < 0xDC00 || u2 > 0xDFFF)
            throw new UtfException("surrogate UTF-16 low value out of range", i);

        // 0xD7C0 == 0xD800 - (0x10000 >> 10): combines the two halves and
        // adds the 0x10000 offset in one step.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        idx = i + 2;
        return cast(dchar)u;
    }

    // Low surrogate without a preceding high surrogate.
    if (u >= 0xDC00 && u <= 0xDFFF)
        throw new UtfException("unpaired surrogate UTF-16 value", i);

    // Note: these two values are rejected here although isValidDchar
    // accepts them.
    if (u == 0xFFFE || u == 0xFFFF)
        throw new UtfException("illegal UTF-16 value", i);

    // Any other code unit stands for itself.
    idx = i + 1;
    return cast(dchar)u;
}
463 | |
/** ditto */

dchar decode(dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    // UTF-32 flavour: the element is the character; it only has to pass
    // isValidDchar. idx is advanced by one on success.
    size_t at = idx;
    dchar value = s[at];

    if (isValidDchar(value))
    {
        idx = at + 1;
        return value;
    }
    throw new UtfException("5invalid UTF-32 value", at);
}
484 | |
485 | |
486 /* =================== Encode ======================= */ | |
487 | |
/*******************************
 * Encodes character c and appends it to array s[].
 *
 * Params:
 *  s = array that receives the encoded form of c
 *  c = character to encode; must satisfy isValidDchar
 */

void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // ASCII goes out as a single byte.
    if (c <= 0x7F)
    {
        s ~= cast(char) c;
        return;
    }

    char[4] buf;
    uint len;

    if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        len = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        len = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        len = 4;
    }
    else
    {
        assert(0);
    }

    s ~= buf[0 .. len];
}
539 | |
// Tests for encode(char[]): append a 1, 2 and 3 byte character in turn and
// check both the resulting length and the exact bytes.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd";

    // one byte
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    // two bytes
    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    // three bytes
    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}
558 | |
/** ditto */

void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // UTF-16 flavour: values up to 0xFFFF are appended as one code unit,
    // larger values as a high/low surrogate pair.
    if (c <= 0xFFFF)
    {
        s ~= cast(wchar) c;
        return;
    }

    uint offset = c - 0x10000;
    wchar[2] pair;

    pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);  // high half
    pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);          // low half
    s ~= pair;
}
584 | |
/** ditto */

void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // UTF-32 flavour: the character is appended as is.
    s ~= c;
}
596 | |
597 /* =================== Validation ======================= */ | |
598 | |
/***********************************
 * Checks to see if string is well formed or not. Throws a UtfException if it is
 * not. Use to check all untrusted input for correctness.
 *
 * The check is done by decoding s from start to end with decode() and
 * discarding the decoded characters.
 */

void validate(char[] s)
{
    size_t pos = 0;

    // decode() advances pos past each character, or throws.
    while (pos < s.length)
        decode(s, pos);
}
614 | |
/** ditto */

void validate(wchar[] s)
{
    size_t pos = 0;

    // UTF-16 flavour: decode() advances pos past each character, or throws.
    while (pos < s.length)
        decode(s, pos);
}
627 | |
/** ditto */

void validate(dchar[] s)
{
    size_t pos = 0;

    // UTF-32 flavour: decode() advances pos past each character, or throws.
    while (pos < s.length)
        decode(s, pos);
}
640 | |
641 /* =================== Conversion to UTF8 ======================= */ | |
642 | |
/*******************
 * Encodes the single character c as UTF-8 into buf.
 *
 * Params:
 *  buf = scratch buffer that receives the encoded bytes
 *  c   = character to encode; must satisfy isValidDchar
 *
 * Returns: the slice buf[0 .. n], where n (1 to 4) is the number of bytes
 *  written.
 *  NOTE(review): the result is a slice of buf, not a copy - confirm that
 *  callers keep the buffer alive for as long as they use the result.
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t n;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        buf[0] = cast(char) c;
        n = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        n = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        n = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        n = 4;
    }
    else
    {
        assert(0);
    }

    return buf[0 .. n];
}
678 | |
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * For input that already is UTF-8 the string itself is returned (no copy);
 * it is only run through validate() as a precondition.
 */

char[] toUTF8(char[] s)
in
{
    validate(s);
}
body
{
    return s;
}
692 | |
/** ditto */

char[] toUTF8(wchar[] s)
{
    // UTF-16 flavour. ASCII code units are copied straight across; from the
    // first non-ASCII unit onwards the remainder is transcoded with encode().
    char[] r;
    size_t slen = s.length;
    size_t i = 0;

    r.length = slen;

    // Fast path: copy the leading run of ASCII code units.
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        i++;
    }

    if (i < slen)
    {
        // Keep only what was copied, then append the rest character by
        // character. The loop variable is named d (as in the dchar[]
        // overload) so that it does not hide any other local.
        r.length = i;
        foreach (dchar d; s[i .. slen])
        {
            encode(r, d);
        }
    }
    return r;
}
720 | |
/** ditto */

char[] toUTF8(dchar[] s)
{
    // UTF-32 flavour. ASCII characters are copied straight across; from the
    // first non-ASCII character onwards the remainder goes through encode().
    char[] r;
    size_t slen = s.length;
    size_t i = 0;

    r.length = slen;

    // Fast path: copy the leading run of ASCII characters.
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        i++;
    }

    if (i < slen)
    {
        // Keep only what was copied, then append the rest.
        r.length = i;
        foreach (dchar d; s[i .. slen])
        {
            encode(r, d);
        }
    }
    return r;
}
748 | |
749 /* =================== Conversion to UTF16 ======================= */ | |
750 | |
/****************
 * Encodes the single character c as UTF-16 into buf.
 *
 * Params:
 *  buf = scratch buffer that receives the encoded code units
 *  c   = character to encode; must satisfy isValidDchar
 *
 * Returns: buf[0 .. 1] for values up to 0xFFFF, otherwise buf[0 .. 2]
 *  holding a high/low surrogate pair.
 *  NOTE(review): the result is a slice of buf, not a copy - confirm that
 *  callers keep the buffer alive for as long as they use the result.
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        uint offset = c - 0x10000;

        buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);   // high half
        buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);           // low half
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
770 | |
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 *
 * Throws: UtfException (from decode) if s is not well-formed.
 */

wchar[] toUTF16(char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    // Grow, then reset to empty before appending.
    // NOTE(review): presumably done to reserve capacity up front - confirm
    // against the runtime's array growth behaviour.
    r.length = slen;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        char unit = s[i];

        if (unit <= 0x7F)
        {
            // ASCII: no decoding needed
            r ~= cast(wchar) unit;
            i++;
        }
        else
        {
            // decode() advances i past the whole sequence
            dchar c = decode(s, i);
            encode(r, c);
        }
    }
    return r;
}
800 | |
/** ditto */

wchar* toUTF16z(char[] s)
{
    // Same conversion as toUTF16(char[]), followed by a terminating zero
    // code unit; a pointer to the first element is returned.
    wchar[] r;
    size_t slen = s.length;

    // Grow (one extra for the terminator), then reset to empty.
    // NOTE(review): presumably done to reserve capacity up front - confirm
    // against the runtime's array growth behaviour.
    r.length = slen + 1;
    r.length = 0;

    size_t i = 0;
    while (i < slen)
    {
        char unit = s[i];

        if (unit <= 0x7F)
        {
            // ASCII: no decoding needed
            r ~= cast(wchar) unit;
            i++;
        }
        else
        {
            // decode() advances i past the whole sequence
            dchar c = decode(s, i);
            encode(r, c);
        }
    }

    r ~= "\000";
    return r.ptr;
}
827 | |
/** ditto */

wchar[] toUTF16(wchar[] s)
in
{
    // Input that already is UTF-16 only has to be well-formed.
    validate(s);
}
body
{
    // Returned as is; no copy is made.
    return s;
}
839 | |
/** ditto */

wchar[] toUTF16(dchar[] s)
{
    // UTF-32 flavour: every element is passed to encode() in order.
    wchar[] r;

    // Grow, then reset to empty before appending.
    // NOTE(review): presumably done to reserve capacity up front - confirm
    // against the runtime's array growth behaviour.
    r.length = s.length;
    r.length = 0;

    foreach (dchar c; s)
    {
        encode(r, c);
    }
    return r;
}
855 | |
856 /* =================== Conversion to UTF32 ======================= */ | |
857 | |
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * Throws: UtfException (from decode) if s is not well-formed.
 */

dchar[] toUTF32(char[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t used = 0;    // number of characters stored in r

    r.length = slen;    // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // advances i past the whole sequence

        r[used] = c;
        used++;
    }
    return r[0 .. used];
}
880 | |
/** ditto */

dchar[] toUTF32(wchar[] s)
{
    // UTF-16 flavour: code units below 0x80 are copied directly, anything
    // else is handed to decode().
    dchar[] r;
    size_t slen = s.length;
    size_t used = 0;    // number of characters stored in r

    r.length = slen;    // r[] will never be longer than s[]

    size_t i = 0;
    while (i < slen)
    {
        dchar c = s[i];

        if (c < 0x80)
            i++;                // c is ascii, no need for decode
        else
            c = decode(s, i);   // advances i past one or two code units

        r[used] = c;
        used++;
    }
    return r[0 .. used];
}
901 | |
/** ditto */

dchar[] toUTF32(dchar[] s)
in
{
    // Input that already is UTF-32 only has to be well-formed.
    validate(s);
}
body
{
    // Returned as is; no copy is made.
    return s;
}
913 | |
914 /* ================================ tests ================================== */ | |
915 | |
// Round-trip tests for the toUTF8 / toUTF16 / toUTF32 conversions, run on an
// ASCII string, a string with a BMP character and a string with a character
// that needs a surrogate pair in UTF-16.
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    char[] u8;
    wchar[] u16;
    dchar[] u32;

    // --- ASCII only ---
    u8 = "hello";
    u16 = toUTF16(u8);
    assert(u16 == "hello");
    u32 = toUTF32(u8);
    assert(u32 == "hello");

    u8 = toUTF8(u16);
    assert(u8 == "hello");
    u32 = toUTF32(u16);
    assert(u32 == "hello");

    u8 = toUTF8(u32);
    assert(u8 == "hello");
    u16 = toUTF16(u32);
    assert(u16 == "hello");

    // --- character from the basic multilingual plane ---
    u8 = "hel\u1234o";
    u16 = toUTF16(u8);
    assert(u16 == "hel\u1234o");
    u32 = toUTF32(u8);
    assert(u32 == "hel\u1234o");

    u8 = toUTF8(u16);
    assert(u8 == "hel\u1234o");
    u32 = toUTF32(u16);
    assert(u32 == "hel\u1234o");

    u8 = toUTF8(u32);
    assert(u8 == "hel\u1234o");
    u16 = toUTF16(u32);
    assert(u16 == "hel\u1234o");

    // --- character above 0xFFFF ---
    u8 = "he\U0010AAAAllo";
    u16 = toUTF16(u8);
    //foreach (wchar c; u16) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c);
    assert(u16 == "he\U0010AAAAllo");
    u32 = toUTF32(u8);
    assert(u32 == "he\U0010AAAAllo");

    u8 = toUTF8(u16);
    assert(u8 == "he\U0010AAAAllo");
    u32 = toUTF32(u16);
    assert(u32 == "he\U0010AAAAllo");

    u8 = toUTF8(u32);
    assert(u8 == "he\U0010AAAAllo");
    u16 = toUTF16(u32);
    assert(u16 == "he\U0010AAAAllo");
}