Mercurial > projects > ldc
comparison druntime/src/compiler/dmd/util/utf.d @ 759:d3eb054172f9
Added copy of druntime from DMD 2.020 modified for LDC.
author | Tomas Lindquist Olsen <tomas.l.olsen@gmail.com> |
---|---|
date | Tue, 11 Nov 2008 01:52:37 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
758:f04dde6e882c | 759:d3eb054172f9 |
---|---|
1 // Written in the D programming language | |
2 | |
3 /* | |
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com | |
5 * Written by Walter Bright | |
6 * | |
7 * This software is provided 'as-is', without any express or implied | |
8 * warranty. In no event will the authors be held liable for any damages | |
9 * arising from the use of this software. | |
10 * | |
11 * Permission is granted to anyone to use this software for any purpose, | |
12 * including commercial applications, and to alter it and redistribute it | |
13 * freely, subject to the following restrictions: | |
14 * | |
15 * o The origin of this software must not be misrepresented; you must not | |
16 * claim that you wrote the original software. If you use this software | |
17 * in a product, an acknowledgment in the product documentation would be | |
18 * appreciated but is not required. | |
19 * o Altered source versions must be plainly marked as such, and must not | |
20 * be misrepresented as being the original software. | |
21 * o This notice may not be removed or altered from any source | |
22 * distribution. | |
23 */ | |
24 | |
25 /******************************************** | |
26 * Encode and decode UTF-8, UTF-16 and UTF-32 strings. | |
27 * | |
28 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D | |
29 * wchar type. | |
30 * For linux systems, the C wchar_t type is UTF-32 and corresponds to | |
31 * the D utf.dchar type. | |
32 * | |
33 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). | |
34 * | |
35 * See_Also: | |
36 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> | |
37 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> | |
38 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) | |
39 * Macros: | |
40 * WIKI = Phobos/StdUtf | |
41 */ | |
42 | |
43 module rt.util.utf; | |
44 | |
45 | |
46 extern (C) void onUnicodeError( string msg, size_t idx ); | |
47 | |
/**
 * Checks whether c is a value that may appear in a UTF-32 string.
 *
 * A value is accepted when it does not exceed 0x10FFFF and lies outside
 * the surrogate range 0xD800 .. 0xDFFF.  0xFFFE and 0xFFFF are accepted on
 * purpose: the Unicode standard permits them for application internal use,
 * although they are not allowed for interchange (thanks to Arcane Jill).
 *
 * Params:
 *  c = the value to test
 * Returns: true if c is valid, false if not.
 */
bool isValidDchar(dchar c)
{
    // Past the last code point.
    if (c > 0x10FFFF)
        return false;

    // Inside the surrogate block.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    return true;
}
69 | |
// Exercises isValidDchar on an ordinary character and on both sides of
// every boundary of the accepted ranges.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);

    // Edges of the surrogate block.
    assert(isValidDchar(cast(dchar)0xD7FF) == true);
    assert(isValidDchar(cast(dchar)0xD800) == false);
    assert(isValidDchar(cast(dchar)0xDFFF) == false);
    assert(isValidDchar(cast(dchar)0xE000) == true);

    // Accepted for application internal use.
    assert(isValidDchar(cast(dchar)0xFFFE) == true);
    assert(isValidDchar(cast(dchar)0xFFFF) == true);

    // Upper limit of the code space.
    assert(isValidDchar(cast(dchar)0x10FFFF) == true);
    assert(isValidDchar(cast(dchar)0x110000) == false);
}
76 | |
77 | |
78 | |
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.  Each entry
 * is the length in bytes that the lead byte announces, or 0xFF when the
 * byte cannot start a sequence.
 */
auto UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: single byte
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: continuation bytes, never a start
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two bytes
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three bytes
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four, 0xF8 .. 0xFB: five, 0xFC .. 0xFD: six,
    // 0xFE .. 0xFF: never a start
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
99 | |
/**
 * stride() returns the length of the UTF-8 sequence starting at index i
 * in string s, as announced by the byte s[i].
 * Params:
 *  s = the UTF-8 text
 *  i = index of the byte to inspect
 * Returns:
 *  The number of bytes in the UTF-8 sequence, or
 *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    size_t lead = s[i];     // the lead byte doubles as the table index
    return UTF8stride[lead];
}
111 | |
/**
 * stride() returns the length of the UTF-16 sequence starting at index i
 * in string s.
 * Params:
 *  s = the UTF-16 text
 *  i = index of the code unit to inspect
 * Returns:
 *  2 if s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
uint stride(in wchar[] s, size_t i)
{
    wchar unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;           // high surrogate: first half of a pair
    return 1;
}
120 | |
/**
 * stride() returns the length of the UTF-32 sequence starting at index i
 * in string s.  Neither argument is examined.
 * Params:
 *  s = the UTF-32 text (unused)
 *  i = index into s (unused)
 * Returns: The return value will always be 1.
 */
uint stride(in dchar[] s, size_t i)
{
    // One array element is always one complete character.
    return 1;
}
130 | |
/**
 * Given an index i into an array of UTF-8 bytes s[], and assuming that
 * index i is at the start of a UTF character, determines the number of
 * UCS characters up to that index i.
 *
 * The text is walked from the beginning using stride(); if the walk steps
 * over i instead of landing on it, onUnicodeError is called with the
 * position that was reached.
 *
 * Params:
 *  s = the UTF-8 text
 *  i = byte index into s
 * Returns: the number of characters that precede index i.
 */
size_t toUCSindex(in char[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Overshooting means i was not on a sequence boundary.
    if (pos > i)
        onUnicodeError("invalid UTF-8 sequence", pos);

    return count;
}
153 | |
/**
 * Given an index i into an array of UTF-16 code units s[], and assuming
 * that index i is at the start of a UTF character, determines the number
 * of UCS characters up to that index i.
 *
 * The text is walked from the beginning using stride(); if the walk steps
 * over i instead of landing on it, onUnicodeError is called with the
 * position that was reached.
 *
 * Params:
 *  s = the UTF-16 text
 *  i = code unit index into s
 * Returns: the number of characters that precede index i.
 */
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Overshooting means i pointed into the middle of a surrogate pair.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
171 | |
/**
 * UTF-32 counterpart of toUCSindex: every element of s[] is one
 * character, so the character count up to index i is i itself.
 * Params:
 *  s = the UTF-32 text (unused)
 *  i = index into s
 * Returns: i, unchanged.
 */
size_t toUCSindex(in dchar[] s, size_t i)
{
    // Array index and character index coincide.
    return i;
}
177 | |
/**
 * Given a UCS index n into an array of UTF-8 bytes s[], returns the UTF
 * index, i.e. the byte index at which character number n starts.
 *
 * onUnicodeError is called with the offending position when a byte that
 * cannot start a sequence is found where a character should begin.
 *
 * Params:
 *  s = the UTF-8 text
 *  n = number of characters to skip
 * Returns: the byte index reached after skipping n characters.
 */
size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos = 0;

    for (size_t left = n; left != 0; left--)
    {
        uint step = UTF8stride[s[pos]];

        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
195 | |
/**
 * Given a UCS index n into an array of UTF-16 code units s[], returns the
 * UTF index, i.e. the code unit index at which character number n starts.
 * A high surrogate (0xD800 .. 0xDBFF) is counted as the first unit of a
 * two-unit character; no further validation is performed.
 * Params:
 *  s = the UTF-16 text
 *  n = number of characters to skip
 * Returns: the code unit index reached after skipping n characters.
 */
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t pos = 0;

    for (size_t left = n; left != 0; left--)
    {
        wchar unit = s[pos];

        if (unit >= 0xD800 && unit <= 0xDBFF)
            pos += 2;       // surrogate pair
        else
            pos += 1;
    }
    return pos;
}
208 | |
/**
 * UTF-32 counterpart of toUTFindex: every element of s[] is one
 * character, so character number n starts at array index n.
 * Params:
 *  s = the UTF-32 text (unused)
 *  n = character index
 * Returns: n, unchanged.
 */
size_t toUTFindex(in dchar[] s, size_t n)
{
    // Character index and array index coincide.
    return n;
}
214 | |
215 /* =================== Decode ======================= */ | |
216 | |
/**
 * Decodes and returns the character starting at s[idx].  idx is advanced
 * past the decoded character.
 *
 * If the sequence is not well formed, onUnicodeError is called with the
 * index of the first byte of the sequence and idx remains unchanged.
 * Rejected are: a stray continuation byte, the 5 and 6 byte forms, a
 * sequence that runs off the end of s, overlong encodings, a trailing
 * byte that is not of the form 10xxxxxx, and any result that
 * isValidDchar does not accept.
 *
 * Params:
 *  s   = the UTF-8 text
 *  idx = index of the first byte of the sequence; must be < s.length
 * Returns: the decoded character.
 */
dchar decode(in char[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t total = s.length;
    dchar value;
    size_t pos = idx;
    char lead = s[pos];

    if ((lead & 0x80) == 0)
    {
        // 0xxxxxxx: plain ASCII, one byte.
        value = cast(dchar) lead;
        pos++;
    }
    else
    {
        uint n;
        char second;

        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the lead byte; that count is the
        // announced sequence length.  Counting stops at 5, which is
        // already past the longest form accepted here.
        n = 1;
        while (n <= 4 && ((lead << n) & 0x80) != 0)
            n++;

        // n == 1 is a bare continuation byte, n > 4 a 5 or 6 byte form.
        if (n == 1 || n > 4)
            goto Lerr;

        // Pick off the (7 - n) significant bits of the first byte.
        value = cast(dchar)(lead & ((1 << (7 - n)) - 1));

        if (pos + (n - 1) >= total)
            goto Lerr;                  // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
         *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
         */
        second = s[pos + 1];
        if ((lead & 0xFE) == 0xC0 ||
            (lead == 0xE0 && (second & 0xE0) == 0x80) ||
            (lead == 0xF0 && (second & 0xF0) == 0x80) ||
            (lead == 0xF8 && (second & 0xF8) == 0x80) ||
            (lead == 0xFC && (second & 0xFC) == 0x80))
            goto Lerr;                  // overlong combination

        // Fold in six payload bits from every trailing byte.
        for (uint k = 1; k < n; k++)
        {
            char trail = s[pos + k];

            if ((trail & 0xC0) != 0x80)
                goto Lerr;              // trailing bytes are 10xxxxxx
            value = (value << 6) | (trail & 0x3F);
        }

        if (!isValidDchar(value))
            goto Lerr;
        pos += n;
    }

    idx = pos;
    return value;

  Lerr:
    // pos still holds the index of the first byte of the sequence.
    onUnicodeError("invalid UTF-8 sequence", pos);
    return value;   // dummy return
}
308 | |
// Checks decode() on 1, 2 and 3 byte UTF-8 sequences and verifies that
// truncated, overlong and 5/6 byte sequences are rejected.
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [   "\xE2\x89"c,                // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (size_t j = 0; j < s4.length; j++)
    {
        // The outcome is recorded in a flag and asserted outside the try
        // block: an assert placed inside it would itself be caught by the
        // catch-all handler and the check could never fail.
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(s4[j], i);
        }
        catch (Object o)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);     // idx is left unchanged on failure
    }
}
360 | |
/**
 * Decodes and returns the character starting at s[idx] in UTF-16 text.
 * idx is advanced past the decoded character (by 2 for a surrogate pair,
 * by 1 otherwise).
 *
 * onUnicodeError is called with a descriptive message and the index of
 * the offending code unit, and idx remains unchanged, when
 * a high surrogate is the last code unit of s, a high surrogate is not
 * followed by a low surrogate, a low surrogate appears on its own, or
 * the code unit is 0xFFFE or 0xFFFF.
 *
 * Params:
 *  s   = the UTF-16 text
 *  idx = index of the first code unit; must be < s.length
 * Returns: the decoded character.
 */
dchar decode(in wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    string msg;
    size_t pos = idx;
    uint unit = s[pos];

    if (unit >= 0xD800 && unit <= 0xDBFF)
    {
        uint low;

        // High surrogate: a low surrogate has to follow.
        if (pos + 1 == s.length)
        {
            msg = "surrogate UTF-16 high value past end of string";
            goto Lerr;
        }
        low = s[pos + 1];
        if (low < 0xDC00 || low > 0xDFFF)
        {
            msg = "surrogate UTF-16 low value out of range";
            goto Lerr;
        }
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): removes the surrogate
        // bias and adds the 0x10000 plane offset in a single step.
        unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
        pos += 2;
    }
    else if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        msg = "unpaired surrogate UTF-16 value";
        goto Lerr;
    }
    else if (unit == 0xFFFE || unit == 0xFFFF)
    {
        msg = "illegal UTF-16 value";
        goto Lerr;
    }
    else
    {
        // Any other code unit stands for itself.
        pos++;
    }

    idx = pos;
    return cast(dchar) unit;

  Lerr:
    onUnicodeError(msg, pos);
    return cast(dchar) unit;    // dummy return
}
418 | |
/**
 * Returns the character at s[idx] in UTF-32 text and advances idx by one.
 *
 * If the value is not accepted by isValidDchar, onUnicodeError is called
 * with the index of the offending element and idx remains unchanged.
 *
 * Params:
 *  s   = the UTF-32 text
 *  idx = index of the element to read; must be < s.length
 * Returns: the character at s[idx].
 */
dchar decode(in dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    size_t pos = idx;
    dchar c = s[pos];

    if (isValidDchar(c))
    {
        idx = pos + 1;
        return c;
    }

    onUnicodeError("invalid UTF-32 value", pos);
    return c;   // dummy return
}
440 | |
441 | |
442 /* =================== Encode ======================= */ | |
443 | |
/**
 * Encodes character c as UTF-8 and appends it to array s[].
 *
 * One byte is appended for c <= 0x7F, two for c <= 0x7FF, three for
 * c <= 0xFFFF and four for c <= 0x10FFFF.
 *
 * Params:
 *  s = the array to append to
 *  c = the character to encode; must satisfy isValidDchar
 */
void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[4] buf;
    size_t used;

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        assert(0);
    }

    // Append through a local and write the result back once.
    char[] r = s;
    r ~= buf[0 .. used];
    s = r;
}
494 | |
// Appends a 1, 2 and 3 byte character to a UTF-8 buffer and checks the
// buffer length and contents after every step.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] buf = "abcd".dup;

    // One byte.
    encode(buf, cast(dchar)'a');
    assert(buf.length == 5);
    assert(buf == "abcda");

    // Two bytes.
    encode(buf, cast(dchar)'\u00A9');
    assert(buf.length == 7);
    assert(buf == "abcda\xC2\xA9");
    //assert(buf == "abcda\u00A9");   // BUG: fix compiler

    // Three bytes.
    encode(buf, cast(dchar)'\u2260');
    assert(buf.length == 10);
    assert(buf == "abcda\xC2\xA9\xE2\x89\xA0");
}
513 | |
/**
 * Encodes character c as UTF-16 and appends it to array s[].
 *
 * A single code unit is appended for c <= 0xFFFF; otherwise a high/low
 * surrogate pair is appended.
 *
 * Params:
 *  s = the array to append to
 *  c = the character to encode; must satisfy isValidDchar
 */
void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    wchar[] r = s;

    if (c > 0xFFFF)
    {
        // Split the 20 bit offset above 0x10000 into two 10 bit halves.
        wchar[2] pair;

        pair[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        r ~= pair;
    }
    else
    {
        r ~= cast(wchar) c;
    }
    s = r;
}
539 | |
/**
 * Appends character c to the UTF-32 array s[].  No transformation is
 * needed because one element holds one complete character.
 *
 * Params:
 *  s = the array to append to
 *  c = the character to append; must satisfy isValidDchar
 */
void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    s ~= c;     // stored as is
}
550 | |
/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code unit. The length is returned in code units (elements of type
$(D C)), not in bytes.

Params:
 C = the code unit type; its size (1, 2 or 4 bytes) selects the encoding
 c = the character to measure
Returns: 1 to 4 when $(D C.sizeof == 1), 1 or 2 when $(D C.sizeof == 2),
 and always 1 when $(D C.sizeof == 4).
 */

ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;
        assert(false);      // c is beyond the encodable range
    }
    else static if (C.sizeof == 2)
    {
        if (c <= 0xFFFF)
            return 1;
        return 2;           // needs a surrogate pair
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
579 | |
580 /* =================== Validation ======================= */ | |
581 | |
/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
of $(D char), $(D wchar), or $(D dchar). Every character of $(D s) is
run through $(D decode), which reports a malformed sequence through
$(D onUnicodeError). Use to check all untrusted input for correctness.

Params:
 s = the text to check
 */
void validate(S)(in S s)
{
    auto end = s.length;
    size_t pos = 0;

    while (pos < end)
        decode(s, pos);     // advances pos past one character
}
595 | |
596 /* =================== Conversion to UTF8 ======================= */ | |
597 | |
/**
 * Encodes character c as UTF-8 into buf and returns the slice of buf
 * that holds the encoded bytes (1 to 4 of them).
 *
 * NOTE(review): the returned slice refers to the parameter buf, so it is
 * only meaningful if static array arguments are passed by reference with
 * the targeted compiler - confirm before relying on it elsewhere.
 *
 * Params:
 *  buf = storage for the encoded bytes
 *  c   = the character to encode; must satisfy isValidDchar
 * Returns: buf[0 .. n], where n is the number of bytes written.
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t n;

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        n = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        n = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        n = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        n = 4;
    }
    else
    {
        assert(0);
    }
    return buf[0 .. n];
}
633 | |
/**
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * The input already is UTF-8, so it is returned as is; it is only run
 * through validate() in the in contract.
 *
 * Params:
 *  s = the UTF-8 text
 * Returns: s itself.
 */
string toUTF8(string s)
in
{
    validate(s);
}
body
{
    // Nothing to convert.
    return s;
}
646 | |
/**
 * Encodes the UTF-16 string s into UTF-8 and returns the encoded string.
 *
 * Leading ASCII code units are copied one to one.  From the first
 * non-ASCII code unit onwards the rest of s is iterated as characters
 * and re-encoded with encode().
 *
 * Params:
 *  s = the UTF-16 text
 * Returns: a newly allocated UTF-8 string.
 */
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    // One byte per code unit is exact for pure ASCII input.
    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;     // fast path for ascii
        else
        {
            // Keep the ASCII prefix and append the re-encoded remainder.
            // The loop variable is named d, as in the dchar[] overload,
            // so that it no longer shadows the outer c.
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}
673 | |
/**
 * Encodes the UTF-32 string s into UTF-8 and returns the encoded string.
 *
 * Leading ASCII characters are copied one to one.  From the first
 * non-ASCII character onwards the rest of s is re-encoded with encode().
 *
 * Params:
 *  s = the UTF-32 text
 * Returns: a newly allocated UTF-8 string.
 */
string toUTF8(in dchar[] s)
{
    size_t total = s.length;
    char[] result;
    size_t pos = 0;

    // One byte per character is exact for pure ASCII input.
    result.length = total;

    // Fast path: copy the ASCII prefix byte for byte.
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char) s[pos];
        pos++;
    }

    if (pos < total)
    {
        // Keep the prefix and append the re-encoded remainder.
        result.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(result, d);
        }
    }
    return cast(string) result;
}
700 | |
701 /* =================== Conversion to UTF16 ======================= */ | |
702 | |
/**
 * Encodes character c as UTF-16 into buf and returns the slice of buf
 * that holds the encoded code units (1 or 2 of them).
 *
 * NOTE(review): the returned slice refers to the parameter buf, so it is
 * only meaningful if static array arguments are passed by reference with
 * the targeted compiler - confirm before relying on it elsewhere.
 *
 * Params:
 *  buf = storage for the encoded code units
 *  c   = the character to encode; must satisfy isValidDchar
 * Returns: buf[0 .. 1] for c <= 0xFFFF, otherwise buf[0 .. 2] holding a
 *  surrogate pair.
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Split the 20 bit offset above 0x10000 into two 10 bit halves.
        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
722 | |
/****************
 * Encodes the UTF-8 string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API
 * that take an LPWSTR or LPCWSTR argument.
 *
 * ASCII bytes are widened directly; everything else goes through
 * decode() and encode().
 *
 * Params:
 *  s = the UTF-8 text
 * Returns: a newly allocated UTF-16 string.
 */
wstring toUTF16(in char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Size the buffer for the worst case, then append from the start.
    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        if (s[pos] <= 0x7F)
        {
            // ASCII: one byte becomes one code unit.
            result ~= cast(wchar) s[pos];
            pos++;
        }
        else
        {
            dchar c = decode(s, pos);   // advances pos
            encode(result, c);
        }
    }
    return cast(wstring) result;
}
751 | |
alias const(wchar)* wptr;
/**
 * Encodes the UTF-8 string s into UTF-16, appends a terminating zero
 * code unit and returns a pointer to the first code unit.  The result is
 * suitable for calling the 'W' functions in the Win32 API that take an
 * LPWSTR or LPCWSTR argument.
 *
 * Params:
 *  s = the UTF-8 text
 * Returns: pointer to the zero terminated UTF-16 text.
 */
wptr toUTF16z(in char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Size the buffer for the worst case plus the terminator, then
    // append from the start.
    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        if (s[pos] <= 0x7F)
        {
            // ASCII: one byte becomes one code unit.
            result ~= cast(wchar) s[pos];
            pos++;
        }
        else
        {
            dchar c = decode(s, pos);   // advances pos
            encode(result, c);
        }
    }
    result ~= "\000";       // zero terminator
    return result.ptr;
}
778 | |
/**
 * Encodes string s into UTF-16 and returns the encoded string.
 *
 * The input already is UTF-16, so it is returned as is; it is only run
 * through validate() in the in contract.
 *
 * Params:
 *  s = the UTF-16 text
 * Returns: s itself.
 */
wstring toUTF16(wstring s)
in
{
    validate(s);
}
body
{
    // Nothing to convert.
    return s;
}
789 | |
/**
 * Encodes the UTF-32 string s into UTF-16 and returns the encoded string.
 * Every element of s is passed to encode() in turn.
 *
 * Params:
 *  s = the UTF-32 text
 * Returns: a newly allocated UTF-16 string.
 */
wstring toUTF16(in dchar[] s)
{
    wchar[] result;

    // Size the buffer for the no-surrogates case, then append from the
    // start.
    result.length = s.length;
    result.length = 0;

    foreach (dchar c; s)
    {
        encode(result, c);
    }
    return cast(wstring) result;
}
804 | |
805 /* =================== Conversion to UTF32 ======================= */ | |
806 | |
/*****
 * Encodes the UTF-8 string s into UTF-32 and returns the encoded string.
 * ASCII bytes are widened directly; everything else goes through
 * decode().
 *
 * Params:
 *  s = the UTF-8 text
 * Returns: a UTF-32 string holding one element per decoded character.
 */
dstring toUTF32(in char[] s)
{
    size_t total = s.length;
    dchar[] result;
    size_t filled = 0;

    result.length = total;      // the result is never longer than s[]

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            pos++;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos
        result[filled++] = c;
    }
    return cast(dstring) result[0 .. filled];
}
828 | |
/**
 * Encodes the UTF-16 string s into UTF-32 and returns the encoded string.
 * ASCII code units are widened directly; everything else goes through
 * decode().
 *
 * Params:
 *  s = the UTF-16 text
 * Returns: a UTF-32 string holding one element per decoded character.
 */
dstring toUTF32(in wchar[] s)
{
    size_t total = s.length;
    dchar[] result;
    size_t filled = 0;

    result.length = total;      // the result is never longer than s[]

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            pos++;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos
        result[filled++] = c;
    }
    return cast(dstring) result[0 .. filled];
}
848 | |
/**
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * The input already is UTF-32, so it is returned as is; it is only run
 * through validate() in the in contract.
 *
 * Params:
 *  s = the UTF-32 text
 * Returns: s itself.
 */
dstring toUTF32(dstring s)
in
{
    validate(s);
}
body
{
    // Nothing to convert.
    return s;
}
859 | |
860 /* ================================ tests ================================== */ | |
861 | |
// Converts three sample texts (pure ASCII, a BMP character, a character
// outside the BMP) between UTF-8, UTF-16 and UTF-32 in every direction
// and checks each result.
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Runs the same sequence of conversions for one sample, given in all
    // three encodings.
    void check(string c8, wstring c16, dstring c32)
    {
        auto c = c8;

        // From UTF-8.
        auto w = toUTF16(c);
        assert(w == c16);
        auto d = toUTF32(c);
        assert(d == c32);

        // From UTF-16.
        c = toUTF8(w);
        assert(c == c8);
        d = toUTF32(w);
        assert(d == c32);

        // From UTF-32.
        c = toUTF8(d);
        assert(c == c8);
        w = toUTF16(d);
        assert(w == c16);
    }

    check("hello", "hello", "hello");
    check("hel\u1234o", "hel\u1234o", "hel\u1234o");
    check("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}