Mercurial > projects > ldc
comparison druntime/src/compiler/ldc/util/utf.d @ 1458:e0b2d67cfe7c
Added druntime (this should be removed once it works).
author | Robert Clipsham <robert@octarineparrot.com> |
---|---|
date | Tue, 02 Jun 2009 17:43:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1456:7b218ec1044f | 1458:e0b2d67cfe7c |
---|---|
1 /******************************************** | |
2 * Encode and decode UTF-8, UTF-16 and UTF-32 strings. | |
3 * | |
4 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D | |
5 * wchar type. | |
6 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to | |
7 * the D utf.dchar type. | |
8 * | |
9 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). | |
10 * | |
11 * See_Also: | |
12 * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> | |
13 * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> | |
14 * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) | |
15 * Macros: | |
16 * WIKI = Phobos/StdUtf | |
17 * | |
18 * Copyright: Copyright Digital Mars 2003 - 2009. | |
19 * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>. | |
20 * Authors: Walter Bright, Sean Kelly | |
21 * | |
22 * Copyright Digital Mars 2003 - 2009. | |
23 * Distributed under the Boost Software License, Version 1.0. | |
24 * (See accompanying file LICENSE_1_0.txt or copy at | |
25 * http://www.boost.org/LICENSE_1_0.txt) | |
26 */ | |
27 module rt.util.utf; | |
28 | |
29 | |
30 extern (C) void onUnicodeError( string msg, size_t idx ); | |
31 | |
/*******************************
 * Reports whether c lies in the range of UTF-32 values this module accepts.
 *
 * A value is accepted when it is at most \U0010FFFF and is not one of the
 * surrogate code units \uD800 .. \uDFFF. \uFFFE and \uFFFF are accepted:
 * an application may use them internally even though the Unicode standard
 * does not allow them for interchange (thanks to Arcane Jill).
 *
 * Returns: true if c is accepted, false otherwise.
 */

bool isValidDchar(dchar c)
{
    // Surrogate values are only meaningful as UTF-16 code units.
    if (c >= 0xD800 && c <= 0xDFFF)
        return false;

    // FFFE and FFFF are deliberately not excluded here (see above).
    return c <= 0x10FFFF;
}
53 | |
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ASCII letter is accepted; a value above \U0010FFFF is rejected.
    assert(isValidDchar(cast(dchar)'a'));
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
60 | |
61 | |
62 | |
/* Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is
 * the length in bytes of the sequence that byte introduces, or 0xFF when the
 * byte cannot start a sequence.
 */
immutable UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: single-byte sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: not a valid first byte
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte sequences
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte sequences
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six;
    // 0xFE and 0xFF: not a valid first byte
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
83 | |
/**
 * Looks up the length of the UTF-8 sequence whose first byte is s[i].
 *
 * Params:
 *  s = UTF-8 encoded text
 *  i = index of the byte to inspect
 * Returns:
 *  The UTF8stride table entry for s[i]: the number of bytes in the
 *  sequence, or 0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    char lead = s[i];
    return UTF8stride[lead];
}
95 | |
/**
 * Computes the length, in wchar code units, of the UTF-16 sequence that
 * starts at s[i].
 *
 * Returns: 2 when s[i] is in the range 0xD800 .. 0xDBFF, otherwise 1.
 */
uint stride(in wchar[] s, size_t i)
{
    wchar unit = s[i];

    // A value in 0xD800 .. 0xDBFF is followed by one more code unit.
    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;
    return 1;
}
104 | |
/**
 * Length of the UTF-32 sequence starting at s[i]. Neither argument is
 * examined.
 *
 * Returns: Always 1.
 */
uint stride(in dchar[] s, size_t i)
{
    return 1;
}
114 | |
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * Params:
 *  s = UTF-8 encoded text
 *  i = byte index into s
 * Returns:
 *  The number of UTF-8 sequences stepped over before reaching i.
 *
 * onUnicodeError is called with "invalid UTF-8 sequence" when a byte that
 * cannot start a sequence is met before i, or when stepping over whole
 * sequences lands past i rather than exactly on it.
 */

size_t toUCSindex(in char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        uint len = stride(s, j);

        // stride() yields 0xFF for a byte that cannot start a sequence.
        // Adding that to j would silently skip 255 bytes, so report it
        // here, as toUTFindex() does.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", j);
        j += len;
        n++;
    }
    if (j > i)
    {
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}
137 | |
/**
 * UTF-16 counterpart: counts the sequences in s[] that precede code unit
 * index i, which is assumed to be at the start of a sequence.
 *
 * onUnicodeError is called with "invalid UTF-16 sequence" when stepping
 * over whole sequences lands past i rather than exactly on it.
 */
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Overshooting means i pointed into the middle of a sequence.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
155 | |
/**
 * UTF-32 counterpart: every element is one character, so the character
 * index equals the array index. s is not examined.
 *
 * Returns: i, unchanged.
 */
size_t toUCSindex(in dchar[] s, size_t i)
{
    return i;
}
161 | |
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * Steps over n UTF-8 sequences from the start of s, using the UTF8stride
 * table for each first byte. onUnicodeError is called with
 * "invalid UTF-8 sequence" for a byte that cannot start a sequence.
 *
 * Returns: the byte index reached after n sequences.
 */

size_t toUTFindex(in char[] s, size_t n)
{
    size_t pos = 0;

    for (; n != 0; --n)
    {
        uint len = UTF8stride[s[pos]];

        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += len;
    }
    return pos;
}
179 | |
/**
 * UTF-16 counterpart: steps over n sequences from the start of s and
 * returns the code unit index reached. A unit in 0xD800 .. 0xDBFF counts
 * as the first of two units; anything else counts as one.
 */
size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t pos = 0;

    for (; n != 0; --n)
    {
        wchar unit = s[pos];
        bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;

        pos += startsPair ? 2 : 1;
    }
    return pos;
}
192 | |
/**
 * UTF-32 counterpart: every element is one character, so the array index
 * equals the character index. s is not examined.
 *
 * Returns: n, unchanged.
 */
size_t toUTFindex(in dchar[] s, size_t n)
{
    return n;
}
198 | |
199 /* =================== Decode ======================= */ | |
200 | |
/***************
 * Decodes and returns the character starting at s[idx]. On success idx is
 * advanced past the decoded character.
 *
 * If the sequence is not well formed, onUnicodeError is called with
 * "invalid UTF-8 sequence" and the starting index, and idx is left
 * unchanged. NOTE(review): onUnicodeError is presumably expected to throw;
 * the return after it is only a dummy - confirm against its definition.
 *
 * Rejected input: a continuation byte in first position, first bytes
 * announcing more than 4 bytes, sequences running off the end of s,
 * overlong forms, trailing bytes not of the form 10xxxxxx, and results
 * for which isValidDchar() is false.
 */
dchar decode(in char[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t total = s.length;
    size_t pos = idx;
    char lead = s[pos];
    dchar V;

    if ((lead & 0x80) == 0)
    {
        // Single-byte sequence: the byte is the character.
        V = cast(dchar) lead;
        pos++;
    }
    else
    {
        uint n = 1;
        char second;
        char tail;

        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the first byte; that is the
        // sequence length. Counting stops once it exceeds 4.
        while (n <= 4 && ((lead << n) & 0x80) != 0)
            n++;

        // n == 1 is a bare 10xxxxxx byte; n > 4 is a 5 or 6 byte form.
        if (n == 1 || n > 4)
            goto Lerr;

        // Pick off the (7 - n) payload bits of the first byte.
        V = cast(dchar)(lead & ((1 << (7 - n)) - 1));

        if (pos + (n - 1) >= total)
            goto Lerr;                  // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
         *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
         */
        second = s[pos + 1];
        if ((lead & 0xFE) == 0xC0 ||
            (lead == 0xE0 && (second & 0xE0) == 0x80) ||
            (lead == 0xF0 && (second & 0xF0) == 0x80) ||
            (lead == 0xF8 && (second & 0xF8) == 0x80) ||
            (lead == 0xFC && (second & 0xFC) == 0x80))
            goto Lerr;                  // overlong combination

        // Fold in 6 payload bits from each trailing byte.
        for (uint k = 1; k != n; k++)
        {
            tail = s[pos + k];
            if ((tail & 0xC0) != 0x80)
                goto Lerr;              // trailing bytes are 10xxxxxx
            V = (V << 6) | (tail & 0x3F);
        }
        if (!isValidDchar(V))
            goto Lerr;
        pos += n;
    }

    idx = pos;
    return V;

  Lerr:
    // pos still equals the caller's idx on every path that gets here.
    onUnicodeError("invalid UTF-8 sequence", pos);
    return V;   // dummy return
}
292 | |
/* Exercises decode() on UTF-8 input: 1, 2 and 3 byte sequences that must
 * decode, followed by malformed sequences that must be rejected.
 */
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (size_t j = 0; j < s4.length; j++)
    {
        // Record the rejection in a flag and assert outside the try block.
        // An assert(0) placed inside the try would itself be caught by the
        // handler below, so the check could never fail.
        bool rejected = false;

        i = 0;
        try
        {
            c = decode(s4[j], i);
        }
        catch (Object o)
        {
            rejected = true;
        }
        assert(rejected);
        assert(i == 0);         // idx is only written on success
    }
}
344 | |
/**
 * UTF-16 counterpart: decodes and returns the character starting at s[idx].
 * On success idx is advanced by one code unit, or by two for a surrogate
 * pair.
 *
 * On failure onUnicodeError is called with a message naming the problem
 * and the starting index, and idx is left unchanged. Failures are: a high
 * surrogate at the end of s, a high surrogate not followed by a value in
 * 0xDC00 .. 0xDFFF, a low surrogate in first position, and the values
 * 0xFFFE and 0xFFFF.
 */

dchar decode(in wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    string msg;
    size_t pos = idx;
    uint unit = s[pos];

    if (unit < 0x80)
    {
        // ASCII: one code unit, value unchanged.
        pos++;
    }
    else if (unit >= 0xD800 && unit <= 0xDBFF)
    {
        uint low;

        if (pos + 1 == s.length)
        {
            msg = "surrogate UTF-16 high value past end of string";
            goto Lerr;
        }
        low = s[pos + 1];
        if (low < 0xDC00 || low > 0xDFFF)
        {
            msg = "surrogate UTF-16 low value out of range";
            goto Lerr;
        }
        // 0xD7C0 is 0xD800 - (0x10000 >> 10): subtracting it removes the
        // surrogate base and adds the 0x10000 offset in one step.
        unit = ((unit - 0xD7C0) << 10) + (low - 0xDC00);
        pos += 2;
    }
    else if (unit >= 0xDC00 && unit <= 0xDFFF)
    {
        msg = "unpaired surrogate UTF-16 value";
        goto Lerr;
    }
    else if (unit == 0xFFFE || unit == 0xFFFF)
    {
        msg = "illegal UTF-16 value";
        goto Lerr;
    }
    else
    {
        pos++;
    }

    idx = pos;
    return cast(dchar) unit;

  Lerr:
    onUnicodeError(msg, pos);
    return cast(dchar) unit;    // dummy return
}
402 | |
/**
 * UTF-32 counterpart: returns s[idx] and advances idx by one when
 * isValidDchar() accepts the value. Otherwise onUnicodeError is called
 * with "invalid UTF-32 value" and idx is left unchanged.
 */

dchar decode(in dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    size_t pos = idx;
    dchar c = s[pos];

    if (isValidDchar(c))
    {
        idx = pos + 1;
        return c;
    }

    onUnicodeError("invalid UTF-32 value", pos);
    return c;   // dummy return
}
424 | |
425 | |
426 /* =================== Encode ======================= */ | |
427 | |
/*******************************
 * Encodes character c as UTF-8 and appends the resulting 1 to 4 bytes to
 * array s[].
 *
 * The in contract requires isValidDchar(c). A value above \U0010FFFF that
 * gets past the contract hits assert(0).
 */
void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[] result = s;

    if (c > 0x7F)
    {
        char[4] bytes;
        uint count;

        if (c > 0x10FFFF)
        {
            assert(0);
        }
        else if (c > 0xFFFF)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            bytes[0] = cast(char)(0xF0 | (c >> 18));
            bytes[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            bytes[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            bytes[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else if (c > 0x7FF)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            bytes[0] = cast(char)(0xE0 | (c >> 12));
            bytes[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            bytes[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else
        {
            // 110xxxxx 10xxxxxx
            bytes[0] = cast(char)(0xC0 | (c >> 6));
            bytes[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        result ~= bytes[0 .. count];
    }
    else
    {
        // ASCII: the character is its own encoding.
        result ~= cast(char) c;
    }
    s = result;
}
478 | |
/* Appends a 1, a 2 and a 3 byte character to a char[] with encode() and
 * checks the accumulated bytes after each step.
 */
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd".dup;

    // one byte
    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // two bytes
    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");

    // three bytes
    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
}
497 | |
/**
 * UTF-16 counterpart: appends c to s[] as a single code unit when it is at
 * most 0xFFFF, otherwise as a surrogate pair.
 *
 * The in contract requires isValidDchar(c).
 */

void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    wchar[] result = s;

    if (c > 0xFFFF)
    {
        // Split the offset above 0x10000 into two 10-bit halves.
        wchar[2] pair;

        pair[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        result ~= pair;
    }
    else
    {
        result ~= cast(wchar) c;
    }
    s = result;
}
523 | |
/**
 * UTF-32 counterpart: appends c to s[] as is.
 *
 * The in contract requires isValidDchar(c).
 */
void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    s ~= c;
}
534 | |
/**
Returns the number of code units of type $(D C) needed to encode $(D c).
The length is a count of $(D C) elements, not of bytes.

For 1-byte $(D C) the result is 1 to 4, and a value above \U0010FFFF hits
$(D assert(false)). For 2-byte $(D C) the result is 1 or 2. For 4-byte
$(D C) it is always 1. Any other size of $(D C) fails to compile.
 */

ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        if (c <= 0xFFFF)
            return 1;
        return 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
563 | |
564 /* =================== Validation ======================= */ | |
565 | |
/***********************************
Checks whether a string is well formed by running $(D decode) over it from
start to end. $(D S) can be an array of $(D char), $(D wchar), or
$(D dchar). Malformed input is reported by $(D decode). Use to check all
untrusted input for correctness.
 */
void validate(S)(in S s)
{
    auto limit = s.length;
    size_t pos = 0;

    // decode() advances pos past each character it accepts.
    while (pos < limit)
        decode(s, pos);
}
579 | |
580 /* =================== Conversion to UTF8 ======================= */ | |
581 | |
/**
 * Encodes c as UTF-8 into buf and returns the slice of buf that holds the
 * 1 to 4 bytes written.
 *
 * The in contract requires isValidDchar(c). A value above \U0010FFFF that
 * gets past the contract hits assert(0).
 *
 * NOTE(review): the returned slice refers to buf; this presumably relies
 * on the fixed-size array argument sharing storage with the caller's
 * array - confirm for the compiler version in use.
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;

    if (c <= 0x7F)
    {
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        assert(0);
    }
    return buf[0 .. used];
}
617 | |
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * The input is already UTF-8, so s is returned as is. The in contract
 * runs validate() over it.
 */
string toUTF8(string s)
in
{
    validate(s);
}
body
{
    return s;
}
630 | |
/**
 * Encodes the UTF-16 string s into UTF-8 and returns the encoded string.
 *
 * Leading ASCII code units are copied straight across. From the first
 * non-ASCII unit onward the remainder of s is iterated as dchar and each
 * character is appended with encode().
 */
string toUTF8(in wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // Keep only the ASCII prefix copied so far, then append the
            // rest character by character.
            r.length = i;

            // Named d, not c, so it does not shadow the enclosing local;
            // this matches the dchar[] overload.
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}
657 | |
/**
 * Encodes the UTF-32 string s into UTF-8 and returns the encoded string.
 *
 * Leading ASCII characters are copied straight across. From the first
 * non-ASCII character onward each character is appended with encode().
 */
string toUTF8(in dchar[] s)
{
    size_t total = s.length;
    char[] result;

    result.length = total;

    // Fast path: copy the leading run of ASCII characters.
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char) s[pos];
        pos++;
    }

    // Anything left starts with a non-ASCII character: keep only the
    // prefix copied so far and encode the remainder.
    if (pos < total)
    {
        result.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(result, d);
        }
    }
    return cast(string) result;
}
684 | |
685 /* =================== Conversion to UTF16 ======================= */ | |
686 | |
/**
 * Encodes c as UTF-16 into buf and returns the slice of buf that holds the
 * one or two code units written.
 *
 * The in contract requires isValidDchar(c).
 *
 * NOTE(review): the returned slice refers to buf; this presumably relies
 * on the fixed-size array argument sharing storage with the caller's
 * array - confirm for the compiler version in use.
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Surrogate pair: two 10-bit halves of the offset above 0x10000.
        buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar) c;
    return buf[0 .. 1];
}
706 | |
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 *
 * ASCII bytes are widened directly; anything else goes through decode()
 * and encode().
 */
wstring toUTF16(in char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Grow to the input length, then reset to empty before appending.
    // NOTE(review): presumably meant to pre-reserve storage - confirm that
    // the runtime keeps the allocation when the length is set back to 0.
    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            // decode() advances pos past the whole sequence.
            c = decode(s, pos);
            encode(result, c);
        }
        else
        {
            pos++;
            result ~= cast(wchar) c;
        }
    }
    return cast(wstring) result;
}
735 | |
alias const(wchar)* wptr;
/**
 * Encodes the UTF-8 string s into UTF-16, appends a terminating zero code
 * unit, and returns a pointer to the first code unit.
 *
 * ASCII bytes are widened directly; anything else goes through decode()
 * and encode().
 */
wptr toUTF16z(in char[] s)
{
    size_t total = s.length;
    wchar[] result;

    // Grow to the input length plus the terminator, then reset to empty
    // before appending.
    // NOTE(review): presumably meant to pre-reserve storage - confirm that
    // the runtime keeps the allocation when the length is set back to 0.
    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            // decode() advances pos past the whole sequence.
            c = decode(s, pos);
            encode(result, c);
        }
        else
        {
            pos++;
            result ~= cast(wchar) c;
        }
    }
    result ~= "\000";
    return result.ptr;
}
762 | |
/**
 * UTF-16 input is already in the target encoding, so s is returned as is.
 * The in contract runs validate() over it.
 */
wstring toUTF16(wstring s)
in
{
    validate(s);
}
body
{
    return s;
}
773 | |
/**
 * Encodes the UTF-32 string s into UTF-16 by passing each element to
 * encode(), and returns the encoded string.
 */
wstring toUTF16(in dchar[] s)
{
    wchar[] result;

    // Grow to the input length, then reset to empty before appending.
    // NOTE(review): presumably meant to pre-reserve storage - confirm that
    // the runtime keeps the allocation when the length is set back to 0.
    result.length = s.length;
    result.length = 0;

    foreach (dchar c; s)
    {
        encode(result, c);
    }
    return cast(wstring) result;
}
788 | |
789 /* =================== Conversion to UTF32 ======================= */ | |
790 | |
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * ASCII bytes are copied directly; anything else goes through decode().
 */
dstring toUTF32(in char[] s)
{
    size_t total = s.length;
    dchar[] result;

    // One slot per input byte: r[] will never be longer than s[].
    result.length = total;

    size_t count = 0;
    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            pos++;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos past the sequence
        result[count] = c;
        count++;
    }

    // Trim to the number of characters actually produced.
    return cast(dstring) result[0 .. count];
}
812 | |
/**
 * Encodes the UTF-16 string s into UTF-32 and returns the encoded string.
 *
 * ASCII code units are copied directly; anything else goes through
 * decode().
 */
dstring toUTF32(in wchar[] s)
{
    size_t total = s.length;
    dchar[] result;

    // One slot per input code unit: r[] will never be longer than s[].
    result.length = total;

    size_t count = 0;
    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];

        if (c < 0x80)
            pos++;                  // c is ascii, no need for decode
        else
            c = decode(s, pos);     // advances pos past the sequence
        result[count] = c;
        count++;
    }

    // Trim to the number of characters actually produced.
    return cast(dstring) result[0 .. count];
}
832 | |
/**
 * UTF-32 input is already in the target encoding, so s is returned as is.
 * The in contract runs validate() over it.
 */
dstring toUTF32(dstring s)
in
{
    validate(s);
}
body
{
    return s;
}
843 | |
844 /* ================================ tests ================================== */ | |
845 | |
/* Converts three sample strings between UTF-8, UTF-16 and UTF-32 in every
 * direction and checks each result. The samples are pure ASCII, one with a
 * character below 0x10000, and one with a character above 0xFFFF.
 */
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Runs the six conversions for one sample, given the same text in all
    // three encodings.
    void roundTrip(string s8, wstring s16, dstring s32)
    {
        auto c = s8;

        // from UTF-8
        auto w = toUTF16(c);
        assert(w == s16);
        auto d = toUTF32(c);
        assert(d == s32);

        // from UTF-16
        c = toUTF8(w);
        assert(c == s8);
        d = toUTF32(w);
        assert(d == s32);

        // from UTF-32
        c = toUTF8(d);
        assert(c == s8);
        w = toUTF16(d);
        assert(w == s16);
    }

    roundTrip("hello", "hello", "hello");
    roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");
    roundTrip("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}