132
|
1 // utf.d
|
|
2
|
|
3 /*
|
|
4 * Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
|
|
5 * Written by Walter Bright
|
|
6 *
|
|
7 * This software is provided 'as-is', without any express or implied
|
|
8 * warranty. In no event will the authors be held liable for any damages
|
|
9 * arising from the use of this software.
|
|
10 *
|
|
11 * Permission is granted to anyone to use this software for any purpose,
|
|
12 * including commercial applications, and to alter it and redistribute it
|
|
13 * freely, subject to the following restrictions:
|
|
14 *
|
|
15 * o The origin of this software must not be misrepresented; you must not
|
|
16 * claim that you wrote the original software. If you use this software
|
|
17 * in a product, an acknowledgment in the product documentation would be
|
|
18 * appreciated but is not required.
|
|
19 * o Altered source versions must be plainly marked as such, and must not
|
|
20 * be misrepresented as being the original software.
|
|
21 * o This notice may not be removed or altered from any source
|
|
22 * distribution.
|
|
23 */
|
|
24
|
|
25 // Description of UTF-8 at:
|
|
26 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
|
|
27 // http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
|
|
28
|
|
29
|
|
30 module util.utf;
|
|
31
|
|
32
|
|
33 extern (C) void onUnicodeError( char[] msg, size_t idx );
|
|
34
|
|
35
|
|
/**
 * Reports whether c is an acceptable UTF-32 value: at most 0x10FFFF and
 * outside the surrogate range 0xD800 .. 0xDFFF.
 *
 * Note: FFFE and FFFF are specifically permitted by the Unicode standard
 * for application internal use, but are not allowed for interchange, so
 * they are accepted here. (thanks to Arcane Jill)
 */
bool isValidDchar(dchar c)
{
    if (c < 0xD800)
        return true;            // below the surrogate block
    if (c <= 0xDFFF)
        return false;           // inside the surrogate block
    return c <= 0x10FFFF;       // above it: must not pass the last code point
}
|
|
47
|
|
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // an ASCII letter is a valid code point
    assert(isValidDchar(cast(dchar)'a'));
    // a value above 0x10FFFF is rejected
    assert(!isValidDchar(cast(dchar)0x1FFFFF));
}
|
|
54
|
|
55
|
|
56 /* This array gives the length of a UTF-8 sequence indexed by the value
|
|
57 * of the leading byte. An FF represents an illegal starting value of
|
|
58 * a UTF-8 sequence.
|
|
59 * FF is used instead of 0 to avoid having loops hang.
|
|
60 */
|
|
61
|
|
ubyte[256] UTF8stride =
[
    // 0x00 - 0x7F: single-byte (ASCII) sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x00
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x10
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x20
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x30
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x40
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x50
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x60
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0x70

    // 0x80 - 0xBF: continuation bytes, illegal as a starting value
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0x80
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0x90
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0xA0
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    // 0xB0

    // 0xC0 - 0xDF: lead byte of a 2-byte sequence
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    // 0xC0
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    // 0xD0

    // 0xE0 - 0xEF: lead byte of a 3-byte sequence
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,    // 0xE0

    // 0xF0 - 0xF7: 4 bytes; 0xF8 - 0xFB: 5; 0xFC - 0xFD: 6; 0xFE, 0xFF: illegal
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,  // 0xF0
];
|
|
81
|
|
/**
 * Returns the UTF8stride entry for the byte s[i]: the length of the UTF-8
 * sequence that byte starts, or 0xFF when it cannot start a sequence.
 */
uint stride(char[] s, size_t i)
{
    char lead = s[i];
    return UTF8stride[lead];
}
|
|
86
|
|
/**
 * Returns the number of UTF-16 code units in the character starting at
 * s[i]: 2 when s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
uint stride(wchar[] s, size_t i)
{
    uint unit = s[i];

    if (unit >= 0xD800 && unit <= 0xDBFF)
        return 2;       // high surrogate: a second unit follows
    return 1;
}
|
|
91
|
|
/**
 * Returns the number of UTF-32 code units in the character at s[i],
 * which is always 1. Neither argument is inspected.
 */
uint stride(dchar[] s, size_t i)
{
    // every UTF-32 code unit is a complete character
    return 1;
}
|
|
96
|
|
97 /*******************************************
|
|
98 * Given an index into an array of char's,
|
|
99 * and assuming that index is at the start of a UTF character,
|
|
100 * determine the number of UCS characters up to that index.
|
|
101 */
|
|
102
|
|
/**
 * Counts the UTF-8 sequences in s that start before byte index i.
 *
 * The scan advances by the UTF8stride entry of each lead byte.
 * onUnicodeError is called when a byte with stride 0xFF is met, or when
 * the scan steps past i (i.e. i was not on a sequence boundary).
 */
size_t toUCSindex(char[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        size_t step = UTF8stride[s[pos]];

        if (step == 0xFF)
        {
            // not a legal lead byte
            onUnicodeError("invalid UTF-8 sequence", pos);
            return count;
        }
        ++count;
        pos += step;
    }

    if (pos > i)
        onUnicodeError("invalid UTF-8 sequence", pos);   // i fell inside a sequence
    return count;
}
|
|
123
|
|
/**
 * Counts the UTF-16 characters in s that start before code-unit index i.
 *
 * A high surrogate (0xD800 .. 0xDBFF) advances the scan by two units,
 * anything else by one. onUnicodeError is called when the scan steps past
 * i, i.e. i pointed at the second unit of a surrogate pair.
 */
size_t toUCSindex(wchar[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; n++)
    {
        uint u = s[j];

        j += 1 + (u >= 0xD800 && u <= 0xDBFF);
    }
    if (j > i)
        onUnicodeError("invalid UTF-16 sequence", j);
    return n;
}
|
|
142
|
|
/**
 * For UTF-32 the code-unit index and the character index coincide, so i
 * is returned unchanged; s is not inspected.
 */
size_t toUCSindex(dchar[] s, size_t i)
{
    size_t ucsIndex = i;
    return ucsIndex;
}
|
|
147
|
|
148 /******************************************
|
|
149 * Given a UCS index into an array of characters, return the UTF index.
|
|
150 */
|
|
151
|
|
/**
 * Converts the character index n into a byte index into the UTF-8 array s
 * by stepping over n sequences using UTF8stride.
 *
 * onUnicodeError is called for any lead byte whose stride is 0xFF.
 */
size_t toUTFindex(char[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        uint step = UTF8stride[s[pos]];

        if (step == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", pos);
        pos += step;
    }
    return pos;
}
|
|
165
|
|
/**
 * Converts the character index n into a code-unit index into the UTF-16
 * array s, stepping two units over each high surrogate and one otherwise.
 */
size_t toUTFindex(wchar[] s, size_t n)
{
    size_t pos = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        wchar unit = s[pos];
        bool highSurrogate = unit >= 0xD800 && unit <= 0xDBFF;

        pos += highSurrogate ? 2 : 1;
    }
    return pos;
}
|
|
177
|
|
/**
 * For UTF-32 the character index and the code-unit index coincide, so n
 * is returned unchanged; s is not inspected.
 */
size_t toUTFindex(dchar[] s, size_t n)
{
    size_t utfIndex = n;
    return utfIndex;
}
|
|
182
|
|
183 /* =================== Decode ======================= */
|
|
184
|
|
/**
 * Decodes the UTF-8 sequence that starts at s[idx].
 *
 * On success idx is advanced past the sequence and the decoded value is
 * returned. A stray continuation byte, a 5- or 6-byte lead, a sequence
 * running off the end of s, an overlong form, a bad trailing byte, or a
 * result rejected by isValidDchar is reported through onUnicodeError with
 * idx left unchanged.
 */
dchar decode(char[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    size_t total = s.length;
    size_t start = idx;
    dchar V;
    char lead = s[start];
    char second;
    uint n;                     // number of bytes in the sequence

    if ((lead & 0x80) == 0)
    {
        // 0xxxxxxx: a single ASCII byte
        idx = start + 1;
        return cast(dchar) lead;
    }

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */
    if ((lead & 0xE0) == 0xC0)
        n = 2;
    else if ((lead & 0xF0) == 0xE0)
        n = 3;
    else if ((lead & 0xF8) == 0xF0)
        n = 4;
    else
        goto Lerr;              // 10xxxxxx lead, or a 5/6-byte encoding

    // Pick off the (7 - n) significant bits carried by the first byte
    V = cast(dchar)(lead & (0x7F >> n));

    if (start + (n - 1) >= total)
        goto Lerr;              // off end of string

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     */
    second = s[start + 1];
    if ((lead & 0xFE) == 0xC0 ||
        (lead == 0xE0 && (second & 0xE0) == 0x80) ||
        (lead == 0xF0 && (second & 0xF0) == 0x80) ||
        (lead == 0xF8 && (second & 0xF8) == 0x80) ||
        (lead == 0xFC && (second & 0xFC) == 0x80))
        goto Lerr;              // overlong combination

    // Fold in six payload bits from each trailing byte
    foreach (char trail; s[start + 1 .. start + n])
    {
        if ((trail & 0xC0) != 0x80)
            goto Lerr;          // trailing bytes are 10xxxxxx
        V = (V << 6) | (trail & 0x3F);
    }
    if (!isValidDchar(V))
        goto Lerr;

    idx = start + n;
    return V;

  Lerr:
    onUnicodeError("invalid UTF-8 sequence", start);
    return V;   // dummy return
}
|
|
271
|
|
unittest
{
    size_t pos;
    dchar ch;

    debug(utf) printf("utf.decode.unittest\n");

    // ASCII: one byte per character, index advances by one each call
    static char[] ascii = "abcd";
    pos = 0;
    ch = decode(ascii, pos);
    assert(ch == cast(dchar)'a');
    assert(pos == 1);
    ch = decode(ascii, pos);
    assert(ch == cast(dchar)'b');
    assert(pos == 2);

    // two-byte sequence
    static char[] twoByte = "\xC2\xA9";
    pos = 0;
    ch = decode(twoByte, pos);
    assert(ch == cast(dchar)'\u00A9');
    assert(pos == 2);

    // three-byte sequence
    static char[] threeByte = "\xE2\x89\xA0";
    pos = 0;
    ch = decode(threeByte, pos);
    assert(ch == cast(dchar)'\u2260');
    assert(pos == 3);

    // inputs that decode must reject
    static char[][] malformed =
    [   "\xE2\x89",             // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (char[] bad; malformed)
    {
        try
        {
            pos = 0;
            ch = decode(bad, pos);
            assert(0);
        }
        catch (Object o)
        {
            pos = 23;           // marker: the catch block ran
        }
        assert(pos == 23);
    }
}
|
|
323
|
|
324 /********************************************************/
|
|
325
|
|
/**
 * Decodes the UTF-16 character that starts at s[idx].
 *
 * On success idx is advanced by one unit, or by two for a surrogate pair,
 * and the decoded value is returned. A high surrogate at the end of s, a
 * high surrogate not followed by a low surrogate, a lone low surrogate,
 * and the values 0xFFFE / 0xFFFF are reported through onUnicodeError with
 * a specific message and idx left unchanged.
 */
dchar decode(wchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
out (result)
{
    assert(isValidDchar(result));
}
body
{
    char[] msg;
    size_t i = idx;
    uint u = s[i];

    if (u >= 0xD800 && u <= 0xDBFF)
    {
        // high surrogate: a low surrogate must follow
        uint u2;

        if (i + 1 == s.length)
        {
            msg = "surrogate UTF-16 high value past end of string";
            goto Lerr;
        }
        u2 = s[i + 1];
        if (u2 < 0xDC00 || u2 > 0xDFFF)
        {
            msg = "surrogate UTF-16 low value out of range";
            goto Lerr;
        }
        // (u - 0xD7C0) << 10 equals ((u - 0xD800) << 10) + 0x10000
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        i += 2;
    }
    else if (u >= 0xDC00 && u <= 0xDFFF)
    {
        msg = "unpaired surrogate UTF-16 value";
        goto Lerr;
    }
    else if (u == 0xFFFE || u == 0xFFFF)
    {
        msg = "illegal UTF-16 value";
        goto Lerr;
    }
    else
    {
        // any other unit, ASCII included, is a complete character
        i++;
    }

    idx = i;
    return cast(dchar)u;

  Lerr:
    onUnicodeError(msg, i);
    return cast(dchar)u;    // dummy return
}
|
|
381
|
|
382 /********************************************************/
|
|
383
|
|
/**
 * Reads the UTF-32 value at s[idx].
 *
 * When isValidDchar accepts it, idx is advanced by one and the value is
 * returned; otherwise onUnicodeError is called with idx left unchanged.
 */
dchar decode(dchar[] s, inout size_t idx)
in
{
    assert(idx >= 0 && idx < s.length);
}
body
{
    size_t at = idx;
    dchar c = s[at];

    if (isValidDchar(c))
    {
        idx = at + 1;
        return c;
    }

    onUnicodeError("invalid UTF-32 value", at);
    return c;   // dummy return
}
|
|
403
|
|
404
|
|
405 /* =================== Encode ======================= */
|
|
406
|
|
/**
 * Appends the UTF-8 encoding of c (one to four bytes) to s.
 *
 * The in contract requires isValidDchar(c).
 */
void encode(inout char[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    char[4] bytes;
    uint count;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        bytes[0] = cast(char) c;
        count = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        bytes[0] = cast(char)(0xC0 | (c >> 6));
        bytes[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        bytes[0] = cast(char)(0xE0 | (c >> 12));
        bytes[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        bytes[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes[0] = cast(char)(0xF0 | (c >> 18));
        bytes[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        bytes[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        bytes[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }
    else
    {
        assert(0);
    }

    s ~= bytes[0 .. count];
}
|
|
454
|
|
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd";

    // ASCII adds one byte
    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // U+00A9 adds two bytes
    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    //assert(text == "abcda\u00A9");   // BUG: fix compiler

    // U+2260 adds three bytes
    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
}
|
|
473
|
|
474 /********************************************************/
|
|
475
|
|
/**
 * Appends the UTF-16 encoding of c to s: a single unit when c <= 0xFFFF,
 * otherwise a high/low surrogate pair.
 *
 * The in contract requires isValidDchar(c).
 */
void encode(inout wchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        s ~= cast(wchar) c;
        return;
    }

    // 20-bit offset above 0x10000, split into two 10-bit halves
    uint offset = cast(uint) c - 0x10000;
    wchar[2] pair;

    pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
    pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
    s ~= pair;
}
|
|
499
|
|
/**
 * Appends c to the UTF-32 array s as a single element.
 *
 * The in contract requires isValidDchar(c).
 */
void encode(inout dchar[] s, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    // one code point is exactly one UTF-32 code unit
    s ~= c;
}
|
|
509
|
|
510 /* =================== Validation ======================= */
|
|
511
|
|
/**
 * Walks s from start to end with decode, which reports any malformed
 * UTF-8 sequence through onUnicodeError.
 */
void validate(char[] s)
{
    size_t pos = 0;

    // decode advances pos past each sequence it accepts
    while (pos < s.length)
        decode(s, pos);
}
|
|
522
|
|
/**
 * Walks s from start to end with decode, which reports any malformed
 * UTF-16 sequence through onUnicodeError.
 */
void validate(wchar[] s)
{
    size_t pos = 0;

    // decode advances pos past each character it accepts
    while (pos < s.length)
        decode(s, pos);
}
|
|
533
|
|
/**
 * Walks s from start to end with decode, which reports any invalid
 * UTF-32 value through onUnicodeError.
 */
void validate(dchar[] s)
{
    size_t pos = 0;

    // decode advances pos by one for each value it accepts
    while (pos < s.length)
        decode(s, pos);
}
|
|
544
|
|
545 /* =================== Conversion to UTF8 ======================= */
|
|
546
|
|
/**
 * Encodes c as UTF-8 into buf and returns the slice of buf holding the
 * one to four bytes written.
 *
 * The in contract requires isValidDchar(c).
 */
char[] toUTF8(char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        buf[0] = cast(char) c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        assert(0);
    }

    return buf[0 .. used];
}
|
|
582
|
|
/**
 * Identity conversion: returns s itself, without copying.
 *
 * The in contract runs validate(s) over the input.
 */
char[] toUTF8(char[] s)
in
{
    validate(s);
}
body
{
    // already UTF-8; hand back the same array
    return s;
}
|
|
592
|
|
/**
 * Converts the UTF-16 array s to a newly built UTF-8 array.
 *
 * Leading ASCII units are copied straight across. From the first
 * non-ASCII unit onward the remainder of s is iterated as dchar values
 * and each one is appended with encode.
 */
char[] toUTF8(wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {
        wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;     // fast path for ascii
        else
        {
            // keep the ASCII prefix already copied, then append the rest
            r.length = i;
            // named d so it does not shadow the outer c
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return r;
}
|
|
618
|
|
/**
 * Converts the UTF-32 array s to a newly built UTF-8 array.
 *
 * Leading ASCII values are copied straight across; from the first
 * non-ASCII value onward each element is appended with encode.
 */
char[] toUTF8(dchar[] s)
{
    size_t slen = s.length;
    char[] r;
    size_t i = 0;

    r.length = slen;

    // fast path: copy the leading run of ASCII values
    while (i < slen && s[i] <= 0x7F)
    {
        r[i] = cast(char) s[i];
        ++i;
    }

    if (i < slen)
    {
        // keep the ASCII prefix already copied, then append the rest
        r.length = i;
        foreach (dchar d; s[i .. slen])
            encode(r, d);
    }
    return r;
}
|
|
644
|
|
645 /* =================== Conversion to UTF16 ======================= */
|
|
646
|
|
/**
 * Encodes c as UTF-16 into buf and returns the slice of buf holding the
 * one or two units written.
 *
 * The in contract requires isValidDchar(c).
 */
wchar[] toUTF16(wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c <= 0xFFFF)
    {
        buf[0] = cast(wchar) c;
        return buf[0 .. 1];
    }

    // 20-bit offset above 0x10000, split into two 10-bit halves
    uint offset = cast(uint) c - 0x10000;

    buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
    buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
    return buf[0 .. 2];
}
|
|
666
|
|
/**
 * Converts the UTF-8 array s to a newly built UTF-16 array.
 *
 * ASCII bytes are appended directly; any other byte starts a sequence
 * that is read with decode and appended with encode.
 */
wchar[] toUTF16(char[] s)
{
    size_t slen = s.length;
    wchar[] r;
    size_t pos = 0;

    // set the length to slen, then back to zero, before appending
    r.length = slen;
    r.length = 0;

    while (pos < slen)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            c = decode(s, pos);     // advances pos past the sequence
            encode(r, c);
        }
        else
        {
            pos++;
            r ~= cast(wchar)c;
        }
    }
    return r;
}
|
|
690
|
|
/**
 * Converts the UTF-8 array s to UTF-16, appends a terminating zero unit,
 * and returns a pointer to the first unit of the result.
 */
wchar* toUTF16z(char[] s)
{
    size_t slen = s.length;
    wchar[] r;
    size_t pos = 0;

    // set the length to slen + 1 (room for the terminator), then back to zero
    r.length = slen + 1;
    r.length = 0;

    while (pos < slen)
    {
        dchar c = s[pos];

        if (c > 0x7F)
        {
            c = decode(s, pos);     // advances pos past the sequence
            encode(r, c);
        }
        else
        {
            pos++;
            r ~= cast(wchar)c;
        }
    }

    r ~= "\000";
    return r.ptr;
}
|
|
715
|
|
/**
 * Identity conversion: returns s itself, without copying.
 *
 * The in contract runs validate(s) over the input.
 */
wchar[] toUTF16(wchar[] s)
in
{
    validate(s);
}
body
{
    // already UTF-16; hand back the same array
    return s;
}
|
|
725
|
|
/**
 * Converts the UTF-32 array s to a newly built UTF-16 array by appending
 * each element with encode.
 */
wchar[] toUTF16(dchar[] s)
{
    wchar[] r;

    // set the length to s.length, then back to zero, before appending
    r.length = s.length;
    r.length = 0;

    foreach (dchar c; s)
        encode(r, c);
    return r;
}
|
|
739
|
|
740 /* =================== Conversion to UTF32 ======================= */
|
|
741
|
|
/**
 * Converts the UTF-8 array s to UTF-32.
 *
 * The result buffer is sized to s.length and the filled prefix is
 * returned. ASCII bytes are copied directly; any other byte starts a
 * sequence that is read with decode.
 */
dchar[] toUTF32(char[] s)
{
    size_t slen = s.length;
    dchar[] r;
    size_t src = 0;
    size_t dst = 0;

    r.length = slen;    // r[] will never be longer than s[]

    while (src < slen)
    {
        dchar c = s[src];

        if (c < 0x80)
            src++;                  // c is ascii, no need for decode
        else
            c = decode(s, src);     // advances src past the sequence
        r[dst++] = c;
    }
    return r[0 .. dst];
}
|
|
760
|
|
/**
 * Converts the UTF-16 array s to UTF-32.
 *
 * The result buffer is sized to s.length and the filled prefix is
 * returned. ASCII units are copied directly; any other unit is read
 * with decode.
 */
dchar[] toUTF32(wchar[] s)
{
    size_t slen = s.length;
    dchar[] r;
    size_t src = 0;
    size_t dst = 0;

    r.length = slen;    // r[] will never be longer than s[]

    while (src < slen)
    {
        dchar c = s[src];

        if (c < 0x80)
            src++;                  // c is ascii, no need for decode
        else
            c = decode(s, src);     // advances src by one or two units
        r[dst++] = c;
    }
    return r[0 .. dst];
}
|
|
779
|
|
/**
 * Identity conversion: returns s itself, without copying.
 *
 * The in contract runs validate(s) over the input.
 */
dchar[] toUTF32(dchar[] s)
in
{
    validate(s);
}
body
{
    // already UTF-32; hand back the same array
    return s;
}
|
|
789
|
|
790 /* ================================ tests ================================== */
|
|
791
|
|
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Runs the six conversions between the three encodings of one text
    // and checks each result against the expected encoding.
    void roundTrip(char[] text8, wchar[] text16, dchar[] text32)
    {
        char[] c;
        wchar[] w;
        dchar[] d;

        // from UTF-8
        w = toUTF16(text8);
        assert(w == text16);
        d = toUTF32(text8);
        assert(d == text32);

        // from UTF-16
        c = toUTF8(w);
        assert(c == text8);
        d = toUTF32(w);
        assert(d == text32);

        // from UTF-32
        c = toUTF8(d);
        assert(c == text8);
        w = toUTF16(d);
        assert(w == text16);
    }

    // pure ASCII
    roundTrip("hello", "hello", "hello");

    // a character needing three UTF-8 bytes
    roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");

    // a character needing four UTF-8 bytes and a UTF-16 surrogate pair
    roundTrip("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}
|