/*******************************************************************************

        copyright:      Copyright (c) 2004 Kris Bell. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: Oct 2004

        authors:        Kris

        Fast Unicode transcoders. These are particularly sensitive to
        minor changes on 32bit x86 devices, because the register set of
        those devices is so small. Beware of subtle changes which might
        extend the execution-period by as much as 200%. Because of this,
        three of the six transcoders might read past the end of input by
        one, two, or three bytes before arresting themselves. Note that
        support for streaming adds a 15% overhead to the dchar => char
        conversion, but has little effect on the others.

        These routines were tuned on an Intel P4; other devices may work
        more efficiently with a slightly different approach, though this
        is likely to be reasonably optimal on AMD x86 CPUs also. These
        algorithms would benefit significantly from those extra AMD64
        registers. On a 3GHz P4, the dchar/char conversions take around
        2500ns to process an array of 1000 ASCII elements. Invoking the
        memory manager doubles that period, and quadruples the time for
        arrays of 100 elements. Memory allocation can slow down notably
        in a multi-threaded environment, so avoid that where possible.

        Surrogate-pairs are dealt with in a non-optimal fashion when
        transcoding between utf16 and utf8. Such cases are considered
        to be boundary-conditions for this module.

        There are three common cases where the input may be incomplete,
        including each 'widening' case of utf8 => utf16, utf8 => utf32,
        and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
        pairs are present. Such cases will throw an exception, unless
        streaming-mode is enabled ~ in the latter mode, an additional
        integer is returned indicating how many elements of the input
        have been consumed. In all cases, a correct slice of the output
        is returned.

        For details on Unicode processing see:
        $(UL $(LINK http://www.utf-8.com/))
        $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
        $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
        $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))

*******************************************************************************/
|
|
50
|
|
module tango.text.convert.Utf;

/*******************************************************************************

        Error hook invoked by the transcoders in this module when they
        meet input they cannot convert. Only the declaration is visible
        here (C linkage, no body); the implementation is supplied from
        elsewhere.

        Params:
        msg = description of the failure
        idx = position within the input associated with the failure,
              as computed at each call site

        NOTE(review): presumably this raises an exception and does not
        return ~ confirm against the implementation.

*******************************************************************************/

public extern (C) void onUnicodeError (char[] msg, size_t idx = 0);
|
|
54
|
|
55
|
|
/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six byte
        variations are not supported).

        Each non-surrogate wchar is written as one, two or three bytes.
        As soon as a surrogate is met, the entire input is transcoded
        again via toString32() and the dchar[] variant of toString().

        Params:
        input  = the utf16 content to convert
        output = optional destination. Without 'ate' it is enlarged as
                 required; with 'ate' it is used exactly as provided
        ate    = optional, enables streaming mode: receives the number
                 of input elements that were consumed

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.
        For example:

        ---
        char[] output;

        char[] result = toString (input, output);

        // reset output after a realloc
        if (result.length > output.length)
            output = result;
        ---

        In streaming mode the conversion stops as soon as fewer than
        three bytes of output remain. Note also that the surrogate path
        hands a null output to toString32() along with 'ate'; since
        toString32() does not allocate in streaming mode, input holding
        a surrogate is then reported as entirely unconsumed.

*******************************************************************************/

char[] toString (wchar[] input, char[] output=null, uint* ate=null)
{
        if (ate)
           {
           *ate = input.length;

           // streaming never resizes the output. Without room for one
           // three-byte sequence nothing can be converted, and pMax
           // below would be computed in front of the buffer (wrapping
           // around for a null output, which defeats the overflow test)
           if (output.length < 3)
              {
              *ate = 0;
              return output [0..0];
              }
           }
        else
           {
           // potentially reallocate output
           int estimate = input.length * 2 + 3;
           if (output.length < estimate)
               output.length = estimate;
           }

        // pMax is the last position where a three-byte store still fits
        char* pOut = output.ptr;
        char* pMax = pOut + output.length - 3;

        foreach (int eaten, wchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer. The extra three bytes
                   // guarantee room for the unchecked stores below, no
                   // matter how little has been produced so far
                   int len = pOut - output.ptr;
                   output.length = len + len / 2 + 3;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 3;
                   }

                if (b < 0x80)
                    *pOut++ = b;
                else
                   if (b < 0x0800)
                      {
                      pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
                      pOut[1] = 0x80 | (b & 0x3f);
                      pOut += 2;
                      }
                   else
                      if (b < 0xd800 || b > 0xdfff)
                         {
                         pOut[0] = 0xe0 | ((b >> 12) & 0x3f);
                         pOut[1] = 0x80 | ((b >> 6) & 0x3f);
                         pOut[2] = 0x80 | (b & 0x3f);
                         pOut += 3;
                         }
                      else
                         // deal with surrogate-pairs: restart from the
                         // beginning of the input, going through utf32
                         return toString (toString32(input, null, ate), output);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
|
|
141
|
|
/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        One- two- and three-byte sequences are decoded directly into a
        single wchar each. A lead byte of 0xf0 or above causes the whole
        input to be transcoded again via toString32() and the dchar[]
        variant of toString16(). The continuation bytes are not checked
        for well-formedness, and a multi-byte sequence at the very end
        of the input is read in full before its length is verified.

        Params:
        input  = the utf8 content to convert
        output = optional destination. Without 'ate' it is enlarged to
                 input.length elements where smaller; with 'ate' it is
                 used exactly as provided
        ate    = optional, enables streaming mode: receives the number
                 of input bytes consumed. A truncated trailing sequence
                 is left unconsumed rather than reported as an error

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Note that the four-byte path hands a null output to toString32()
        along with 'ate'; since toString32() does not allocate in
        streaming mode, such input is then reported as unconsumed.

*******************************************************************************/

wchar[] toString16 (char[] input, wchar[] output=null, uint* ate=null)
{
        int   produced;
        char* pIn = input.ptr;
        char* pMax = pIn + input.length;
        char* pValid;

        // every wchar produced consumes at least one byte of input, so
        // input.length elements are always sufficient
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
            foreach (inout wchar d; output)
                    {
                    // remember where this sequence began, for a rewind
                    pValid = pIn;
                    wchar b = cast(wchar) *pIn;

                    if (b & 0x80)
                       {
                       if (b < 0xe0)
                          {
                          // two-byte sequence
                          b &= 0x1f;
                          b = (b << 6) | (*++pIn & 0x3f);
                          }
                       else
                          if (b < 0xf0)
                             {
                             // three-byte sequence
                             b &= 0x0f;
                             b = (b << 6) | (pIn[1] & 0x3f);
                             b = (b << 6) | (pIn[2] & 0x3f);
                             pIn += 2;
                             }
                          else
                             // deal with surrogate-pairs
                             return toString16 (toString32(input, null, ate), output);
                       }

                    d = b;
                    ++produced;

                    // did we read past the end of the input?
                    if (++pIn >= pMax)
                       {
                       if (pIn > pMax)
                          {
                          // yep ~ return tail or throw error?
                          if (ate)
                             {
                             pIn = pValid;
                             --produced;
                             break;
                             }
                          onUnicodeError ("Unicode.toString16 : incomplete utf8 input", pIn - input.ptr);
                          }
                       else
                          // input consumed exactly
                          break;
                       }
                    }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString16 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
|
|
222
|
|
223
|
|
/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six
        byte variations are not supported). onUnicodeError() is
        invoked where the input dchar is greater than 0x10ffff.

        Values within the surrogate range (0xd800 to 0xdfff) are not
        rejected; they are written as three-byte sequences.

        Params:
        input  = the utf32 content to convert
        output = optional destination. Without 'ate' it is enlarged as
                 required; with 'ate' it is used exactly as provided
        ate    = optional, enables streaming mode: receives the number
                 of input elements that were consumed. Conversion stops
                 once fewer than four bytes of output remain

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

char[] toString (dchar[] input, char[] output=null, uint* ate=null)
{
        if (ate)
           {
           *ate = input.length;

           // streaming never resizes the output. Without room for one
           // four-byte sequence nothing can be converted, and pMax
           // below would be computed in front of the buffer (wrapping
           // around for a null output, which defeats the overflow test)
           if (output.length < 4)
              {
              *ate = 0;
              return output [0..0];
              }
           }
        else
           {
           // potentially reallocate output
           int estimate = input.length * 2 + 4;
           if (output.length < estimate)
               output.length = estimate;
           }

        // pMax is the last position where a four-byte store still fits
        char* pOut = output.ptr;
        char* pMax = pOut + output.length - 4;

        foreach (int eaten, dchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer. Growing by half alone
                   // is not enough: seven bytes produced would "grow" a
                   // ten byte buffer to ten, and the four unchecked
                   // stores below would then run one past its end
                   int len = pOut - output.ptr;
                   output.length = len + len / 2 + 4;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 4;
                   }

                if (b < 0x80)
                    *pOut++ = b;
                else
                   if (b < 0x0800)
                      {
                      pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
                      pOut[1] = 0x80 | (b & 0x3f);
                      pOut += 2;
                      }
                   else
                      if (b < 0x10000)
                         {
                         pOut[0] = 0xe0 | ((b >> 12) & 0x3f);
                         pOut[1] = 0x80 | ((b >> 6) & 0x3f);
                         pOut[2] = 0x80 | (b & 0x3f);
                         pOut += 3;
                         }
                      else
                         if (b < 0x110000)
                            {
                            pOut[0] = 0xf0 | ((b >> 18) & 0x3f);
                            pOut[1] = 0x80 | ((b >> 12) & 0x3f);
                            pOut[2] = 0x80 | ((b >> 6) & 0x3f);
                            pOut[3] = 0x80 | (b & 0x3f);
                            pOut += 4;
                            }
                         else
                            onUnicodeError ("Unicode.toString : invalid dchar", eaten);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
|
|
307
|
|
308
|
|
/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        Sequences of one to four bytes are decoded into one dchar each.
        Any lead byte of 0xf0 or above is treated as opening a four-byte
        sequence, and onUnicodeError() is invoked where the result is
        0x110000 or greater. The continuation bytes are not checked for
        well-formedness, and a multi-byte sequence at the very end of
        the input is read in full before its length is verified.

        Params:
        input  = the utf8 content to convert
        output = optional destination. Without 'ate' it is enlarged to
                 input.length elements where smaller; with 'ate' it is
                 used exactly as provided (a null or empty output thus
                 converts nothing)
        ate    = optional, enables streaming mode: receives the number
                 of input bytes consumed. A truncated trailing sequence
                 is left unconsumed rather than reported as an error

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

dchar[] toString32 (char[] input, dchar[] output=null, uint* ate=null)
{
        int   produced;
        char* pIn = input.ptr;
        char* pMax = pIn + input.length;
        char* pValid;

        // every dchar produced consumes at least one byte of input, so
        // input.length elements are always sufficient
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
            foreach (inout dchar d; output)
                    {
                    // remember where this sequence began, for a rewind
                    pValid = pIn;
                    dchar b = cast(dchar) *pIn;

                    if (b & 0x80)
                       {
                       if (b < 0xe0)
                          {
                          // two-byte sequence
                          b &= 0x1f;
                          b = (b << 6) | (*++pIn & 0x3f);
                          }
                       else
                          if (b < 0xf0)
                             {
                             // three-byte sequence
                             b &= 0x0f;
                             b = (b << 6) | (pIn[1] & 0x3f);
                             b = (b << 6) | (pIn[2] & 0x3f);
                             pIn += 2;
                             }
                          else
                             {
                             // four-byte sequence
                             b &= 0x07;
                             b = (b << 6) | (pIn[1] & 0x3f);
                             b = (b << 6) | (pIn[2] & 0x3f);
                             b = (b << 6) | (pIn[3] & 0x3f);

                             if (b >= 0x110000)
                                 onUnicodeError ("Unicode.toString32 : invalid utf8 input", pIn - input.ptr);
                             pIn += 3;
                             }
                       }

                    d = b;
                    ++produced;

                    // did we read past the end of the input?
                    if (++pIn >= pMax)
                       {
                       if (pIn > pMax)
                          {
                          // yep ~ return tail or throw error?
                          if (ate)
                             {
                             pIn = pValid;
                             --produced;
                             break;
                             }
                          onUnicodeError ("Unicode.toString32 : incomplete utf8 input", pIn - input.ptr);
                          }
                       else
                          // input consumed exactly
                          break;
                       }
                    }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
|
|
397
|
|
/*******************************************************************************

        Encode Utf16 up to a maximum of two wchars (one surrogate pair)
        per input dchar. onUnicodeError() is invoked where the input
        dchar is greater than 0x10ffff.

        Values below 0x10000 are copied as a single wchar, including
        any within the surrogate range; those are not rejected.

        Params:
        input  = the utf32 content to convert
        output = optional destination. Without 'ate' it is enlarged as
                 required; with 'ate' it is used exactly as provided
        ate    = optional, enables streaming mode: receives the number
                 of input elements that were consumed. Conversion stops
                 once fewer than two elements of output remain

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

wchar[] toString16 (dchar[] input, wchar[] output=null, uint* ate=null)
{
        if (ate)
           {
           *ate = input.length;

           // streaming never resizes the output. Without room for one
           // surrogate pair nothing can be converted, and pMax below
           // would be computed in front of the buffer (wrapping around
           // for a null output, which defeats the overflow test)
           if (output.length < 2)
              {
              *ate = 0;
              return output [0..0];
              }
           }
        else
           {
           // two wchars per dchar is the worst case
           int estimate = input.length * 2 + 2;
           if (output.length < estimate)
               output.length = estimate;
           }

        // pMax is the last position where a two-element store still fits
        wchar* pOut = output.ptr;
        wchar* pMax = pOut + output.length - 2;

        foreach (int eaten, dchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer, always leaving room
                   // for the two unchecked stores below
                   int len = pOut - output.ptr;
                   output.length = len + len / 2 + 2;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 2;
                   }

                if (b < 0x10000)
                    *pOut++ = b;
                else
                   if (b < 0x110000)
                      {
                      // split into a lead and a trail surrogate
                      pOut[0] = 0xd800 | (((b - 0x10000) >> 10) & 0x3ff);
                      pOut[1] = 0xdc00 | ((b - 0x10000) & 0x3ff);
                      pOut += 2;
                      }
                   else
                      onUnicodeError ("Unicode.toString16 : invalid dchar", eaten);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
|
|
462
|
|
/*******************************************************************************

        Decode Utf16 produced by the above toString16() method.

        Non-surrogate elements are copied as is; a lead surrogate and
        the trail surrogate following it are combined into one dchar.
        onUnicodeError() is invoked with "invalid utf16 input" for a
        trail surrogate without a preceding lead, and for a lead which
        is followed by anything other than a trail surrogate. The input
        is never read beyond its final element.

        Params:
        input  = the utf16 content to convert
        output = optional destination. Without 'ate' it is enlarged to
                 input.length elements where smaller; with 'ate' it is
                 used exactly as provided (a null or empty output thus
                 converts nothing)
        ate    = optional, enables streaming mode: receives the number
                 of input elements consumed. A lead surrogate at the
                 very end of the input is left unconsumed rather than
                 reported as an error

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

dchar[] toString32 (wchar[] input, dchar[] output=null, uint* ate=null)
{
        int    produced;
        wchar* pIn = input.ptr;
        wchar* pMax = pIn + input.length;
        wchar* pValid;

        // every dchar produced consumes at least one input element, so
        // input.length elements are always sufficient
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
            foreach (inout dchar d; output)
                    {
                    // remember where this sequence began, for a rewind
                    pValid = pIn;
                    dchar b = cast(dchar) *pIn;

                    if (b >= 0xd800 && b <= 0xdfff)
                       {
                       // a trail surrogate can never open a pair
                       if (b > 0xdbff)
                           onUnicodeError ("Unicode.toString32 : invalid utf16 input", pIn - input.ptr);

                       // combine only when the second element exists.
                       // Where it does not, the skip below moves pIn
                       // past the end, and the end-of-input test then
                       // rewinds (streaming) or reports the truncation
                       if (pIn + 1 < pMax)
                          {
                          dchar trail = cast(dchar) pIn[1];
                          if (trail < 0xdc00 || trail > 0xdfff)
                              onUnicodeError ("Unicode.toString32 : invalid utf16 input", pIn + 1 - input.ptr);

                          // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35
                          b = ((b - 0xd7c0) << 10) + (trail - 0xdc00);
                          }

                       // skip the second element of the pair
                       ++pIn;
                       }

                    d = b;
                    ++produced;

                    // did we step past the end of the input?
                    if (++pIn >= pMax)
                       {
                       if (pIn > pMax)
                          {
                          // yep ~ return tail or throw error?
                          if (ate)
                             {
                             pIn = pValid;
                             --produced;
                             break;
                             }
                          onUnicodeError ("Unicode.toString32 : incomplete utf16 input", pIn - input.ptr);
                          }
                       else
                          // input consumed exactly
                          break;
                       }
                    }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf16 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
|
|
531
|
|
532
|