comparison tango/tango/text/convert/Utf.d @ 132:1700239cab2e trunk

[svn r136] MAJOR UNSTABLE UPDATE!!! Initial commit after moving to Tango instead of Phobos. Lots of bugfixes... This build is not suitable for most things.
author lindquist
date Fri, 11 Jan 2008 17:57:40 +0100
parents
children
comparison
equal deleted inserted replaced
131:5825d48b27d1 132:1700239cab2e
1 /*******************************************************************************
2
3 copyright: Copyright (c) 2004 Kris Bell. All rights reserved
4
5 license: BSD style: $(LICENSE)
6
7 version: Initial release: Oct 2004
8
9 authors: Kris
10
11 Fast Unicode transcoders. These are particularly sensitive to
12 minor changes on 32bit x86 devices, because the register set of
13 those devices is so small. Beware of subtle changes which might
14 extend the execution-period by as much as 200%. Because of this,
15 three of the six transcoders might read past the end of input by
16 one, two, or three bytes before arresting themselves. Note that
17 support for streaming adds a 15% overhead to the dchar => char
18 conversion, but has little effect on the others.
19
20 These routines were tuned on an Intel P4; other devices may work
21 more efficiently with a slightly different approach, though this
22 is likely to be reasonably optimal on AMD x86 CPUs also. These
23 algorithms would benefit significantly from those extra AMD64
24 registers. On a 3GHz P4, the dchar/char conversions take around
25 2500ns to process an array of 1000 ASCII elements. Invoking the
26 memory manager doubles that period, and quadruples the time for
27 arrays of 100 elements. Memory allocation can slow down notably
28 in a multi-threaded environment, so avoid that where possible.
29
30 Surrogate-pairs are dealt with in a non-optimal fashion when
31 transcoding between utf16 and utf8. Such cases are considered
32 to be boundary-conditions for this module.
33
34 There are three common cases where the input may be incomplete,
35 including each 'widening' case of utf8 => utf16, utf8 => utf32,
36 and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
37 pairs are present. Such cases will throw an exception, unless
38 streaming-mode is enabled ~ in the latter mode, an additional
39 integer is returned indicating how many elements of the input
40 have been consumed. In all cases, a correct slice of the output
41 is returned.
42
43 For details on Unicode processing see:
44 $(UL $(LINK http://www.utf-8.com/))
45 $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
46 $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
47 $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
48
49 *******************************************************************************/
50
51 module tango.text.convert.Utf;
52
53 public extern (C) void onUnicodeError (char[] msg, size_t idx = 0);
54
55
/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six byte
        variations are not supported).

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.
        For example:

        ---
        char[] output;

        char[] result = toString (input, output);

        // reset output after a realloc
        if (result.length > output.length)
            output = result;
        ---

        Params:
        input  = the utf16 content to convert
        output = optional destination buffer. When 'ate' is null it is
                 grown as required; when 'ate' is provided it is never
                 resized
        ate    = when non-null, selects streaming mode and receives the
                 number of input elements consumed

        Surrogate pairs are combined and emitted as a single four-byte
        sequence. A trailing-surrogate without a leader, or a leader not
        followed by a trailing-surrogate, is reported via onUnicodeError().
        A leader sitting at the very end of the input is reported the same
        way, unless streaming is enabled; in that case it is left unconsumed
        for the next call.

*******************************************************************************/

char[] toString (wchar[] input, char[] output=null, uint* ate=null)
{
        if (ate)
            *ate = cast(uint) input.length;
        else
        {
                // potentially reallocate output
                size_t estimate = input.length * 2 + 3;
                if (output.length < estimate)
                    output.length = estimate;
        }

        // number of bytes written so far; always <= output.length
        size_t used = 0;

        for (size_t i = 0; i < input.length; ++i)
        {
                uint b = input[i];
                uint need;

                // how many bytes does this code point occupy?
                if (b < 0x80)
                    need = 1;
                else if (b < 0x0800)
                    need = 2;
                else if (b < 0xd800 || b > 0xdfff)
                    need = 3;
                else
                {
                        // The breaks following onUnicodeError() are only a
                        // safeguard; the handler is presumably expected to
                        // throw (its definition is not visible here).

                        // a trailing-surrogate may not start a pair
                        if (b > 0xdbff)
                        {
                                onUnicodeError ("Unicode.toString : invalid utf16 input", i);
                                break;
                        }

                        // is the second half of the pair available?
                        if (i + 1 >= input.length)
                        {
                                if (ate)
                                    *ate = cast(uint) i;
                                else
                                    onUnicodeError ("Unicode.toString : incomplete utf16 input", i);
                                break;
                        }

                        uint lo = input[i + 1];
                        if (lo < 0xdc00 || lo > 0xdfff)
                        {
                                onUnicodeError ("Unicode.toString : invalid utf16 input", i);
                                break;
                        }

                        b = 0x10000 + ((b - 0xd800) << 10) + (lo - 0xdc00);
                        need = 4;
                }

                // about to overflow the output?
                if (output.length - used < need)
                {
                        // if streaming, just return the unused input
                        if (ate)
                        {
                                *ate = cast(uint) i;
                                break;
                        }

                        // grow by half, and always by enough for this sequence
                        output.length = used + used / 2 + need;
                }

                char* p = output.ptr + used;

                if (need == 1)
                    p[0] = cast(char) b;
                else if (need == 2)
                {
                        p[0] = cast(char) (0xc0 | (b >> 6));
                        p[1] = cast(char) (0x80 | (b & 0x3f));
                }
                else if (need == 3)
                {
                        p[0] = cast(char) (0xe0 | (b >> 12));
                        p[1] = cast(char) (0x80 | ((b >> 6) & 0x3f));
                        p[2] = cast(char) (0x80 | (b & 0x3f));
                }
                else
                {
                        p[0] = cast(char) (0xf0 | (b >> 18));
                        p[1] = cast(char) (0x80 | ((b >> 12) & 0x3f));
                        p[2] = cast(char) (0x80 | ((b >> 6) & 0x3f));
                        p[3] = cast(char) (0x80 | (b & 0x3f));

                        // the trailing-surrogate has been consumed as well
                        ++i;
                }

                used += need;
        }

        // return the produced output
        return output [0 .. used];
}
141
/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Params:
        input  = the utf8 content to convert
        output = optional destination buffer. When 'ate' is null it is
                 grown to at least input.length elements; when 'ate' is
                 provided it is never resized
        ate    = when non-null, selects streaming mode and receives the
                 number of input bytes consumed

        Four-byte sequences are emitted as a surrogate pair. The input is
        never read beyond its final element: a sequence truncated by the
        end of input is reported via onUnicodeError(), or left unconsumed
        when streaming. Lead and continuation bytes are not otherwise
        validated.

*******************************************************************************/

wchar[] toString16 (char[] input, wchar[] output=null, uint* ate=null)
{
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        size_t consumed = 0;
        size_t produced = 0;

        while (consumed < input.length)
        {
                uint b = input[consumed];

                // number of continuation bytes following the lead byte
                uint extra = 0;

                if (b & 0x80)
                {
                        if (b < 0xe0)
                        {
                                extra = 1;
                                b &= 0x1f;
                        }
                        else if (b < 0xf0)
                        {
                                extra = 2;
                                b &= 0x0f;
                        }
                        else
                        {
                                extra = 3;
                                b &= 0x07;
                        }
                }

                // is the whole sequence present? Return tail or throw error
                if (input.length - consumed <= extra)
                {
                        if (ate is null)
                            onUnicodeError ("Unicode.toString16 : incomplete utf8 input", consumed);
                        break;
                }

                for (uint k = 1; k <= extra; ++k)
                     b = (b << 6) | (input[consumed + k] & 0x3f);

                if (b >= 0x110000)
                {
                        // break is a safeguard; the handler presumably throws
                        onUnicodeError ("Unicode.toString16 : invalid utf8 input", consumed);
                        break;
                }

                // values beyond the BMP need a surrogate pair
                uint units = (b < 0x10000) ? 1 : 2;

                if (output.length - produced < units)
                {
                        // Output exhausted. When not streaming the buffer holds
                        // at least input.length elements and a sequence never
                        // yields more units than it has bytes, so this should
                        // never happen!
                        if (ate is null)
                            onUnicodeError ("Unicode.toString16 : utf8 overflow", consumed);
                        break;
                }

                if (units == 1)
                    output[produced] = cast(wchar) b;
                else
                {
                        b -= 0x10000;
                        output[produced]     = cast(wchar) (0xd800 | (b >> 10));
                        output[produced + 1] = cast(wchar) (0xdc00 | (b & 0x3ff));
                }

                produced += units;
                consumed += extra + 1;
        }

        // report how much input was consumed
        if (ate)
            *ate = cast(uint) consumed;

        // return the produced output
        return output [0 .. produced];
}
222
223
/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six
        byte variations are not supported). Throws an exception
        where the input dchar is greater than 0x10ffff.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Params:
        input  = the utf32 content to convert
        output = optional destination buffer. When 'ate' is null it is
                 grown as required; when 'ate' is provided it is never
                 resized
        ate    = when non-null, selects streaming mode and receives the
                 number of input elements consumed. Conversion stops as
                 soon as fewer than four bytes of output remain

*******************************************************************************/

char[] toString (dchar[] input, char[] output=null, uint* ate=null)
{
        if (ate)
            *ate = cast(uint) input.length;
        else
        {
                // potentially reallocate output
                size_t estimate = input.length * 2 + 4;
                if (output.length < estimate)
                    output.length = estimate;
        }

        char* pOut = output.ptr;
        char* pEnd = pOut + output.length;

        foreach (size_t eaten, dchar b; input)
        {
                // About to overflow the output? Comparing the remaining
                // space (rather than forming 'end - 4') stays well defined
                // for buffers shorter than four bytes, including null.
                if (pEnd - pOut < 4)
                {
                        // if streaming, just return the unused input
                        if (ate)
                        {
                                *ate = cast(uint) eaten;
                                break;
                        }

                        // Reallocate the output buffer. The extra four bytes
                        // guarantee room for the sequence about to be written
                        // even when 'len / 2' is small.
                        size_t len = pOut - output.ptr;
                        output.length = len + len / 2 + 4;
                        pOut = output.ptr + len;
                        pEnd = output.ptr + output.length;
                }

                if (b < 0x80)
                    *pOut++ = cast(char) b;
                else if (b < 0x0800)
                {
                        pOut[0] = cast(char) (0xc0 | ((b >> 6) & 0x3f));
                        pOut[1] = cast(char) (0x80 | (b & 0x3f));
                        pOut += 2;
                }
                else if (b < 0x10000)
                {
                        pOut[0] = cast(char) (0xe0 | ((b >> 12) & 0x3f));
                        pOut[1] = cast(char) (0x80 | ((b >> 6) & 0x3f));
                        pOut[2] = cast(char) (0x80 | (b & 0x3f));
                        pOut += 3;
                }
                else if (b < 0x110000)
                {
                        pOut[0] = cast(char) (0xf0 | ((b >> 18) & 0x3f));
                        pOut[1] = cast(char) (0x80 | ((b >> 12) & 0x3f));
                        pOut[2] = cast(char) (0x80 | ((b >> 6) & 0x3f));
                        pOut[3] = cast(char) (0x80 | (b & 0x3f));
                        pOut += 4;
                }
                else
                    onUnicodeError ("Unicode.toString : invalid dchar", eaten);
        }

        // return the produced output
        return output [0 .. (pOut - output.ptr)];
}
307
308
/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Params:
        input  = the utf8 content to convert
        output = optional destination buffer. When 'ate' is null it is
                 grown to at least input.length elements; when 'ate' is
                 provided it is never resized
        ate    = when non-null, selects streaming mode and receives the
                 number of input bytes consumed

        The input is never read beyond its final element: a sequence
        truncated by the end of input is reported via onUnicodeError(),
        or left unconsumed when streaming. Decoded values of 0x110000
        and above are reported via onUnicodeError(). Lead and continuation
        bytes are not otherwise validated.

*******************************************************************************/

dchar[] toString32 (char[] input, dchar[] output=null, uint* ate=null)
{
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        size_t consumed = 0;
        size_t produced = 0;

        while (consumed < input.length)
        {
                if (produced >= output.length)
                {
                        // Output exhausted. When not streaming the buffer
                        // holds at least input.length elements, so this
                        // should never happen!
                        if (ate is null)
                            onUnicodeError ("Unicode.toString32 : utf8 overflow", consumed);
                        break;
                }

                uint b = input[consumed];

                // number of continuation bytes following the lead byte
                uint extra = 0;

                if (b & 0x80)
                {
                        if (b < 0xe0)
                        {
                                extra = 1;
                                b &= 0x1f;
                        }
                        else if (b < 0xf0)
                        {
                                extra = 2;
                                b &= 0x0f;
                        }
                        else
                        {
                                extra = 3;
                                b &= 0x07;
                        }
                }

                // Is the whole sequence present? This is decided before any
                // continuation byte is touched, so the result never depends
                // on memory lying beyond the input.
                if (input.length - consumed <= extra)
                {
                        // return tail or throw error?
                        if (ate is null)
                            onUnicodeError ("Unicode.toString32 : incomplete utf8 input", consumed);
                        break;
                }

                for (uint k = 1; k <= extra; ++k)
                     b = (b << 6) | (input[consumed + k] & 0x3f);

                if (b >= 0x110000)
                {
                        // break is a safeguard; the handler presumably throws
                        onUnicodeError ("Unicode.toString32 : invalid utf8 input", consumed);
                        break;
                }

                output[produced] = cast(dchar) b;
                ++produced;
                consumed += extra + 1;
        }

        // report how much input was consumed
        if (ate)
            *ate = cast(uint) consumed;

        // return the produced output
        return output [0 .. produced];
}
397
/*******************************************************************************

        Encode Utf16 up to a maximum of 2 code units (wchars) long.
        Throws an exception where the input dchar is greater than
        0x10ffff.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Params:
        input  = the utf32 content to convert
        output = optional destination buffer. When 'ate' is null it is
                 grown as required; when 'ate' is provided it is never
                 resized
        ate    = when non-null, selects streaming mode and receives the
                 number of input elements consumed. Conversion stops as
                 soon as fewer than two elements of output remain

*******************************************************************************/

wchar[] toString16 (dchar[] input, wchar[] output=null, uint* ate=null)
{
        if (ate)
            *ate = cast(uint) input.length;
        else
        {
                size_t estimate = input.length * 2 + 2;
                if (output.length < estimate)
                    output.length = estimate;
        }

        wchar* pOut = output.ptr;
        wchar* pEnd = pOut + output.length;

        foreach (size_t eaten, dchar b; input)
        {
                // About to overflow the output? Comparing the remaining
                // space (rather than forming 'end - 2') stays well defined
                // for buffers shorter than two elements, including null.
                if (pEnd - pOut < 2)
                {
                        // if streaming, just return the unused input
                        if (ate)
                        {
                                *ate = cast(uint) eaten;
                                break;
                        }

                        // reallocate the output buffer, always leaving room
                        // for the pair which may be written next
                        size_t len = pOut - output.ptr;
                        output.length = len + len / 2 + 2;
                        pOut = output.ptr + len;
                        pEnd = output.ptr + output.length;
                }

                if (b < 0x10000)
                    *pOut++ = cast(wchar) b;
                else if (b < 0x110000)
                {
                        // split into leading and trailing surrogates
                        uint v = b - 0x10000;
                        pOut[0] = cast(wchar) (0xd800 | ((v >> 10) & 0x3ff));
                        pOut[1] = cast(wchar) (0xdc00 | (v & 0x3ff));
                        pOut += 2;
                }
                else
                    onUnicodeError ("Unicode.toString16 : invalid dchar", eaten);
        }

        // return the produced output
        return output [0 .. (pOut - output.ptr)];
}
462
/*******************************************************************************

        Decode Utf16 produced by the above toString16() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

        Params:
        input  = the utf16 content to convert
        output = optional destination buffer. When 'ate' is null it is
                 grown to at least input.length elements; when 'ate' is
                 provided it is never resized
        ate    = when non-null, selects streaming mode and receives the
                 number of input elements consumed

        The input is never read beyond its final element. A leading
        surrogate at the very end of the input is reported via
        onUnicodeError(), or left unconsumed when streaming. A
        trailing-surrogate without a leader, or a leader which is not
        followed by a trailing-surrogate, is always reported via
        onUnicodeError().

*******************************************************************************/

dchar[] toString32 (wchar[] input, dchar[] output=null, uint* ate=null)
{
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        size_t consumed = 0;
        size_t produced = 0;

        while (consumed < input.length)
        {
                if (produced >= output.length)
                {
                        // Output exhausted. When not streaming the buffer
                        // holds at least input.length elements, so this
                        // should never happen!
                        if (ate is null)
                            onUnicodeError ("Unicode.toString32 : utf16 overflow", consumed);
                        break;
                }

                uint b = input[consumed];
                uint units = 1;

                if (b >= 0xd800 && b <= 0xdfff)
                {
                        // The breaks following onUnicodeError() are only a
                        // safeguard; the handler is presumably expected to
                        // throw (its definition is not visible here).

                        // a trailing-surrogate may not start a pair
                        if (b > 0xdbff)
                        {
                                onUnicodeError ("Unicode.toString32 : invalid utf16 input", consumed);
                                break;
                        }

                        // is the second half available? Return tail or throw
                        if (consumed + 1 >= input.length)
                        {
                                if (ate is null)
                                    onUnicodeError ("Unicode.toString32 : incomplete utf16 input", consumed);
                                break;
                        }

                        uint lo = input[consumed + 1];
                        if (lo < 0xdc00 || lo > 0xdfff)
                        {
                                onUnicodeError ("Unicode.toString32 : invalid utf16 input", consumed);
                                break;
                        }

                        // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35
                        // (a validated pair cannot exceed 0x10ffff)
                        b = ((b - 0xd7c0) << 10) + (lo - 0xdc00);
                        units = 2;
                }

                output[produced] = cast(dchar) b;
                ++produced;
                consumed += units;
        }

        // report how much input was consumed
        if (ate)
            *ate = cast(uint) consumed;

        // return the produced output
        return output [0 .. produced];
}
531
532