Mercurial > projects > ldc
comparison tango/tango/text/convert/Utf.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!!
Initial commit after moving to Tango instead of Phobos.
Lots of bugfixes...
This build is not suitable for most things.
author | lindquist |
---|---|
date | Fri, 11 Jan 2008 17:57:40 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
131:5825d48b27d1 | 132:1700239cab2e |
---|---|
1 /******************************************************************************* | |
2 | |
3 copyright: Copyright (c) 2004 Kris Bell. All rights reserved | |
4 | |
5 license: BSD style: $(LICENSE) | |
6 | |
7 version: Initial release: Oct 2004 | |
8 | |
9 authors: Kris | |
10 | |
11 Fast Unicode transcoders. These are particularly sensitive to | |
12 minor changes on 32bit x86 devices, because the register set of | |
13 those devices is so small. Beware of subtle changes which might | |
14 extend the execution-period by as much as 200%. Because of this, | |
15 three of the six transcoders might read past the end of input by | |
16 one, two, or three bytes before arresting themselves. Note that | |
17 support for streaming adds a 15% overhead to the dchar => char | |
18 conversion, but has little effect on the others. | |
19 | |
20 These routines were tuned on an Intel P4; other devices may work | |
21 more efficiently with a slightly different approach, though this | |
22 is likely to be reasonably optimal on AMD x86 CPUs also. These | |
23 algorithms would benefit significantly from those extra AMD64 | |
24 registers. On a 3GHz P4, the dchar/char conversions take around | |
25 2500ns to process an array of 1000 ASCII elements. Invoking the | |
26 memory manager doubles that period, and quadruples the time for | |
27 arrays of 100 elements. Memory allocation can slow down notably | |
28 in a multi-threaded environment, so avoid that where possible. | |
29 | |
30 Surrogate-pairs are dealt with in a non-optimal fashion when | |
31 transcoding between utf16 and utf8. Such cases are considered | |
32 to be boundary-conditions for this module. | |
33 | |
34 There are three common cases where the input may be incomplete, | |
35 including each 'widening' case of utf8 => utf16, utf8 => utf32, | |
36 and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate | |
37 pairs are present. Such cases will throw an exception, unless | |
38 streaming-mode is enabled ~ in the latter mode, an additional | |
39 integer is returned indicating how many elements of the input | |
40 have been consumed. In all cases, a correct slice of the output | |
41 is returned. | |
42 | |
43 For details on Unicode processing see: | |
44 $(UL $(LINK http://www.utf-8.com/)) | |
45 $(UL $(LINK http://www.hackcraft.net/xmlUnicode/)) | |
46 $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/)) | |
47 $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/)) | |
48 | |
49 *******************************************************************************/ | |
50 | |
51 module tango.text.convert.Utf; | |
52 | |
53 public extern (C) void onUnicodeError (char[] msg, size_t idx = 0); | |
54 | |
55 | |
/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six byte
        variations are not supported).

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Where 'ate' is non-null the routine operates in streaming
        mode: the output is never resized, conversion stops as soon
        as fewer than three bytes of output remain, and *ate is set
        to the number of input elements which were consumed.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.
        For example:

        ---
        char[] output;

        char[] result = toString (input, output);

        // reset output after a realloc
        if (result.length > output.length)
            output = result;
        ---

*******************************************************************************/

char[] toString (wchar[] input, char[] output=null, uint* ate=null)
{
        if (ate)
           {
           *ate = input.length;

           // streaming never resizes the output, and fewer than three
           // bytes of room cannot accept anything. Leave before forming
           // 'output.ptr + output.length - 3', which would wrap around
           // for a null buffer and defeat the overflow test below
           if (output.length < 3)
              {
              *ate = 0;
              return output [0..0];
              }
           }
        else
           {
           // potentially reallocate output
           int estimate = input.length * 2 + 3;
           if (output.length < estimate)
               output.length = estimate;
           }

        char* pOut = output.ptr;

        // last position at which a full three-byte sequence still fits
        char* pMax = pOut + output.length - 3;

        foreach (int eaten, wchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer, always keeping room
                   // for at least one more three-byte sequence
                   int len = pOut - output.ptr;
                   output.length = len + len / 2 + 3;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 3;
                   }

                if (b < 0x80)
                    *pOut++ = b;
                else
                   if (b < 0x0800)
                      {
                      pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
                      pOut[1] = 0x80 | (b & 0x3f);
                      pOut += 2;
                      }
                   else
                      if (b < 0xd800 || b > 0xdfff)
                         {
                         pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
                         pOut[1] = 0x80 | ((b >> 6) & 0x3f);
                         pOut[2] = 0x80 | (b & 0x3f);
                         pOut += 3;
                         }
                      else
                         {
                         // deal with surrogate-pairs: redo the whole input
                         // via utf32. toString32 only sizes its own output
                         // when not streaming, so hand it a scratch buffer
                         // (one dchar per wchar is always enough)
                         dchar[] wide = toString32 (input, new dchar[input.length], ate);

                         if (ate is null)
                             return toString (wide, output);

                         // streaming: respect the caller's fixed buffer
                         uint   used;
                         char[] result = toString (wide, output, &used);

                         if (used < wide.length)
                            {
                            // the output filled up early ~ translate the
                            // number of characters emitted back into
                            // wchars, stepping exactly as toString32 does
                            uint units = 0;
                            for (uint i = 0; i < used; ++i)
                                 units += (input[units] >= 0xd800 && input[units] <= 0xdfff) ? 2 : 1;
                            *ate = units;
                            }
                         return result;
                         }
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
141 | |
/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Where 'ate' is non-null the routine operates in streaming
        mode: the output is not resized up front, *ate is set to the
        number of input bytes consumed, and an incomplete sequence
        at the end of the input is left unconsumed instead of
        raising an error.

        Note that a multi-byte sequence is decoded before its length
        is checked against the input, so up to two bytes beyond the
        end of a truncated input may be read.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

wchar[] toString16 (char[] input, wchar[] output=null, uint* ate=null)
{
        int   produced;
        char* pIn = input.ptr;
        char* pMax = pIn + input.length;
        char* pValid;

        // when not streaming, make room for one wchar per input byte
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
            foreach (inout wchar d; output)
                    {
                    // remember where this sequence began, for rollback
                    pValid = pIn;
                    wchar b = cast(wchar) *pIn;

                    if (b & 0x80)
                       {
                       if (b < 0xe0)
                          {
                          // two-byte sequence
                          b &= 0x1f;
                          b = (b << 6) | (*++pIn & 0x3f);
                          }
                       else
                          if (b < 0xf0)
                             {
                             // three-byte sequence
                             b &= 0x0f;
                             b = (b << 6) | (pIn[1] & 0x3f);
                             b = (b << 6) | (pIn[2] & 0x3f);
                             pIn += 2;
                             }
                          else
                             {
                             // deal with surrogate-pairs: redo the whole
                             // input via utf32. toString32 only sizes its
                             // own output when not streaming, so hand it a
                             // scratch buffer (one dchar per byte suffices)
                             dchar[] wide = toString32 (input, new dchar[input.length], ate);

                             if (ate is null)
                                 return toString16 (wide, output);

                             // streaming: respect the caller's fixed buffer
                             uint    used;
                             wchar[] result = toString16 (wide, output, &used);

                             if (used < wide.length)
                                {
                                // the output filled up early ~ translate
                                // the number of characters emitted back
                                // into input bytes, stepping by lead byte
                                // exactly as the decoders do
                                uint bytes = 0;
                                for (uint i = 0; i < used; ++i)
                                    {
                                    char lead = input [bytes];
                                    bytes += (lead < 0x80) ? 1 : (lead < 0xe0) ? 2 : (lead < 0xf0) ? 3 : 4;
                                    }
                                *ate = bytes;
                                }
                             return result;
                             }
                       }

                    d = b;
                    ++produced;

                    // did we read past the end of the input?
                    if (++pIn >= pMax)
                       {
                       if (pIn > pMax)
                          {
                          // yep ~ return tail or throw error?
                          if (ate)
                             {
                             // discard the partial character just stored
                             pIn = pValid;
                             --produced;
                             break;
                             }
                          onUnicodeError ("Unicode.toString16 : incomplete utf8 input", pIn - input.ptr);
                          }
                       else
                          // input consumed exactly
                          break;
                       }
                    }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString16 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
222 | |
223 | |
/*******************************************************************************

        Encode Utf8 up to a maximum of 4 bytes long (five & six
        byte variations are not supported). Throws an exception
        where the input dchar is greater than 0x10ffff.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Where 'ate' is non-null the routine operates in streaming
        mode: the output is never resized, conversion stops as soon
        as fewer than four bytes of output remain, and *ate is set
        to the number of input elements which were consumed.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

char[] toString (dchar[] input, char[] output=null, uint* ate=null)
{
        if (ate)
           {
           *ate = input.length;

           // streaming never resizes the output, and fewer than four
           // bytes of room cannot accept anything. Leave before forming
           // 'output.ptr + output.length - 4', which would wrap around
           // for a null buffer and defeat the overflow test below
           if (output.length < 4)
              {
              *ate = 0;
              return output [0..0];
              }
           }
        else
           {
           // potentially reallocate output
           int estimate = input.length * 2 + 4;
           if (output.length < estimate)
               output.length = estimate;
           }

        char* pOut = output.ptr;

        // last position at which a full four-byte sequence still fits
        char* pMax = pOut + output.length - 4;

        foreach (int eaten, dchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer. Growing by half alone
                   // is not enough for small buffers (7 bytes used would
                   // become 10, one short of a four-byte sequence), so
                   // always add the four-byte reserve on top
                   int len = pOut - output.ptr;
                   output.length = len + len / 2 + 4;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 4;
                   }

                if (b < 0x80)
                    *pOut++ = b;
                else
                   if (b < 0x0800)
                      {
                      pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
                      pOut[1] = 0x80 | (b & 0x3f);
                      pOut += 2;
                      }
                   else
                      if (b < 0x10000)
                         {
                         pOut[0] = 0xe0 | ((b >> 12) & 0x0f);
                         pOut[1] = 0x80 | ((b >> 6) & 0x3f);
                         pOut[2] = 0x80 | (b & 0x3f);
                         pOut += 3;
                         }
                      else
                         if (b < 0x110000)
                            {
                            pOut[0] = 0xf0 | ((b >> 18) & 0x07);
                            pOut[1] = 0x80 | ((b >> 12) & 0x3f);
                            pOut[2] = 0x80 | ((b >> 6) & 0x3f);
                            pOut[3] = 0x80 | (b & 0x3f);
                            pOut += 4;
                            }
                         else
                            onUnicodeError ("Unicode.toString : invalid dchar", eaten);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
307 | |
308 | |
/*******************************************************************************

        Decode Utf8 produced by the above toString() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Where 'ate' is non-null the routine operates in streaming
        mode: the output is not resized, *ate is set to the number
        of input bytes consumed, and an incomplete sequence at the
        end of the input is left unconsumed instead of raising an
        error.

        Note that a multi-byte sequence is decoded before its length
        is checked against the input, so up to three bytes beyond
        the end of a truncated input may be read.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

dchar[] toString32 (char[] input, dchar[] output=null, uint* ate=null)
{
        int   produced;
        char* pIn = input.ptr;
        char* pMax = pIn + input.length;
        char* pValid;

        // when not streaming, make room for one dchar per input byte
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
            foreach (inout dchar d; output)
                    {
                    // remember where this sequence began, for rollback
                    pValid = pIn;
                    dchar b = cast(dchar) *pIn;

                    if (b & 0x80)
                       {
                       if (b < 0xe0)
                          {
                          // two-byte sequence
                          b &= 0x1f;
                          b = (b << 6) | (*++pIn & 0x3f);
                          }
                       else
                          if (b < 0xf0)
                             {
                             // three-byte sequence
                             b &= 0x0f;
                             b = (b << 6) | (pIn[1] & 0x3f);
                             b = (b << 6) | (pIn[2] & 0x3f);
                             pIn += 2;
                             }
                          else
                             {
                             // four-byte sequence
                             b &= 0x07;
                             b = (b << 6) | (pIn[1] & 0x3f);
                             b = (b << 6) | (pIn[2] & 0x3f);
                             b = (b << 6) | (pIn[3] & 0x3f);

                             // only trust the range check when all four
                             // bytes lie inside the input. A truncated
                             // sequence was decoded from whatever follows
                             // the input, and is dealt with by the
                             // incomplete-input test further down
                             if (b >= 0x110000 && pMax - pIn >= 4)
                                 onUnicodeError ("Unicode.toString32 : invalid utf8 input", pIn - input.ptr);
                             pIn += 3;
                             }
                       }

                    d = b;
                    ++produced;

                    // did we read past the end of the input?
                    if (++pIn >= pMax)
                       {
                       if (pIn > pMax)
                          {
                          // yep ~ return tail or throw error?
                          if (ate)
                             {
                             // discard the partial character just stored
                             pIn = pValid;
                             --produced;
                             break;
                             }
                          onUnicodeError ("Unicode.toString32 : incomplete utf8 input", pIn - input.ptr);
                          }
                       else
                          // input consumed exactly
                          break;
                       }
                    }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf8 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
397 | |
/*******************************************************************************

        Encode Utf16 up to a maximum of two wchars (one surrogate
        pair) per character. Throws an exception where the input
        dchar is greater than 0x10ffff.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Where 'ate' is non-null the routine operates in streaming
        mode: the output is never resized, conversion stops as soon
        as fewer than two wchars of output remain, and *ate is set
        to the number of input elements which were consumed.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

wchar[] toString16 (dchar[] input, wchar[] output=null, uint* ate=null)
{
        if (ate)
           {
           *ate = input.length;

           // streaming never resizes the output, and fewer than two
           // wchars of room cannot accept anything. Leave before forming
           // 'output.ptr + output.length - 2', which would wrap around
           // for a null buffer and defeat the overflow test below
           if (output.length < 2)
              {
              *ate = 0;
              return output [0..0];
              }
           }
        else
           {
           // potentially reallocate output
           int estimate = input.length * 2 + 2;
           if (output.length < estimate)
               output.length = estimate;
           }

        wchar* pOut = output.ptr;

        // last position at which a full surrogate pair still fits
        wchar* pMax = pOut + output.length - 2;

        foreach (int eaten, dchar b; input)
                {
                // about to overflow the output?
                if (pOut > pMax)
                   {
                   // if streaming, just return the unused input
                   if (ate)
                      {
                      *ate = eaten;
                      break;
                      }

                   // reallocate the output buffer, always keeping room
                   // for at least one more surrogate pair
                   int len = pOut - output.ptr;
                   output.length = len + len / 2 + 2;
                   pOut = output.ptr + len;
                   pMax = output.ptr + output.length - 2;
                   }

                if (b < 0x10000)
                    *pOut++ = b;
                else
                   if (b < 0x110000)
                      {
                      // split into a high and a low surrogate
                      pOut[0] = 0xd800 | (((b - 0x10000) >> 10) & 0x3ff);
                      pOut[1] = 0xdc00 | ((b - 0x10000) & 0x3ff);
                      pOut += 2;
                      }
                   else
                      onUnicodeError ("Unicode.toString16 : invalid dchar", eaten);
                }

        // return the produced output
        return output [0..(pOut - output.ptr)];
}
462 | |
/*******************************************************************************

        Decode Utf16 produced by the above toString16() method.

        If the output is provided off the stack, it should be large
        enough to encompass the entire transcoding; failing to do
        so will cause the output to be moved onto the heap instead.

        Where 'ate' is non-null the routine operates in streaming
        mode: the output is not resized, *ate is set to the number
        of input wchars consumed, and an incomplete surrogate pair
        at the end of the input is left unconsumed instead of
        raising an error.

        Note that the second half of a surrogate pair is fetched
        before its presence is checked against the input, so one
        wchar beyond the end of a truncated input may be read.

        Returns a slice of the output buffer, corresponding to the
        converted characters. For optimum performance, the returned
        buffer should be specified as 'output' on subsequent calls.

*******************************************************************************/

dchar[] toString32 (wchar[] input, dchar[] output=null, uint* ate=null)
{
        int    produced;
        wchar* pIn = input.ptr;
        wchar* pMax = pIn + input.length;
        wchar* pValid;

        // when not streaming, make room for one dchar per input wchar
        if (ate is null)
            if (input.length > output.length)
                output.length = input.length;

        if (input.length)
            foreach (inout dchar d; output)
                    {
                    // remember where this character began, for rollback
                    pValid = pIn;
                    dchar b = cast(dchar) *pIn;

                    // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35
                    if (b >= 0xd800 && b <= 0xdfff)
                        b = ((b - 0xd7c0) << 10) + (*++pIn - 0xdc00);

                    // pIn now addresses the last unit consumed. Only trust
                    // the range check when that unit lies inside the input:
                    // a truncated pair was combined with whatever follows
                    // the input, and is dealt with by the incomplete-input
                    // test below
                    if (b >= 0x110000 && pIn < pMax)
                        onUnicodeError ("Unicode.toString32 : invalid utf16 input", pIn - input.ptr);

                    d = b;
                    ++produced;

                    // did we read past the end of the input?
                    if (++pIn >= pMax)
                       {
                       if (pIn > pMax)
                          {
                          // yep ~ return tail or throw error?
                          if (ate)
                             {
                             // discard the partial character just stored
                             pIn = pValid;
                             --produced;
                             break;
                             }
                          onUnicodeError ("Unicode.toString32 : incomplete utf16 input", pIn - input.ptr);
                          }
                       else
                          // input consumed exactly
                          break;
                       }
                    }

        // do we still have some input left?
        if (ate)
            *ate = pIn - input.ptr;
        else
           if (pIn < pMax)
               // this should never happen!
               onUnicodeError ("Unicode.toString32 : utf16 overflow", pIn - input.ptr);

        // return the produced output
        return output [0..produced];
}
531 | |
532 |