comparison base/src/java/mangoicu/UConverter.d @ 27:1bf55a6eb092

Renamed java tree to base
author Frank Benoit <benoit@tionex.de>
date Sat, 21 Mar 2009 11:33:57 +0100
parents java/src/java/mangoicu/UConverter.d@dbfb303e8fb0
children
comparison
equal deleted inserted replaced
26:f589fc20a5f9 27:1bf55a6eb092
1 /*******************************************************************************
2
3 @file UConverter.d
4
5 Copyright (c) 2004 Kris Bell
6
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the authors be held liable for damages
9 of any kind arising from the use of this software.
10
11 Permission is hereby granted to anyone to use this software for any
12 purpose, including commercial applications, and to alter it and/or
13 redistribute it freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must
16 not claim that you wrote the original software. If you use this
17 software in a product, an acknowledgment within documentation of
18 said product would be appreciated but is not required.
19
20 2. Altered source versions must be plainly marked as such, and must
21 not be misrepresented as being the original software.
22
23 3. This notice may not be removed or altered from any distribution
24 of the source.
25
26 4. Derivative works are permitted, but they must carry this notice
27 in full and credit the original source.
28
29
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31
32
33 @version Initial version, October 2004
34 @author Kris
35
36 Note that this package and documentation is built around the ICU
37 project (http://oss.software.ibm.com/icu/). Below is the license
38 statement as specified by that software:
39
40
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42
43
44 ICU License - ICU 1.8.1 and later
45
46 COPYRIGHT AND PERMISSION NOTICE
47
48 Copyright (c) 1995-2003 International Business Machines Corporation and
49 others.
50
51 All rights reserved.
52
53 Permission is hereby granted, free of charge, to any person obtaining a
54 copy of this software and associated documentation files (the
55 "Software"), to deal in the Software without restriction, including
56 without limitation the rights to use, copy, modify, merge, publish,
57 distribute, and/or sell copies of the Software, and to permit persons
58 to whom the Software is furnished to do so, provided that the above
59 copyright notice(s) and this permission notice appear in all copies of
60 the Software and that both the above copyright notice(s) and this
61 permission notice appear in supporting documentation.
62
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
72
73 Except as contained in this notice, the name of a copyright holder
74 shall not be used in advertising or otherwise to promote the sale, use
75 or other dealings in this Software without prior written authorization
76 of the copyright holder.
77
78 ----------------------------------------------------------------------
79
80 All trademarks and registered trademarks mentioned herein are the
81 property of their respective owners.
82
83 *******************************************************************************/
84
85 module java.mangoicu.UConverter;
86
87 private import java.mangoicu.ICU;
88
89 /*******************************************************************************
90
91 *******************************************************************************/
92
// Progress counters filled in by the streaming encode() & decode()
// methods and by ITranscoder.convert().
struct UAdjust
{
    uint input;     // how much was read from the input
    uint output;    // how much was written to the output
}
98
99 /*******************************************************************************
100
101 *******************************************************************************/
102
// Contract for objects that convert data from one codepage to another.
// UConverter.createTranscoder() hands out the implementation.
interface ITranscoder
{
    // Make the next convert() call start a fresh conversion.
    void reset ();

    // Convert from input into output and report through x how much of
    // each buffer was used. The implementation in this file returns
    // true when ICU signals a buffer overflow, false otherwise.
    bool convert (void[] input, void[] output, inout UAdjust x, bool flush);
}
109
110 /*******************************************************************************
111
112 This API is used to convert codepage or character encoded data to
113 and from UTF-16. You can open a converter with ucnv_open(). With
114 that converter, you can get its properties, set options, convert
115 your data and close the converter.
116
117 Since many software programs recognize different converter names
118 for different types of converters, there are other functions in
119 this API to iterate over the converter aliases.
120
121 See <A HREF="http://oss.software.ibm.com/icu/apiref/ucnv_8h.html">
122 this page</A> for full details.
123
124 *******************************************************************************/
125
126 class UConverter : ICU
127 {
128 private Handle handle; // ICU converter handle: set from ucnv_open() in the constructor, passed to ucnv_close() in the destructor
129
130
131
132 /***********************************************************************
133
134 Creates a UConverter object with the names specified as a
135 string.
136
137 The actual name will be resolved with the alias file using
138 a case-insensitive string comparison that ignores delimiters
139 '-', '_', and ' ' (dash, underscore, and space). E.g., the
140 names "UTF8", "utf-8", and "Utf 8" are all equivalent. If null
141 is passed for the converter name, it will create one with the
142 getDefaultName() return value.
143
144 A converter name may contain options like a locale specification
145 to control the specific behavior of the converter instantiated.
146 The meaning of the options depends on the particular converter:
147 if an option is not defined for or recognized, it is ignored.
148
149 Options are appended to the converter name string, with an
150 OptionSepChar between the name and the first option and also
151 between adjacent options.
152
153 The conversion behavior and names can vary between platforms,
154 and ICU may convert some characters differently from other
155 platforms. Details on this topic are in the User's Guide.
156
157 ***********************************************************************/
158
this (char[] name)
{
    // status code that ICU fills in through the inout parameter
    UErrorCode status;

    handle = ucnv_open (toString (name), status);

    // report the requested name so the caller can see which one failed
    if (isError (status))
    {
        exception ("failed to create converter for '"~name~"'");
    }
}
167
168 /***********************************************************************
169
170 Deletes the unicode converter and releases resources
171 associated with just this instance. Does not free up
172 shared converter tables.
173
174 ***********************************************************************/
175
~this ()
{
    // hand the converter obtained in the constructor back to ICU
    ucnv_close (handle);
}
180
181 /***********************************************************************
182
183 Do a fuzzy compare of two converter/alias names. The
184 comparison is case-insensitive. It also ignores the
185 characters '-', '_', and ' ' (dash, underscore, and space).
186 Thus the strings "UTF-8", "utf_8", and "Utf 8" are exactly
187 equivalent
188
189 ***********************************************************************/
190
static final int compareNames (char[] a, char[] b)
{
    // convert each name to the form ucnv_compareNames() takes
    auto first  = toString (a);
    auto second = toString (b);

    // the comparison result from ICU is passed through untouched
    return ucnv_compareNames (first, second);
}
195
196 /***********************************************************************
197
198 Resets the state of this converter to the default state.
199
200 This is used in the case of an error, to restart a
201 conversion from a known default state. It will also
202 empty the internal output buffers.
203
204 ***********************************************************************/
205
void reset ()
{
    // plain delegation: ICU resets the converter behind this handle
    ucnv_reset (handle);
}
210
211 /***********************************************************************
212
213 Resets the to-Unicode part of this converter state to the
214 default state.
215
216 This is used in the case of an error to restart a conversion
217 to Unicode to a known default state. It will also empty the
218 internal output buffers used for the conversion to Unicode
219 codepoints.
220
221 ***********************************************************************/
222
void resetDecoder ()
{
    // "decoder" here is the to-Unicode direction of the converter
    ucnv_resetToUnicode (handle);
}
227
228 /***********************************************************************
229
230 Resets the from-Unicode part of this converter state to the
231 default state.
232
233 This is used in the case of an error to restart a conversion
234 from Unicode to a known default state. It will also empty the
235 internal output buffers used for the conversion from Unicode
236 codepoints.
237
238 ***********************************************************************/
239
void resetEncoder ()
{
    // "encoder" here is the from-Unicode direction of the converter
    ucnv_resetFromUnicode (handle);
}
244
245 /***********************************************************************
246
247 Returns the maximum number of bytes that are output per
248 UChar in conversion from Unicode using this converter.
249
250 The returned number can be used to calculate the size of
251 a target buffer for conversion from Unicode.
252
253 This number may not be the same as the maximum number of
254 bytes per "conversion unit". In other words, it may not
255 be the intuitively expected number of bytes per character
256 that would be published for a charset, and may not fulfill
257 any other purpose than the allocation of an output buffer
258 of guaranteed sufficient size for a given input length and
259 converter.
260
261 Examples for special cases that are taken into account:
262
263 * Supplementary code points may convert to more bytes than
264 BMP code points. This function returns bytes per UChar
265 (UTF-16 code unit), not per Unicode code point, for efficient
266 buffer allocation.
267 * State-shifting output (SI/SO, escapes, etc.) from stateful
268 converters.
269 * When m input UChars are converted to n output bytes, then
270 the maximum m/n is taken into account.
271
272 The number returned here does not take into account:
273
274 * callbacks which output more than one charset character
275 sequence per call, like escape callbacks
276 * initial and final non-character bytes that are output by
277 some converters (automatic BOMs, initial escape sequence,
278 final SI, etc.)
279
280 Examples for returned values:
281
282 * SBCS charsets: 1
283 * Shift-JIS: 2
284 * UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
285 * UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
286 * EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
287 * ISO-2022: 3 (always outputs UTF-8)
288 * ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
289 * ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3
290 + DBCS)
291
292 ***********************************************************************/
293
ubyte getMaxCharSize ()
{
    // value comes straight from ICU for this converter
    ubyte maxBytes = ucnv_getMaxCharSize (handle);

    return maxBytes;
}
298
299 /***********************************************************************
300
301 Returns the minimum byte length for characters in this
302 codepage. This is usually either 1 or 2.
303
304 ***********************************************************************/
305
ubyte getMinCharSize ()
{
    // value comes straight from ICU for this converter
    ubyte minBytes = ucnv_getMinCharSize (handle);

    return minBytes;
}
310
311 /***********************************************************************
312
313 Gets the internal, canonical name of the converter (zero-
314 terminated).
315
316 ***********************************************************************/
317
char[] getName ()
{
    UErrorCode status;

    // Fetch the raw C string first and validate the status BEFORE
    // converting it: on failure the pointer may not be usable, and
    // detectSignature() in this class follows the same check-then-
    // convert order.
    char* raw = ucnv_getName (handle, status);
    testError (status, "failed to get converter name");

    // only reached when testError() did not reject the status
    return toArray (raw);
}
326
327 /***********************************************************************
328
329 Determines if the converter contains ambiguous mappings of
330 the same character or not
331
332 ***********************************************************************/
333
bool isAmbiguous ()
{
    // ICU answers with a byte-sized flag; convert it to a D bool
    ubyte flag = ucnv_isAmbiguous (handle);

    return cast(bool) flag;
}
338
339 /***********************************************************************
340
341 Detects Unicode signature byte sequences at the start
342 of the byte stream and returns the charset name of the
343 indicated Unicode charset. A null is returned where no
344 Unicode signature is recognized.
345
346 A caller can create a UConverter using the charset name.
347 The first code unit (wchar) from the start of the stream
348 will be U+FEFF (the Unicode BOM/signature character) and
349 can usually be ignored.
350
351 ***********************************************************************/
352
static final char[] detectSignature (void[] input)
{
    UErrorCode status;
    uint       signatureLength;     // filled in by ICU, not used here

    char* charset = ucnv_detectUnicodeSignature (input.ptr, input.length,
                                                 signatureLength, status);

    // a name is only returned when ICU produced one and did not
    // flag an error; every other outcome yields null
    if (charset != null && !isError (status))
        return toArray (charset);

    return null;
}
364
365 /***********************************************************************
366
367 Converts an array of unicode characters to an array of
368 codepage characters.
369
370 This function is optimized for converting a continuous
371 stream of data in buffer-sized chunks, where the entire
372 source and target does not fit in available buffers.
373
374 The source pointer is an in/out parameter. It starts out
375 pointing where the conversion is to begin, and ends up
376 pointing after the last UChar consumed.
377
378 Target similarly starts out pointing at the first available
379 byte in the output buffer, and ends up pointing after the
380 last byte written to the output.
381
382 The converter always attempts to consume the entire source
383 buffer, unless (1.) the target buffer is full, or (2.) a
384 failing error is returned from the current callback function.
385 When a successful error status has been returned, it means
386 that all of the source buffer has been consumed. At that
387 point, the caller should reset the source and sourceLimit
388 pointers to point to the next chunk.
389
390 At the end of the stream (flush==true), the input is completely
391 consumed when *source==sourceLimit and no error code is set.
392 The converter object is then automatically reset by this
393 function. (This means that a converter need not be reset
394 explicitly between data streams if it finishes the previous
395 stream without errors.)
396
397 This is a stateful conversion. Additionally, even when all
398 source data has been consumed, some data may be in the
399 converters' internal state. Call this function repeatedly,
400 updating the target pointers with the next empty chunk of
401 target in case of a U_BUFFER_OVERFLOW_ERROR, and updating
402 the source pointers with the next chunk of source when a
403 successful error status is returned, until there are no more
404 chunks of source data.
405
406 Parameters:
407
408 converter the Unicode converter
409 target I/O parameter. Input : Points to the
410 beginning of the buffer to copy codepage
411 characters to. Output : points to after
412 the last codepage character copied to
413 target.
414 targetLimit the pointer just after last of the
415 target buffer
416 source I/O parameter, pointer to pointer to
417 the source Unicode character buffer.
418 sourceLimit the pointer just after the last of
419 the source buffer
420 offsets if NULL is passed, nothing will happen
421 to it, otherwise it needs to have the
422 same number of allocated cells as target.
423 Will fill in offsets from target to source
424 pointer e.g: offsets[3] is equal to 6, it
425 means that the target[3] was a result of
426 transcoding source[6] For output data
427 carried across calls, and other data
428 without a specific source character
429 (such as from escape sequences or
430 callbacks) -1 will be placed for offsets.
431 flush set to TRUE if the current source buffer
432 is the last available chunk of the source,
433 FALSE otherwise. Note that if a failing
434 status is returned, this function may
435 have to be called multiple times with
436 flush set to TRUE until the source buffer
437 is consumed.
438
439 ***********************************************************************/
440
bool encode (wchar[] input, void[] output, inout UAdjust x, bool flush)
{
    UErrorCode status;

    // moving cursors (advanced by ICU) and the fixed buffer limits
    wchar* srcPos = input.ptr;
    wchar* srcEnd = srcPos + input.length;
    void*  dstPos = output.ptr;
    void*  dstEnd = dstPos + output.length;

    ucnv_fromUnicode (handle, &dstPos, dstEnd, &srcPos, srcEnd, null, flush, status);

    // progress is the distance each cursor travelled from its buffer start
    x.input  = srcPos - input.ptr;
    x.output = dstPos - output.ptr;

    // any status other than a buffer overflow goes through the normal
    // error check; an overflow is reported to the caller as 'true'
    if (status != UErrorCode.BufferOverflow)
    {
        testError (status, "failed to encode");
        return false;
    }
    return true;
}
459
460 /***********************************************************************
461
462 Encode the Unicode string into a codepage string.
463
464 This function is a more convenient but less powerful version
465 of encode(). It is only useful for whole strings, not
466 for streaming conversion. The maximum output buffer capacity
467 required (barring output from callbacks) should be calculated
468 using getMaxCharSize().
469
470 ***********************************************************************/
471
uint encode (wchar[] input, void[] output)
{
    UErrorCode status;

    // one-shot conversion of the whole input into the output buffer
    uint count = ucnv_fromUChars (handle, output.ptr, output.length,
                                  input.ptr, input.length, status);

    testError (status, "failed to encode");

    // length as reported by ICU
    return count;
}
481
482 /***********************************************************************
483
484 Converts a buffer of codepage bytes into an array of unicode
485 UChars characters.
486
487 This function is optimized for converting a continuous stream
488 of data in buffer-sized chunks, where the entire source and
489 target does not fit in available buffers.
490
491 The source pointer is an in/out parameter. It starts out pointing
492 where the conversion is to begin, and ends up pointing after the
493 last byte of source consumed.
494
495 Target similarly starts out pointing at the first available UChar
496 in the output buffer, and ends up pointing after the last UChar
497 written to the output. It does NOT necessarily keep UChar sequences
498 together.
499
500 The converter always attempts to consume the entire source buffer,
501 unless (1.) the target buffer is full, or (2.) a failing error is
502 returned from the current callback function. When a successful
503 error status has been returned, it means that all of the source
504 buffer has been consumed. At that point, the caller should reset
505 the source and sourceLimit pointers to point to the next chunk.
506
507 At the end of the stream (flush==true), the input is completely
508 consumed when *source==sourceLimit and no error code is set. The
509 converter object is then automatically reset by this function.
510 (This means that a converter need not be reset explicitly between
511 data streams if it finishes the previous stream without errors.)
512
513 This is a stateful conversion. Additionally, even when all source
514 data has been consumed, some data may be in the converters' internal
515 state. Call this function repeatedly, updating the target pointers
516 with the next empty chunk of target in case of a BufferOverflow, and
517 updating the source pointers with the next chunk of source when a
518 successful error status is returned, until there are no more chunks
519 of source data.
520
521 Parameters:
522 converter the Unicode converter
523 target I/O parameter. Input : Points to the beginning
524 of the buffer to copy UChars into. Output :
525 points to after the last UChar copied.
526 targetLimit the pointer just after the end of the target
527 buffer
528 source I/O parameter, pointer to pointer to the source
529 codepage buffer.
530 sourceLimit the pointer to the byte after the end of the
531 source buffer
532 offsets if NULL is passed, nothing will happen to
533 it, otherwise it needs to have the same
534 number of allocated cells as target. Will
535 fill in offsets from target to source pointer
536 e.g: offsets[3] is equal to 6, it means that
537 the target[3] was a result of transcoding
538 source[6] For output data carried across
539 calls, and other data without a specific
540 source character (such as from escape
541 sequences or callbacks) -1 will be placed
542 for offsets.
543 flush set to true if the current source buffer
544 is the last available chunk of the source,
545 false otherwise. Note that if a failing
546 status is returned, this function may have
547 to be called multiple times with flush set
548 to true until the source buffer is consumed.
549
550 ***********************************************************************/
551
bool decode (void[] input, wchar[] output, inout UAdjust x, bool flush)
{
    UErrorCode status;

    // moving cursors (advanced by ICU) and the fixed buffer limits
    void*  srcPos = input.ptr;
    void*  srcEnd = srcPos + input.length;
    wchar* dstPos = output.ptr;
    wchar* dstEnd = dstPos + output.length;

    ucnv_toUnicode (handle, &dstPos, dstEnd, &srcPos, srcEnd, null, flush, status);

    // progress is the distance each cursor travelled from its buffer start
    x.input  = srcPos - input.ptr;
    x.output = dstPos - output.ptr;

    // any status other than a buffer overflow goes through the normal
    // error check; an overflow is reported to the caller as 'true'
    if (status != UErrorCode.BufferOverflow)
    {
        testError (status, "failed to decode");
        return false;
    }
    return true;
}
570
571 /***********************************************************************
572
573 Decode the codepage string into a Unicode string.
574
575 This function is a more convenient but less powerful version
576 of decode(). It is only useful for whole strings, not for
577 streaming conversion. The maximum output buffer capacity
578 required (barring output from callbacks) will be 2*src.length
579 (each char may be converted into a surrogate pair)
580
581 ***********************************************************************/
582
uint decode (void[] input, wchar[] output)
{
    UErrorCode status;

    // one-shot conversion of the whole input into the output buffer
    uint count = ucnv_toUChars (handle, output.ptr, output.length,
                                input.ptr, input.length, status);

    testError (status, "failed to decode");

    // length as reported by ICU
    return count;
}
592
593 /**********************************************************************
594
595 Iterate over the available converter names
596
597 **********************************************************************/
598
static int opApply (int delegate(inout char[] element) dg)
{
    int  result;                            // stays 0 until dg asks to stop
    uint total = ucnv_countAvailable ();
    uint index = 0;

    // visit the names in index order; a non-zero return from the
    // delegate ends the walk early and becomes our return value
    while (result == 0 && index < total)
    {
        char[] current = toArray (ucnv_getAvailableName (index));

        result = dg (current);
        ++index;
    }
    return result;
}
614
615 /***********************************************************************
616
617 ***********************************************************************/
618
ITranscoder createTranscoder (UConverter dst)
{
    // this converter becomes the source side, dst the destination side
    ITranscoder transcoder = new UTranscoder (this, dst);

    return transcoder;
}
623
624 /**********************************************************************
625
626 **********************************************************************/
627
// ITranscoder implementation that drives ucnv_convertEx() with a
// source and a destination converter.
private class UTranscoder : ITranscoder
{
    private UConverter cSrc,            // converter for the input data
                       cDst;            // converter for the output data
    private bool       clear = true;    // passed as the reset flag of ucnv_convertEx()

    /**************************************************************

            Remember the two converters to transcode between.

    **************************************************************/

    this (UConverter src, UConverter dst)
    {
        cSrc = src;
        cDst = dst;
    }

    /**************************************************************

            Request a reset: the next convert() call passes
            reset=true to ucnv_convertEx().

    **************************************************************/

    void reset ()
    {
        clear = true;
    }

    /**************************************************************

            Convert from input into output and record in x how
            much of each buffer was used.

            Returns true when ICU reports a buffer overflow. Any
            other status is handed to testError(), after which
            false is returned.

    **************************************************************/

    bool convert (void[] input, void[] output, inout UAdjust x, bool flush)
    {
        UErrorCode status;

        // moving cursors (advanced by ICU) and the fixed buffer limits
        void* srcPos = input.ptr;
        void* srcEnd = srcPos + input.length;
        void* dstPos = output.ptr;
        void* dstEnd = dstPos + output.length;

        // the four nulls are the pivot buffer arguments
        ucnv_convertEx (cDst.handle, cSrc.handle, &dstPos, dstEnd,
                        &srcPos, srcEnd, null, null, null, null,
                        clear, flush, status);

        // the reset request has been consumed by the call above
        clear = false;

        x.input  = srcPos - input.ptr;
        x.output = dstPos - output.ptr;

        if (status == UErrorCode.BufferOverflow)
            return true;

        // message names this operation; it used to say "decode"
        testError (status, "failed to transcode");
        return false;
    }
}
679
680
681 /***********************************************************************
682
683 Bind the ICU functions from a shared library. This is
684 complicated by the issues regarding D and DLLs on the
685 Windows platform
686
687 ***********************************************************************/
688
689 private static void* library; // value returned by FunctionLoader.bind() in the static constructor; handed to FunctionLoader.unbind() in the static destructor
690
691 /***********************************************************************
692
693 ***********************************************************************/
694
// Function pointers for the ICU converter entry points. They are
// filled in at start-up through the 'targets' table, so each type
// here has to mirror the C prototype in ICU's ucnv.h.
private static extern (C)
{
    int    function (char*, char*) ucnv_compareNames;
    Handle function (char*, inout UErrorCode) ucnv_open;
    char*  function (void*, uint, inout uint, inout UErrorCode) ucnv_detectUnicodeSignature;
    void   function (Handle) ucnv_close;
    void   function (Handle) ucnv_reset;
    // both partial resets return void in ICU (previously declared int)
    void   function (Handle) ucnv_resetToUnicode;
    void   function (Handle) ucnv_resetFromUnicode;
    ubyte  function (Handle) ucnv_getMaxCharSize;
    ubyte  function (Handle) ucnv_getMinCharSize;
    char*  function (Handle, inout UErrorCode) ucnv_getName;
    uint   function (Handle, wchar*, uint, void*, uint, inout UErrorCode) ucnv_toUChars;
    uint   function (Handle, void*, uint, wchar*, uint, inout UErrorCode) ucnv_fromUChars;
    void   function (Handle, void**, void*, wchar**, wchar*, int*, ubyte, inout UErrorCode) ucnv_fromUnicode;
    void   function (Handle, wchar**, wchar*, void**, void*, int*, ubyte, inout UErrorCode) ucnv_toUnicode;
    // pivot arguments are (start, &source, &target, limit): the two
    // middle ones are pointer-to-pointer in ICU; callers in this
    // file pass null for all four
    void   function (Handle, Handle, void**, void*, void**, void*, wchar*, wchar**, wchar**, wchar*, ubyte, ubyte, inout UErrorCode) ucnv_convertEx;
    ubyte  function (Handle) ucnv_isAmbiguous;
    char*  function (uint) ucnv_getAvailableName;
    uint   function () ucnv_countAvailable;
}
716
717 /***********************************************************************
718
719 ***********************************************************************/
720
// Binding table: pairs the address of each function pointer above
// with the symbol name handed to FunctionLoader.bind().
static FunctionLoader.Bind[] targets =
[
    // lifetime and state
    {cast(void**) &ucnv_open,                   "ucnv_open"},
    {cast(void**) &ucnv_close,                  "ucnv_close"},
    {cast(void**) &ucnv_reset,                  "ucnv_reset"},
    {cast(void**) &ucnv_resetToUnicode,         "ucnv_resetToUnicode"},
    {cast(void**) &ucnv_resetFromUnicode,       "ucnv_resetFromUnicode"},

    // names and properties
    {cast(void**) &ucnv_compareNames,           "ucnv_compareNames"},
    {cast(void**) &ucnv_getMaxCharSize,         "ucnv_getMaxCharSize"},
    {cast(void**) &ucnv_getMinCharSize,         "ucnv_getMinCharSize"},
    {cast(void**) &ucnv_getName,                "ucnv_getName"},
    {cast(void**) &ucnv_detectUnicodeSignature, "ucnv_detectUnicodeSignature"},

    // conversion
    {cast(void**) &ucnv_toUChars,               "ucnv_toUChars"},
    {cast(void**) &ucnv_fromUChars,             "ucnv_fromUChars"},
    {cast(void**) &ucnv_toUnicode,              "ucnv_toUnicode"},
    {cast(void**) &ucnv_fromUnicode,            "ucnv_fromUnicode"},
    {cast(void**) &ucnv_convertEx,              "ucnv_convertEx"},

    // queries and enumeration
    {cast(void**) &ucnv_isAmbiguous,            "ucnv_isAmbiguous"},
    {cast(void**) &ucnv_countAvailable,         "ucnv_countAvailable"},
    {cast(void**) &ucnv_getAvailableName,       "ucnv_getAvailableName"},
];
742
743 /***********************************************************************
744
745 ***********************************************************************/
746
static this ()
{
    // Bind every entry of 'targets' against the icuuc library and keep
    // the returned value so the static destructor can unbind again.
    // Debugging aid: once bound, foreach (char[] name; UConverter)
    // walks the available converter names via the static opApply.
    library = FunctionLoader.bind (icuuc, targets);
}
755
756 /***********************************************************************
757
758 ***********************************************************************/
759
static ~this ()
{
    // give back what FunctionLoader.bind() returned at start-up
    FunctionLoader.unbind (library);
}
764 }