comparison dwtx/dwtxhelper/mangoicu/UCollator.d @ 89:040da1cb0d76

Add a local copy of the mango ICU binding to work out the utf8 usability. Will hopefully go back into mango.
author Frank Benoit <benoit@tionex.de>
date Sun, 22 Jun 2008 22:57:31 +0200
parents
children 11e8159caf7a
comparison
equal deleted inserted replaced
88:cd18fa3b71f1 89:040da1cb0d76
1 /*******************************************************************************
2
3 @file UCollator.d
4
5 Copyright (c) 2004 Kris Bell
6
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the authors be held liable for damages
9 of any kind arising from the use of this software.
10
11 Permission is hereby granted to anyone to use this software for any
12 purpose, including commercial applications, and to alter it and/or
13 redistribute it freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must
16 not claim that you wrote the original software. If you use this
17 software in a product, an acknowledgment within documentation of
18 said product would be appreciated but is not required.
19
20 2. Altered source versions must be plainly marked as such, and must
21 not be misrepresented as being the original software.
22
23 3. This notice may not be removed or altered from any distribution
24 of the source.
25
26 4. Derivative works are permitted, but they must carry this notice
27 in full and credit the original source.
28
29
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31
32
33 @version Initial version, November 2004
34 @author Kris
35
36 Note that this package and documentation is built around the ICU
37 project (http://oss.software.ibm.com/icu/). Below is the license
38 statement as specified by that software:
39
40
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42
43
44 ICU License - ICU 1.8.1 and later
45
46 COPYRIGHT AND PERMISSION NOTICE
47
48 Copyright (c) 1995-2003 International Business Machines Corporation and
49 others.
50
51 All rights reserved.
52
53 Permission is hereby granted, free of charge, to any person obtaining a
54 copy of this software and associated documentation files (the
55 "Software"), to deal in the Software without restriction, including
56 without limitation the rights to use, copy, modify, merge, publish,
57 distribute, and/or sell copies of the Software, and to permit persons
58 to whom the Software is furnished to do so, provided that the above
59 copyright notice(s) and this permission notice appear in all copies of
60 the Software and that both the above copyright notice(s) and this
61 permission notice appear in supporting documentation.
62
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
72
73 Except as contained in this notice, the name of a copyright holder
74 shall not be used in advertising or otherwise to promote the sale, use
75 or other dealings in this Software without prior written authorization
76 of the copyright holder.
77
78 ----------------------------------------------------------------------
79
80 All trademarks and registered trademarks mentioned herein are the
81 property of their respective owners.
82
83 *******************************************************************************/
84
85 module dwtx.dwthelper.mangoicu.UCollator;
86
87 private import dwtx.dwthelper.mangoicu.ICU,
88 dwtx.dwthelper.mangoicu.USet,
89 dwtx.dwthelper.mangoicu.ULocale,
90 dwtx.dwthelper.mangoicu.UString;
91
92 /*******************************************************************************
93
94 The API for Collator performs locale-sensitive string comparison.
95 You use this service to build searching and sorting routines for
96 natural language text. Important: The ICU collation service has been
97 reimplemented in order to achieve better performance and UCA compliance.
98 For details, see the collation design document.
99
100 For more information about the collation service see the users guide.
101
102 Collation service provides correct sorting orders for most locales
103 supported in ICU. If specific data for a locale is not available,
104 the order eventually falls back to the UCA sort order.
105
106 Sort ordering may be customized by providing your own set of rules.
107 For more on this subject see the Collation customization section of
108 the users guide.
109
110 See <A HREF="http://oss.software.ibm.com/icu/apiref/ucol_8h.html">
111 this page</A> for full details.
112
113 *******************************************************************************/
114
115 class UCollator : ICU
116 {
// ICU collator handle: set by the constructors and passed as the first
// argument to the ucol_* calls made by the methods of this class.
117 package Handle handle;
118
// Stand-in for ICU's parse-error structure. Within this class it only
// appears as a pointer parameter type, and every call site passes null.
119 typedef void* UParseError;
120
enum Attribute
{
    // Numeric values are written out explicitly; they are the same values
    // the implicit enum numbering produced.
    FrenchCollation        = 0,
    AlternateHandling      = 1,
    CaseFirst              = 2,
    CaseLevel              = 3,
    NormalizationMode      = 4,
    DecompositionMode      = NormalizationMode,    // second name for the same attribute
    strength               = 5,
    HiraganaQuaternaryMode = 6,
    NumericCollation       = 7,
    AttributeCount         = 8
}
134
enum AttributeValue
{
    // Every member carries its explicit value. Members that previously
    // relied on implicit "previous + 1" numbering (CeStrengthLimit,
    // strengthLimit, AttributeValueCount) keep exactly that value.
    Default             = -1,

    // strength levels
    Primary             = 0,
    Secondary           = 1,
    Tertiary            = 2,
    DefaultStrength     = Tertiary,
    CeStrengthLimit     = 3,
    Quaternary          = 3,
    Identical           = 15,
    strengthLimit       = 16,

    // switches
    Off                 = 16,
    On                  = 17,

    // alternate handling
    Shifted             = 20,
    NonIgnorable        = 21,

    // case ordering
    LowerFirst          = 24,
    UpperFirst          = 25,

    AttributeValueCount = 26
}
154
enum RuleOption
{
    // Selects what getRules() asks for; values written out explicitly.
    TailoringOnly = 0,
    FullRules     = 1
}
160
enum BoundMode
{
    // Kind of bound requested from getBound().
    BoundLower      = 0,
    BoundUpper      = 1,
    BoundUpperLong  = 2,
    BoundValueCount = 3
}
168
// Distinct type name used wherever a collation strength is passed or
// returned (getStrength, setStrength, the rules-based constructor).
169 typedef AttributeValue Strength;
170
171 /***********************************************************************
172
173 Open a UCollator for comparing strings. The locale specified
174 determines the required collation rules. Special values for
175 locales can be passed in - if ULocale.Default is passed for
176 the locale, the default locale collation rules will be used.
177 If ULocale.Root is passed, UCA rules will be used
178
179 ***********************************************************************/
180
this (ULocale locale)
{
    // The status value is filled in by ucol_open and inspected by
    // testError right after the call.
    Error status;

    this.handle = ucol_open (toString (locale.name), status);
    testError (status, "failed to open collator");
}
188
189 /***********************************************************************
190
191 Produce a UCollator instance according to the rules supplied.
192
193 The rules are used to change the default ordering, defined in
194 the UCA in a process called tailoring. For the syntax of the
195 rules please see users guide
196
197 ***********************************************************************/
198
this (UText rules, AttributeValue mode, Strength strength)
{
    Error status;

    // No parse-error structure is supplied (null); a failure is only
    // visible through the status value checked below.
    this.handle = ucol_openRules (rules.get.ptr, rules.len,
                                  mode, strength,
                                  null, status);
    testError (status, "failed to open rules-based collator");
}
206
207 /***********************************************************************
208
209 Open a collator defined by a short form string. The
210 structure and the syntax of the string is defined in
211 the "Naming collators" section of the users guide:
212 http://oss.software.ibm.com/icu/userguide/Collate_Concepts.html#Naming_Collators
213 Attributes are overridden by the subsequent attributes.
214 So, for "S2_S3", final strength will be 3. 3066bis
215 locale overrides individual locale parts.
216
217 The call to this constructor is equivalent to a plain
218 constructor, followed by a series of calls to setAttribute
219 and setVariableTop
220
221 ***********************************************************************/
222
this (char[] shortName, bool forceDefaults)
{
    Error status;

    // As with the rules-based constructor, the parse-error argument is null.
    this.handle = ucol_openFromShortString (toString (shortName),
                                            forceDefaults,
                                            null, status);
    testError (status, "failed to open short-name collator");
}
230
231 /***********************************************************************
232
233 Internal constructor invoked via USearch
234
235 ***********************************************************************/
236
// Adopts an existing handle instead of opening a new collator.
// NOTE(review): ~this() passes whatever handle is stored here to
// ucol_close, so the supplier of the handle presumably must not close it
// as well -- confirm against the USearch call site.
237 package this (Handle handle)
238 {
239 this.handle = handle;
240 }
241
242 /***********************************************************************
243
244 Close a UCollator
245
246 ***********************************************************************/
247
~this ()
{
    // Hand the handle back to ICU.
    ucol_close (this.handle);
}
252
253 /***********************************************************************
254
255 Get a set containing the contractions defined by the
256 collator.
257
258 The set includes both the UCA contractions and the
259 contractions defined by the collator. This set will
260 contain only strings. If a tailoring explicitly
261 suppresses contractions from the UCA (like Russian),
262 removed contractions will not be in the resulting set.
263
264 ***********************************************************************/
265
void getContractions (USet set)
{
    Error status;

    // ICU fills the caller's set in place; the count it returns is not used.
    ucol_getContractions (this.handle, set.handle, status);
    testError (status, "failed to get collator contractions");
}
273
274 /***********************************************************************
275
276 Compare two strings. Return value is negative, zero, or positive
277
278 ***********************************************************************/
279
int strcoll (UText source, UText target)
{
    // Both operands are handed to ICU as pointer + explicit length.
    int order = ucol_strcoll (this.handle,
                              source.get.ptr, source.len,
                              target.get.ptr, target.len);
    return order;
}
284
285 /***********************************************************************
286
287 Determine if one string is greater than another. This
288 function is equivalent to strcoll() > 0
289
290 ***********************************************************************/
291
bool greater (UText source, UText target)
{
    // ICU answers with a byte; any non-zero value is reported as true.
    byte answer = ucol_greater (this.handle,
                                source.get.ptr, source.len,
                                target.get.ptr, target.len);
    return answer != 0;
}
296
297 /***********************************************************************
298
299 Determine if one string is greater than or equal to
300 another. This function is equivalent to strcoll() >= 0
301
302 ***********************************************************************/
303
bool greaterOrEqual (UText source, UText target)
{
    // ICU answers with a byte; any non-zero value is reported as true.
    byte answer = ucol_greaterOrEqual (this.handle,
                                       source.get.ptr, source.len,
                                       target.get.ptr, target.len);
    return answer != 0;
}
308
309 /***********************************************************************
310
311 This function is equivalent to strcoll() == 0
312
313 ***********************************************************************/
314
bool equal (UText source, UText target)
{
    // ICU answers with a byte; any non-zero value is reported as true.
    byte answer = ucol_equal (this.handle,
                              source.get.ptr, source.len,
                              target.get.ptr, target.len);
    return answer != 0;
}
319
320 /***********************************************************************
321
322 Get the collation strength used in a UCollator. The
323 strength influences how strings are compared.
324
325 ***********************************************************************/
326
Strength getStrength ()
{
    // Straight query of the underlying collator.
    Strength current = ucol_getStrength (this.handle);
    return current;
}
331
332 /***********************************************************************
333
334 Set the collation strength used in this UCollator. The
335 strength influences how strings are compared. It is one of
336 Primary, Secondary, Tertiary, Quaternary, Identical, or
337 Default
338
339 ***********************************************************************/
340
void setStrength (Strength s)
{
    // This ICU call takes no status argument, so nothing is checked here.
    ucol_setStrength (this.handle, s);
}
345
346 /***********************************************************************
347
348 Get the display name for a UCollator. The display name is
349 suitable for presentation to a user
350
351 ***********************************************************************/
352
void getDisplayName (ULocale obj, ULocale display, UString dst)
{
    // Callback handed to dst.format. It receives the destination buffer
    // as (p, len) and must write there.
    //
    // Fix: the callback used to ignore both parameters and pass
    // dst.get.ptr / dst.len to ICU instead, so ICU was told the capacity
    // was the string's current length rather than the buffer size.
    // NOTE(review): assumes UString.format supplies its writable buffer
    // and capacity through (p, len), as the callback signature suggests --
    // confirm against UString.format.
    uint fmt (wchar* p, uint len, inout Error e)
    {
        return ucol_getDisplayName (toString(obj.name), toString(display.name), p, len, e);
    }

    dst.format (&fmt, "failed to get collator display name");
}
362
363 /***********************************************************************
364
365 Returns current rules. Options define whether full rules
366 are returned or just the tailoring.
367
368 ***********************************************************************/
369
void getRules (UString dst, RuleOption o = RuleOption.FullRules)
{
    // Callback handed to dst.format. ucol_getRulesEx has no status
    // argument, so an undersized buffer is detected here by comparing the
    // length ICU reports against the capacity and raising BufferOverflow.
    //
    // Fix: ICU is now given the same (p, len) buffer that the overflow
    // check uses. Previously it was given dst.get.ptr / dst.len while the
    // check compared against len, so the two could disagree.
    // NOTE(review): assumes UString.format supplies its writable buffer
    // and capacity through (p, len) -- confirm against UString.format.
    uint fmt (wchar* p, uint len, inout Error e)
    {
        uint needed = ucol_getRulesEx (handle, o, p, len);
        if (needed > len)
            e = e.BufferOverflow;
        return needed;
    }

    dst.format (&fmt, "failed to get collator rules");
}
382
383 /***********************************************************************
384
385 Get the short definition string for a collator.
386
387 This API harvests the collator's locale and the attribute
388 set and produces a string that can be used for opening a
389 collator with the same properties using the char[] style
390 constructor. This string will be normalized.
391
392 The structure and the syntax of the string is defined in the
393 "Naming collators" section of the users guide:
394 http://oss.software.ibm.com/icu/userguide/Collate_Concepts.html#Naming_Collators
395
396 ***********************************************************************/
397
char[] getShortDefinitionString (ULocale locale = ULocale.Default)
{
    // Fixed-size scratch buffer on the stack; the result is copied out
    // below because the buffer does not outlive this call.
    char[64] scratch;
    Error    status;

    uint used = ucol_getShortDefinitionString (this.handle, toString (locale.name),
                                               scratch.ptr, scratch.length, status);
    testError (status, "failed to get collator short name");

    return scratch[0 .. used].dup;
}
407
408 /***********************************************************************
409
410 Verifies and normalizes short definition string. Normalized
411 short definition string has all the option sorted by the
412 argument name, so that equivalent definition strings are the
413 same
414
415 ***********************************************************************/
416
char[] normalizeShortDefinitionString (char[] source)
{
    // Fixed-size scratch buffer on the stack; the result is copied out
    // below because the buffer does not outlive this call.
    char[64] scratch;
    Error    status;

    // No parse-error structure is supplied (null).
    uint used = ucol_normalizeShortDefinitionString (toString (source),
                                                     scratch.ptr, scratch.length,
                                                     null, status);
    testError (status, "failed to normalize collator short name");

    return scratch[0 .. used].dup;
}
426
427 /***********************************************************************
428
429 Get a sort key for a string from a UCollator. Sort keys
430 may be compared using strcmp.
431
432 ***********************************************************************/
433
ubyte[] getSortKey (UText t, ubyte[] result)
{
    // ICU writes the key into the caller's buffer and reports how many
    // bytes the complete key needs.
    uint needed = ucol_getSortKey (handle, t.get.ptr, t.len, result.ptr, result.length);

    // Fix: a key that fills the buffer exactly (needed == result.length)
    // is a complete key and is now returned. The previous strict "<"
    // comparison discarded it and returned null.
    if (needed <= result.length)
        return result [0 .. needed];

    // The buffer was too small to hold the whole key.
    return null;
}
441
442 /***********************************************************************
443
444 Merge two sort keys. The levels are merged with their
445 corresponding counterparts (primaries with primaries,
446 secondaries with secondaries etc.). Between the values
447 from the same level a separator is inserted. example
448 (uncompressed): 191B1D 01 050505 01 910505 00 and
449 1F2123 01 050505 01 910505 00 will be merged as
450 191B1D 02 1F212301 050505 02 050505 01 910505 02 910505 00
451 This allows for concatenating of first and last names for
452 sorting, among other things. If the destination buffer is
453 not big enough, the results are undefined. If any of source
454 lengths are zero or any of source pointers are null/undefined,
455 result is of size zero.
456
457 ***********************************************************************/
458
ubyte[] mergeSortkeys (ubyte[] left, ubyte[] right, ubyte[] result)
{
    // ICU merges into the caller's buffer and reports the size of the
    // merged key.
    uint needed = ucol_mergeSortkeys (left.ptr, left.length,
                                      right.ptr, right.length,
                                      result.ptr, result.length);

    // Fix: a merged key that fills the buffer exactly
    // (needed == result.length) is now returned. The previous strict "<"
    // comparison discarded it and returned null.
    if (needed <= result.length)
        return result [0 .. needed];

    // The buffer was too small to hold the merged key.
    return null;
}
466
467 /***********************************************************************
468
469 Produce a bound for a given sortkey and a number of levels.
470
471 ICU always reports the number of bytes needed, regardless of whether the
472 result buffer was big enough or even valid. This method returns the filled part of result, or null if the bound did not fit.
473
474 Resulting bounds can be used to produce a range of strings
475 that are between upper and lower bounds. For example, if
476 bounds are produced for a sortkey of string "smith", strings
477 between upper and lower bounds with one level would include
478 "Smith", "SMITH", "sMiTh".
479
480 There are two upper bounds that can be produced. If BoundUpper
481 is produced, strings matched would be as above. However, if
482 bound produced using BoundUpperLong is used, the above example
483 will also match "Smithsonian" and similar.
484
485 ***********************************************************************/
486
ubyte[] getBound (BoundMode mode, ubyte[] source, ubyte[] result, uint levels = 1)
{
    Error status;

    uint needed = ucol_getBound (source.ptr, source.length,
                                 mode, levels,
                                 result.ptr, result.length,
                                 status);
    testError (status, "failed to get sortkey bound");

    // Only a reported size strictly below the buffer length counts as a fit.
    if (needed >= result.length)
        return null;

    return result [0 .. needed];
}
497
498 /***********************************************************************
499
500 Gets the version information for a Collator.
501
502 Version is currently an opaque 32-bit number which depends,
503 among other things, on major versions of the collator
504 tailoring and UCA
505
506 ***********************************************************************/
507
void getVersion (inout Version v)
{
    // The caller's Version is passed straight through to ICU.
    ucol_getVersion (this.handle, v);
}
512
513 /***********************************************************************
514
515 Gets the UCA version information for this Collator
516
517 ***********************************************************************/
518
void getUCAVersion (inout Version v)
{
    // The caller's Version is passed straight through to ICU.
    ucol_getUCAVersion (this.handle, v);
}
523
524 /***********************************************************************
525
526 Universal attribute setter
527
528 ***********************************************************************/
529
void setAttribute (Attribute attr, AttributeValue value)
{
    Error status;

    ucol_setAttribute (this.handle, attr, value, status);
    // testError receives the status ICU filled in together with the message.
    testError (status, "failed to set collator attribute");
}
537
538 /***********************************************************************
539
540 Universal attribute getter
541
542 ***********************************************************************/
543
AttributeValue getAttribute (Attribute attr)
{
    Error status;

    AttributeValue current = ucol_getAttribute (this.handle, attr, status);
    // The status is checked before the value is handed back.
    testError (status, "failed to get collator attribute");

    return current;
}
552
553 /***********************************************************************
554
555 Variable top is a two byte primary value which causes all
556 the codepoints with primary values that are less or equal
557 than the variable top to be shifted when alternate handling
558 is set to Shifted.
559
560 ***********************************************************************/
561
void setVariableTop (UText t)
{
    Error status;

    // The value ICU returns from this call is not used by the wrapper.
    ucol_setVariableTop (this.handle, t.get.ptr, t.len, status);
    testError (status, "failed to set variable-top");
}
569
570 /***********************************************************************
571
572 Sets the variable top to a collation element value
573 supplied. Variable top is set to the upper 16 bits.
574 Lower 16 bits are ignored.
575
576 ***********************************************************************/
577
void setVariableTop (uint x)
{
    Error status;

    // The numeric overload goes through ucol_restoreVariableTop rather
    // than ucol_setVariableTop.
    ucol_restoreVariableTop (this.handle, x, status);
    testError (status, "failed to restore variable-top");
}
585
586 /***********************************************************************
587
588 Gets the variable top value of this Collator. Lower 16 bits
589 are undefined and should be ignored.
590
591 ***********************************************************************/
592
uint getVariableTop ()
{
    Error status;

    uint top = ucol_getVariableTop (this.handle, status);
    // The status is checked before the value is handed back.
    testError (status, "failed to get variable-top");

    return top;
}
601
602 /***********************************************************************
603
604 Gets the locale name of the collator. If the collator is
605 instantiated from the rules, then this function will throw
606 an exception
607
608 ***********************************************************************/
609
void getLocale (ULocale locale, ULocale.Type type)
{
    Error e;

    // Fix: validate the raw ICU result before touching the caller's
    // object. Previously locale.name was overwritten (possibly with a
    // null name) before the error check ran, and toArray was called on a
    // pointer that might be null.
    char* raw = ucol_getLocaleByType (handle, type, e);
    if (isError(e) || raw is null)
        exception ("failed to get collator locale");

    // Keep the original "converted name is null" check as well.
    auto name = toArray (raw);
    if (name is null)
        exception ("failed to get collator locale");

    locale.name = name;
}
618
619 /***********************************************************************
620
621 Get the Unicode set that contains all the characters and
622 sequences tailored in this collator.
623
624 ***********************************************************************/
625
USet getTailoredSet ()
{
    Error status;

    Handle setHandle = ucol_getTailoredSet (this.handle, status);
    testError (status, "failed to get tailored set");

    // Wrap the handle ICU returned in a new USet object.
    return new USet (setHandle);
}
634
635
636 /***********************************************************************
637
638 Bind the ICU functions from a shared library. This is
639 complicated by the issues regarding D and DLLs on the
640 Windows platform
641
642 ***********************************************************************/
643
// Value returned by FunctionLoader.bind in the static constructor; it is
// passed back to FunctionLoader.unbind in the static destructor.
644 private static void* library;
645
646 /***********************************************************************
647
648 ***********************************************************************/
649
private static extern (C)
{
    // Function pointers filled in by FunctionLoader.bind through the
    // 'targets' table. Grouped by purpose; declaration order has no effect.

    // --- open / close ---
    Handle function (char* loc, inout Error e) ucol_open;
    Handle function (wchar* rules, uint rulesLength,
                     AttributeValue normalizationMode, Strength strength,
                     UParseError* parseError, inout Error e) ucol_openRules;
    Handle function (char* definition, byte forceDefaults,
                     UParseError* parseError, inout Error e) ucol_openFromShortString;
    void   function (Handle) ucol_close;

    // --- comparison ---
    int  function (Handle, wchar* source, uint sourceLength,
                   wchar* target, uint targetLength) ucol_strcoll;
    byte function (Handle, wchar* source, uint sourceLength,
                   wchar* target, uint targetLength) ucol_greater;
    byte function (Handle, wchar* source, uint sourceLength,
                   wchar* target, uint targetLength) ucol_greaterOrEqual;
    byte function (Handle, wchar* source, uint sourceLength,
                   wchar* target, uint targetLength) ucol_equal;

    // --- strength, attributes, variable top ---
    Strength       function (Handle) ucol_getStrength;
    void           function (Handle, Strength strength) ucol_setStrength;
    void           function (Handle, Attribute attr, AttributeValue value, inout Error e) ucol_setAttribute;
    AttributeValue function (Handle, Attribute attr, inout Error e) ucol_getAttribute;
    uint           function (Handle, wchar* varTop, uint len, inout Error e) ucol_setVariableTop;
    uint           function (Handle, inout Error e) ucol_getVariableTop;
    void           function (Handle, uint varTop, inout Error e) ucol_restoreVariableTop;

    // --- names, rules, locale ---
    uint  function (char* objLoc, char* dispLoc,
                    wchar* result, uint resultLength, inout Error e) ucol_getDisplayName;
    uint  function (Handle, char* locale,
                    char* buffer, uint capacity, inout Error e) ucol_getShortDefinitionString;
    uint  function (char* source, char* destination, uint capacity,
                    UParseError* parseError, inout Error e) ucol_normalizeShortDefinitionString;
    uint  function (Handle, RuleOption delta, wchar* buffer, uint bufferLen) ucol_getRulesEx;
    char* function (Handle, ULocale.Type type, inout Error e) ucol_getLocaleByType;

    // --- sort keys ---
    uint function (Handle, wchar* source, uint sourceLength,
                   ubyte* result, uint resultLength) ucol_getSortKey;
    uint function (ubyte* src1, uint src1Length,
                   ubyte* src2, uint src2Length,
                   ubyte* dest, uint destCapacity) ucol_mergeSortkeys;
    uint function (ubyte* source, uint sourceLength,
                   BoundMode boundType, uint noOfLevels,
                   ubyte* result, uint resultLength, inout Error e) ucol_getBound;

    // --- sets ---
    uint   function (Handle, Handle conts, inout Error e) ucol_getContractions;
    Handle function (Handle, inout Error e) ucol_getTailoredSet;

    // --- versions ---
    void function (Handle, Version info) ucol_getVersion;
    void function (Handle, Version info) ucol_getUCAVersion;
}
680
681 /***********************************************************************
682
683 ***********************************************************************/
684
// One entry per ICU entry point: the address of the function pointer to
// fill in, paired with the symbol name to look up. Handed to
// FunctionLoader.bind by the static constructor.
static FunctionLoader.Bind[] targets =
[
    {cast(void**) &ucol_open,                           "ucol_open"},
    {cast(void**) &ucol_close,                          "ucol_close"},
    {cast(void**) &ucol_openRules,                      "ucol_openRules"},
    {cast(void**) &ucol_openFromShortString,            "ucol_openFromShortString"},
    {cast(void**) &ucol_getContractions,                "ucol_getContractions"},
    {cast(void**) &ucol_strcoll,                        "ucol_strcoll"},
    {cast(void**) &ucol_greater,                        "ucol_greater"},
    {cast(void**) &ucol_greaterOrEqual,                 "ucol_greaterOrEqual"},
    {cast(void**) &ucol_equal,                          "ucol_equal"},
    {cast(void**) &ucol_getStrength,                    "ucol_getStrength"},
    {cast(void**) &ucol_setStrength,                    "ucol_setStrength"},
    {cast(void**) &ucol_getDisplayName,                 "ucol_getDisplayName"},
    {cast(void**) &ucol_getShortDefinitionString,       "ucol_getShortDefinitionString"},
    {cast(void**) &ucol_normalizeShortDefinitionString, "ucol_normalizeShortDefinitionString"},
    {cast(void**) &ucol_getSortKey,                     "ucol_getSortKey"},
    {cast(void**) &ucol_getBound,                       "ucol_getBound"},
    {cast(void**) &ucol_getVersion,                     "ucol_getVersion"},
    {cast(void**) &ucol_getUCAVersion,                  "ucol_getUCAVersion"},
    {cast(void**) &ucol_mergeSortkeys,                  "ucol_mergeSortkeys"},
    {cast(void**) &ucol_setAttribute,                   "ucol_setAttribute"},
    {cast(void**) &ucol_getAttribute,                   "ucol_getAttribute"},
    {cast(void**) &ucol_setVariableTop,                 "ucol_setVariableTop"},
    {cast(void**) &ucol_getVariableTop,                 "ucol_getVariableTop"},
    {cast(void**) &ucol_restoreVariableTop,             "ucol_restoreVariableTop"},
    {cast(void**) &ucol_getRulesEx,                     "ucol_getRulesEx"},
    {cast(void**) &ucol_getLocaleByType,                "ucol_getLocaleByType"},
    {cast(void**) &ucol_getTailoredSet,                 "ucol_getTailoredSet"},
];
715
716 /***********************************************************************
717
718 ***********************************************************************/
719
static this ()
{
    // Resolve every entry of 'targets' and remember what bind returns so
    // the static destructor can pass it to unbind.
    auto bound = FunctionLoader.bind (icuin, targets);
    library = bound;
}
724
725 /***********************************************************************
726
727 ***********************************************************************/
728
// Module teardown: hands the value stored by the static constructor back
// to FunctionLoader.unbind.
729 static ~this ()
730 {
731 FunctionLoader.unbind (library);
732 }
733 }
734