comparison base/src/java/mangoicu/UNormalize.d @ 27:1bf55a6eb092

Renamed java tree to base
author Frank Benoit <benoit@tionex.de>
date Sat, 21 Mar 2009 11:33:57 +0100
parents java/src/java/mangoicu/UNormalize.d@dbfb303e8fb0
children
comparison
equal deleted inserted replaced
26:f589fc20a5f9 27:1bf55a6eb092
1 /*******************************************************************************
2
3 @file UNormalize.d
4
5 Copyright (c) 2004 Kris Bell
6
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the authors be held liable for damages
9 of any kind arising from the use of this software.
10
11 Permission is hereby granted to anyone to use this software for any
12 purpose, including commercial applications, and to alter it and/or
13 redistribute it freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must
16 not claim that you wrote the original software. If you use this
17 software in a product, an acknowledgment within documentation of
18 said product would be appreciated but is not required.
19
20 2. Altered source versions must be plainly marked as such, and must
21 not be misrepresented as being the original software.
22
23 3. This notice may not be removed or altered from any distribution
24 of the source.
25
26 4. Derivative works are permitted, but they must carry this notice
27 in full and credit the original source.
28
29
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31
32
33 @version Initial version, October 2004
34 @author Kris
35
36 Note that this package and documentation is built around the ICU
37 project (http://oss.software.ibm.com/icu/). Below is the license
38 statement as specified by that software:
39
40
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42
43
44 ICU License - ICU 1.8.1 and later
45
46 COPYRIGHT AND PERMISSION NOTICE
47
48 Copyright (c) 1995-2003 International Business Machines Corporation and
49 others.
50
51 All rights reserved.
52
53 Permission is hereby granted, free of charge, to any person obtaining a
54 copy of this software and associated documentation files (the
55 "Software"), to deal in the Software without restriction, including
56 without limitation the rights to use, copy, modify, merge, publish,
57 distribute, and/or sell copies of the Software, and to permit persons
58 to whom the Software is furnished to do so, provided that the above
59 copyright notice(s) and this permission notice appear in all copies of
60 the Software and that both the above copyright notice(s) and this
61 permission notice appear in supporting documentation.
62
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
72
73 Except as contained in this notice, the name of a copyright holder
74 shall not be used in advertising or otherwise to promote the sale, use
75 or other dealings in this Software without prior written authorization
76 of the copyright holder.
77
78 ----------------------------------------------------------------------
79
80 All trademarks and registered trademarks mentioned herein are the
81 property of their respective owners.
82
83 *******************************************************************************/
84
85 module java.mangoicu.UNormalize;
86
87 private import java.mangoicu.ICU,
88 java.mangoicu.UString,
89 java.mangoicu.ULocale;
90
91 /*******************************************************************************
92
93 transforms Unicode text into an equivalent composed or
94 decomposed form, allowing for easier sorting and searching
95 of text. UNormalize supports the standard normalization forms
96 described in http://www.unicode.org/unicode/reports/tr15/
97
98 Characters with accents or other adornments can be encoded
99 in several different ways in Unicode. For example, take the
100 character A-acute. In Unicode, this can be encoded as a single
101 character (the "composed" form):
102
103 00C1 LATIN CAPITAL LETTER A WITH ACUTE
104
105 or as two separate characters (the "decomposed" form):
106
107 0041 LATIN CAPITAL LETTER A 0301 COMBINING ACUTE ACCENT
108
109 To a user of your program, however, both of these sequences
110 should be treated as the same "user-level" character "A with
111 acute accent". When you are searching or comparing text, you
112 must ensure that these two sequences are treated equivalently.
113 In addition, you must handle characters with more than one
114 accent. Sometimes the order of a character's combining accents
115 is significant, while in other cases accent sequences in different
116 orders are really equivalent.
117
118 Similarly, the string "ffi" can be encoded as three separate
119 letters:
120
121 0066 LATIN SMALL LETTER F 0066 LATIN SMALL LETTER F
122 0069 LATIN SMALL LETTER I
123
124 or as the single character
125
126 FB03 LATIN SMALL LIGATURE FFI
127
128 The ffi ligature is not a distinct semantic character, and strictly
129 speaking it shouldn't be in Unicode at all, but it was included for
130 compatibility with existing character sets that already provided it.
131 The Unicode standard identifies such characters by giving them
132 "compatibility" decompositions into the corresponding semantic
133 characters. When sorting and searching, you will often want to use
134 these mappings.
135
136 unorm_normalize helps solve these problems by transforming text into
137 the canonical composed and decomposed forms as shown in the first
138 example above. In addition, you can have it perform compatibility
139 decompositions so that you can treat compatibility characters the
140 same as their equivalents. Finally, UNormalize rearranges
141 accents into the proper canonical order, so that you do not have
142 to worry about accent rearrangement on your own.
143
144 Form FCD, "Fast C or D", is also designed for collation. It allows
145 working on strings that are not necessarily normalized with an
146 algorithm (like in collation) that works under "canonical closure",
147 i.e., it treats precomposed characters and their decomposed
148 equivalents the same.
149
150 It is not a normalization form because it does not provide for
151 uniqueness of representation. Multiple strings may be canonically
152 equivalent (their NFDs are identical) and may all conform to FCD
153 without being identical themselves.
154
155 The form is defined such that the "raw decomposition", the
156 recursive canonical decomposition of each character, results
157 in a string that is canonically ordered. This means that
158 precomposed characters are allowed for as long as their
159 decompositions do not need canonical reordering.
160
161 Its advantage for a process like collation is that all NFD
162 and most NFC texts - and many unnormalized texts - already
163 conform to FCD and do not need to be normalized (NFD) for
164 such a process. The FCD quick check will return UNORM_YES
165 for most strings in practice.
166
167 For more details on FCD see the collation design document:
168 http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
169
170 ICU collation performs either NFD or FCD normalization
171 automatically if normalization is turned on for the collator
172 object. Beyond collation and string search, normalized strings
173 may be useful for string equivalence comparisons, transliteration/
174 transcription, unique representations, etc.
175
176 The W3C generally recommends exchanging text in NFC. Note also
177 that most legacy character encodings use only precomposed forms
178 and often do not encode any combining marks by themselves. For
179 conversion to such character encodings the Unicode text needs to
180 be normalized to NFC. For more usage examples, see the Unicode
181 Standard Annex.
182
183 See <A HREF="http://oss.software.ibm.com/icu/apiref/unorm_8h.html">
184 this page</A> for full details.
185
186
187 *******************************************************************************/
188
class UNormalize : ICU
{
        /***********************************************************************

                Normalization forms accepted by the functions below. The
                numeric value of each member is handed unchanged to the
                bound unorm_* functions, so the values must stay in step
                with the ICU library.

                Default is an alias for NFC; Count is one past the last
                explicitly numbered member.

        ***********************************************************************/

        enum Mode
        {
                None    = 1,
                NFD     = 2,
                NFKD    = 3,
                NFC     = 4,
                Default = NFC,
                NFKC    = 5,
                FCD     = 6,
                Count
        }

        /***********************************************************************

                Outcome of check(). The integer returned by
                unorm_quickCheckWithOptions is cast directly to this
                enum, so the member order (No = 0, Yes = 1, Maybe = 2)
                must not be changed.

        ***********************************************************************/

        enum Check
        {
                No,
                Yes,
                Maybe
        }

        /***********************************************************************

                Option bits forwarded unchanged to the bound unorm_*
                functions.

                NOTE(review): Unicode32 (0x20) presumably corresponds to
                ICU's UNORM_UNICODE_3_2 option -- confirm against the
                ICU headers.

        ***********************************************************************/

        enum Options
        {
                None      = 0x00,
                Unicode32 = 0x20
        }

        /***********************************************************************

                Normalize a string. The string will be normalized according
                to the specified normalization mode and options.

                src     the text to normalize
                dst     receives the normalized text
                mode    the normalization form to produce
                o       option bits passed through to ICU

                The actual work is done by a nested callback handed to
                dst.format(), which supplies the output buffer and the
                error code; buffer management and error reporting are
                therefore those of UString.format.

        ***********************************************************************/

        static void normalize (UStringView src, UString dst, Mode mode, Options o = Options.None)
        {
                // the buffer parameter is named 'p' (as in concatenate) so
                // that it does not shadow the 'dst' argument above
                uint fmt (wchar* p, uint len, inout UErrorCode e)
                {
                        return unorm_normalize (src.get.ptr, src.len, mode, o, p, len, e);
                }

                dst.format (&fmt, "failed to normalize");
        }

        /***********************************************************************

                Performing quick check on a string, to quickly determine
                if the string is in a particular normalization format.

                Three types of result can be returned: Yes, No or Maybe.
                Result Yes indicates that the argument string is in the
                desired normalized format, No determines that argument
                string is not in the desired normalized format. A Maybe
                result indicates that a more thorough check is required,
                the user may have to put the string in its normalized
                form and compare the results.

                The ICU error code is handed to testError() together
                with the message "failed to perform normalization check".

        ***********************************************************************/

        static Check check (UStringView t, Mode mode, Options o = Options.None)
        {
                UErrorCode e;

                Check c = cast(Check) unorm_quickCheckWithOptions (t.get.ptr, t.len, mode, o, e);
                testError (e, "failed to perform normalization check");
                return c;
        }

        /***********************************************************************

                Test if a string is in a given normalization form.

                Unlike check(), this function returns a definitive result,
                never a "maybe". For NFD, NFKD, and FCD, both functions
                work exactly the same. For NFC and NFKC where quickCheck
                may return "maybe", this function will perform further
                tests to arrive at a TRUE/FALSE result.

                Returns true when the byte returned by ICU is non-zero.

        ***********************************************************************/

        static bool isNormalized (UStringView t, Mode mode, Options o = Options.None)
        {
                UErrorCode e;

                byte b = unorm_isNormalizedWithOptions (t.get.ptr, t.len, mode, o, e);
                testError (e, "failed to perform normalization test");
                return b != 0;
        }

        /***********************************************************************

                Concatenate normalized strings, making sure that the result
                is normalized as well. If both the left and the right strings
                are in the normalization form according to "mode/options",
                then the result will be

                        dest=normalize(left+right, mode, options)

                With the input strings already being normalized, this function
                will use unorm_next() and unorm_previous() to find the adjacent
                end pieces of the input strings. Only the concatenation of these
                end pieces will be normalized and then concatenated with the
                remaining parts of the input strings.

                It is allowed to have dst==left to avoid copying the entire
                left string.

        ***********************************************************************/

        static void concatenate (UStringView left, UStringView right, UString dst, Mode mode, Options o = Options.None)
        {
                uint fmt (wchar* p, uint len, inout UErrorCode e)
                {
                        return unorm_concatenate (left.get.ptr, left.len, right.get.ptr, right.len, p, len, mode, o, e);
                }

                dst.format (&fmt, "failed to concatenate");
        }

        /***********************************************************************

                Compare two strings for canonical equivalence. Further
                options include case-insensitive comparison and code
                point order (as opposed to code unit order).

                Canonical equivalence between two strings is defined as
                their normalized forms (NFD or NFC) being identical.
                This function compares strings incrementally instead of
                normalizing (and optionally case-folding) both strings
                entirely, improving performance significantly.

                Bulk normalization is only necessary if the strings do
                not fulfill the FCD conditions. Only in this case, and
                only if the strings are relatively long, is memory
                allocated temporarily. For FCD strings and short non-FCD
                strings there is no memory allocation.

                Returns the signed result of unorm_compare unchanged.

                NOTE(review): the Options enum declares no flags for
                case-insensitive or code-point-order comparison, so the
                "further options" mentioned above cannot be requested
                through this signature as it stands -- confirm against
                the ICU unorm_compare documentation.

        ***********************************************************************/

        static int compare (UStringView left, UStringView right, Options o = Options.None)
        {
                UErrorCode e;

                int i = unorm_compare (left.get.ptr, left.len, right.get.ptr, right.len, o, e);
                testError (e, "failed to compare");
                return i;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                Function pointers for the ICU entry points used above.
                They are filled in by the static constructor through the
                'targets' table.

                unorm_compare is declared as returning int: ICU documents
                a signed int32_t result (negative, zero or positive), so
                a uint return type would misdescribe the sign.

        ***********************************************************************/

        private static extern (C)
        {
                uint function (wchar*, uint, uint, uint, wchar*, uint, inout UErrorCode) unorm_normalize;
                uint function (wchar*, uint, uint, uint, inout UErrorCode) unorm_quickCheckWithOptions;
                byte function (wchar*, uint, uint, uint, inout UErrorCode) unorm_isNormalizedWithOptions;
                uint function (wchar*, uint, wchar*, uint, wchar*, uint, uint, uint, inout UErrorCode) unorm_concatenate;
                int  function (wchar*, uint, wchar*, uint, uint, inout UErrorCode) unorm_compare;
        }

        /***********************************************************************

                Table pairing each function pointer above with the name
                of the symbol it is bound to.

        ***********************************************************************/

        static FunctionLoader.Bind[] targets =
                [
                {cast(void**) &unorm_normalize,               "unorm_normalize"},
                {cast(void**) &unorm_quickCheckWithOptions,   "unorm_quickCheckWithOptions"},
                {cast(void**) &unorm_isNormalizedWithOptions, "unorm_isNormalizedWithOptions"},
                {cast(void**) &unorm_concatenate,             "unorm_concatenate"},
                {cast(void**) &unorm_compare,                 "unorm_compare"},
                ];

        /***********************************************************************

                Bind every entry of 'targets' against the icuuc library
                and remember the handle returned by FunctionLoader.bind.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuuc, targets);
        }

        /***********************************************************************

                Hand the library handle obtained by the static
                constructor back to FunctionLoader.unbind.

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}