92
|
1 /*******************************************************************************
|
|
2
|
|
3 @file UNormalize.d
|
|
4
|
|
5 Copyright (c) 2004 Kris Bell
|
|
6
|
|
7 This software is provided 'as-is', without any express or implied
|
|
8 warranty. In no event will the authors be held liable for damages
|
|
9 of any kind arising from the use of this software.
|
|
10
|
|
11 Permission is hereby granted to anyone to use this software for any
|
|
12 purpose, including commercial applications, and to alter it and/or
|
|
13 redistribute it freely, subject to the following restrictions:
|
|
14
|
|
15 1. The origin of this software must not be misrepresented; you must
|
|
16 not claim that you wrote the original software. If you use this
|
|
17 software in a product, an acknowledgment within documentation of
|
|
18 said product would be appreciated but is not required.
|
|
19
|
|
20 2. Altered source versions must be plainly marked as such, and must
|
|
21 not be misrepresented as being the original software.
|
|
22
|
|
23 3. This notice may not be removed or altered from any distribution
|
|
24 of the source.
|
|
25
|
|
26 4. Derivative works are permitted, but they must carry this notice
|
|
27 in full and credit the original source.
|
|
28
|
|
29
|
|
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
31
|
|
32
|
|
33 @version Initial version, October 2004
|
|
34 @author Kris
|
|
35
|
|
36 Note that this package and documentation is built around the ICU
|
|
37 project (http://oss.software.ibm.com/icu/). Below is the license
|
|
38 statement as specified by that software:
|
|
39
|
|
40
|
|
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
42
|
|
43
|
|
44 ICU License - ICU 1.8.1 and later
|
|
45
|
|
46 COPYRIGHT AND PERMISSION NOTICE
|
|
47
|
|
48 Copyright (c) 1995-2003 International Business Machines Corporation and
|
|
49 others.
|
|
50
|
|
51 All rights reserved.
|
|
52
|
|
53 Permission is hereby granted, free of charge, to any person obtaining a
|
|
54 copy of this software and associated documentation files (the
|
|
55 "Software"), to deal in the Software without restriction, including
|
|
56 without limitation the rights to use, copy, modify, merge, publish,
|
|
57 distribute, and/or sell copies of the Software, and to permit persons
|
|
58 to whom the Software is furnished to do so, provided that the above
|
|
59 copyright notice(s) and this permission notice appear in all copies of
|
|
60 the Software and that both the above copyright notice(s) and this
|
|
61 permission notice appear in supporting documentation.
|
|
62
|
|
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
|
|
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
|
|
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
|
|
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
|
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
|
|
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
72
|
|
73 Except as contained in this notice, the name of a copyright holder
|
|
74 shall not be used in advertising or otherwise to promote the sale, use
|
|
75 or other dealings in this Software without prior written authorization
|
|
76 of the copyright holder.
|
|
77
|
|
78 ----------------------------------------------------------------------
|
|
79
|
|
80 All trademarks and registered trademarks mentioned herein are the
|
|
81 property of their respective owners.
|
|
82
|
|
83 *******************************************************************************/
|
|
84
|
|
85 module dwtx.dwtxhelper.mangoicu.UNormalize;
|
|
86
|
|
87 private import dwtx.dwtxhelper.mangoicu.ICU,
|
|
88 dwtx.dwtxhelper.mangoicu.UString,
|
|
89 dwtx.dwtxhelper.mangoicu.ULocale;
|
|
90
|
|
91 /*******************************************************************************
|
|
92
|
|
93 UNormalize transforms Unicode text into an equivalent composed or
|
|
94 decomposed form, allowing for easier sorting and searching
|
|
95 of text. UNormalize supports the standard normalization forms
|
|
96 described in http://www.unicode.org/unicode/reports/tr15/
|
|
97
|
|
98 Characters with accents or other adornments can be encoded
|
|
99 in several different ways in Unicode. For example, take the
|
|
100 character A-acute. In Unicode, this can be encoded as a single
|
|
101 character (the "composed" form):
|
|
102
|
|
103 00C1 LATIN CAPITAL LETTER A WITH ACUTE
|
|
104
|
|
105 or as two separate characters (the "decomposed" form):
|
|
106
|
|
107 0041 LATIN CAPITAL LETTER A 0301 COMBINING ACUTE ACCENT
|
|
108
|
|
109 To a user of your program, however, both of these sequences
|
|
110 should be treated as the same "user-level" character "A with
|
|
111 acute accent". When you are searching or comparing text, you
|
|
112 must ensure that these two sequences are treated equivalently.
|
|
113 In addition, you must handle characters with more than one
|
|
114 accent. Sometimes the order of a character's combining accents
|
|
115 is significant, while in other cases accent sequences in different
|
|
116 orders are really equivalent.
|
|
117
|
|
118 Similarly, the string "ffi" can be encoded as three separate
|
|
119 letters:
|
|
120
|
|
121 0066 LATIN SMALL LETTER F 0066 LATIN SMALL LETTER F
|
|
122 0069 LATIN SMALL LETTER I
|
|
123
|
|
124 or as the single character
|
|
125
|
|
126 FB03 LATIN SMALL LIGATURE FFI
|
|
127
|
|
128 The ffi ligature is not a distinct semantic character, and strictly
|
|
129 speaking it shouldn't be in Unicode at all, but it was included for
|
|
130 compatibility with existing character sets that already provided it.
|
|
131 The Unicode standard identifies such characters by giving them
|
|
132 "compatibility" decompositions into the corresponding semantic
|
|
133 characters. When sorting and searching, you will often want to use
|
|
134 these mappings.
|
|
135
|
|
136 unorm_normalize helps solve these problems by transforming text into
|
|
137 the canonical composed and decomposed forms as shown in the first
|
|
138 example above. In addition, you can have it perform compatibility
|
|
139 decompositions so that you can treat compatibility characters the
|
|
140 same as their equivalents. Finally, UNormalize rearranges
|
|
141 accents into the proper canonical order, so that you do not have
|
|
142 to worry about accent rearrangement on your own.
|
|
143
|
|
144 Form FCD, "Fast C or D", is also designed for collation. It allows one
|
|
145 to work on strings that are not necessarily normalized with an
|
|
146 algorithm (like in collation) that works under "canonical closure",
|
|
147 i.e., it treats precomposed characters and their decomposed
|
|
148 equivalents the same.
|
|
149
|
|
150 It is not a normalization form because it does not provide for
|
|
151 uniqueness of representation. Multiple strings may be canonically
|
|
152 equivalent (their NFDs are identical) and may all conform to FCD
|
|
153 without being identical themselves.
|
|
154
|
|
155 The form is defined such that the "raw decomposition", the
|
|
156 recursive canonical decomposition of each character, results
|
|
157 in a string that is canonically ordered. This means that
|
|
158 precomposed characters are allowed for as long as their
|
|
159 decompositions do not need canonical reordering.
|
|
160
|
|
161 Its advantage for a process like collation is that all NFD
|
|
162 and most NFC texts - and many unnormalized texts - already
|
|
163 conform to FCD and do not need to be normalized (NFD) for
|
|
164 such a process. The FCD quick check will return UNORM_YES
|
|
165 for most strings in practice.
|
|
166
|
|
167 For more details on FCD see the collation design document:
|
|
168 http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
|
|
169
|
|
170 ICU collation performs either NFD or FCD normalization
|
|
171 automatically if normalization is turned on for the collator
|
|
172 object. Beyond collation and string search, normalized strings
|
|
173 may be useful for string equivalence comparisons, transliteration/
|
|
174 transcription, unique representations, etc.
|
|
175
|
|
176 The W3C generally recommends exchanging texts in NFC. Note also
|
|
177 that most legacy character encodings use only precomposed forms
|
|
178 and often do not encode any combining marks by themselves. For
|
|
179 conversion to such character encodings the Unicode text needs to
|
|
180 be normalized to NFC. For more usage examples, see the Unicode
|
|
181 Standard Annex.
|
|
182
|
|
183 See <A HREF="http://oss.software.ibm.com/icu/apiref/unorm_8h.html">
|
|
184 this page</A> for full details.
|
|
185
|
|
186
|
|
187 *******************************************************************************/
|
|
188
|
|
class UNormalize : ICU
{
        /***********************************************************************

                Normalization mode selector. The numeric value of each
                member is handed unchanged to the bound unorm_* functions
                as their "mode" argument. Default is an alias for NFC;
                Count is the value following FCD.

        ***********************************************************************/

        enum    Mode
                {
                None    = 1,
                NFD     = 2,
                NFKD    = 3,
                NFC     = 4,
                Default = NFC,
                NFKC    = 5,
                FCD     = 6,
                Count
                }

        /***********************************************************************

                Result of a quick check; see check() for the meaning
                of each value.

        ***********************************************************************/

        enum    Check
                {
                No,
                Yes,
                Maybe
                }

        /***********************************************************************

                Option bits handed unchanged to the bound unorm_*
                functions as their "options" argument.

        ***********************************************************************/

        enum    Options
                {
                None      = 0x00,
                Unicode32 = 0x20
                }

        /***********************************************************************

                Normalize a string. The string will be normalized according
                to the specified normalization mode and options.

                src is the text to normalize, dst receives the result.
                The actual work is done by unorm_normalize, which is
                driven through dst.format(); the message is passed to
                format() for use when the operation fails.

        ***********************************************************************/

        static void normalize (UStringView src, UString dst, Mode mode, Options o = Options.None)
        {
                // callback handed to dst.format(); "p" and "len" describe the
                // output buffer supplied by the caller of this callback. The
                // parameter is deliberately not called "dst" so that it does
                // not shadow the UString argument of the enclosing function.
                uint fmt (wchar* p, uint len, inout UErrorCode e)
                {
                        return unorm_normalize (src.get.ptr, src.len, mode, o, p, len, e);
                }

                dst.format (&fmt, "failed to normalize");
        }

        /***********************************************************************

                Performing quick check on a string, to quickly determine
                if the string is in a particular normalization format.

                Three types of result can be returned: Yes, No or Maybe.
                Result Yes indicates that the argument string is in the
                desired normalized format, No determines that argument
                string is not in the desired normalized format. A Maybe
                result indicates that a more thorough check is required,
                the user may have to put the string in its normalized
                form and compare the results.

                The error code produced by the ICU call is passed to
                testError() before the result is returned.

        ***********************************************************************/

        static Check check (UStringView t, Mode mode, Options o = Options.None)
        {
                UErrorCode e;

                Check c = cast(Check) unorm_quickCheckWithOptions (t.get.ptr, t.len, mode, o, e);
                testError (e, "failed to perform normalization check");
                return c;
        }

        /***********************************************************************

                Test if a string is in a given normalization form.

                Unlike check(), this function returns a definitive result,
                never a "maybe". For NFD, NFKD, and FCD, both functions
                work exactly the same. For NFC and NFKC where quickCheck
                may return "maybe", this function will perform further
                tests to arrive at a TRUE/FALSE result.

                Returns true when the ICU call yields a non-zero value.

        ***********************************************************************/

        static bool isNormalized (UStringView t, Mode mode, Options o = Options.None)
        {
                UErrorCode e;

                byte b = unorm_isNormalizedWithOptions (t.get.ptr, t.len, mode, o, e);
                testError (e, "failed to perform normalization test");
                return b != 0;
        }

        /***********************************************************************

                Concatenate normalized strings, making sure that the result
                is normalized as well. If both the left and the right strings
                are in the normalization form according to "mode/options",
                then the result will be

                        dest=normalize(left+right, mode, options)

                With the input strings already being normalized, this function
                will use unorm_next() and unorm_previous() to find the adjacent
                end pieces of the input strings. Only the concatenation of these
                end pieces will be normalized and then concatenated with the
                remaining parts of the input strings.

                It is allowed to have dst==left to avoid copying the entire
                left string.

        ***********************************************************************/

        static void concatenate (UStringView left, UStringView right, UString dst, Mode mode, Options o = Options.None)
        {
                // callback handed to dst.format(); "p" and "len" describe the
                // output buffer supplied by the caller of this callback
                uint fmt (wchar* p, uint len, inout UErrorCode e)
                {
                        return unorm_concatenate (left.get.ptr, left.len, right.get.ptr, right.len, p, len, mode, o, e);
                }

                dst.format (&fmt, "failed to concatenate");
        }

        /***********************************************************************

                Compare two strings for canonical equivalence. Further
                options include case-insensitive comparison and code
                point order (as opposed to code unit order).

                Canonical equivalence between two strings is defined as
                their normalized forms (NFD or NFC) being identical.
                This function compares strings incrementally instead of
                normalizing (and optionally case-folding) both strings
                entirely, improving performance significantly.

                Bulk normalization is only necessary if the strings do
                not fulfill the FCD conditions. Only in this case, and
                only if the strings are relatively long, is memory
                allocated temporarily. For FCD strings and short non-FCD
                strings there is no memory allocation.

                Returns the signed value produced by unorm_compare.

                NOTE(review): the Options enum in this class defines only
                None and Unicode32; the case-insensitive and code-point
                order flags mentioned above have no named members here.
                Confirm against the ICU unorm_compare documentation which
                bit values callers are expected to pass.

        ***********************************************************************/

        static int compare (UStringView left, UStringView right, Options o = Options.None)
        {
                UErrorCode e;

                int i = unorm_compare (left.get.ptr, left.len, right.get.ptr, right.len, o, e);
                testError (e, "failed to compare");
                return i;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                Function pointers for the ICU entry points used by this
                class. They are filled in by the static constructor via
                the "targets" table below.

        ***********************************************************************/

        private static extern (C)
        {
                uint function (wchar*, uint, uint, uint, wchar*, uint, inout UErrorCode) unorm_normalize;
                uint function (wchar*, uint, uint, uint, inout UErrorCode) unorm_quickCheckWithOptions;
                byte function (wchar*, uint, uint, uint, inout UErrorCode) unorm_isNormalizedWithOptions;
                uint function (wchar*, uint, wchar*, uint, wchar*, uint, uint, uint, inout UErrorCode) unorm_concatenate;
                // signed result: ICU declares unorm_compare as returning int32_t,
                // and compare() above returns it as an int
                int  function (wchar*, uint, wchar*, uint, uint, inout UErrorCode) unorm_compare;
        }

        /***********************************************************************

                Table pairing each function pointer above with the name
                of the symbol it is bound to.

        ***********************************************************************/

        static  FunctionLoader.Bind[] targets =
                [
                {cast(void**) &unorm_normalize,               "unorm_normalize"},
                {cast(void**) &unorm_quickCheckWithOptions,   "unorm_quickCheckWithOptions"},
                {cast(void**) &unorm_isNormalizedWithOptions, "unorm_isNormalizedWithOptions"},
                {cast(void**) &unorm_concatenate,             "unorm_concatenate"},
                {cast(void**) &unorm_compare,                 "unorm_compare"},
                ];

        /***********************************************************************

                Module startup: bind every entry of "targets" from the
                icuuc library and remember the handle returned by
                FunctionLoader.bind.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuuc, targets);
        }

        /***********************************************************************

                Module shutdown: hand the stored library handle back to
                FunctionLoader.unbind.

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}
|