Mercurial > projects > dwt-addons
comparison dwtx/dwtxhelper/mangoicu/UNormalize.d @ 89:040da1cb0d76
Add a local copy of the mango ICU binding to work out the utf8 usability. Will hopefully go back into mango.
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sun, 22 Jun 2008 22:57:31 +0200 |
parents | |
children | 11e8159caf7a |
comparison
equal
deleted
inserted
replaced
88:cd18fa3b71f1 | 89:040da1cb0d76 |
---|---|
1 /******************************************************************************* | |
2 | |
3 @file UNormalize.d | |
4 | |
5 Copyright (c) 2004 Kris Bell | |
6 | |
7 This software is provided 'as-is', without any express or implied | |
8 warranty. In no event will the authors be held liable for damages | |
9 of any kind arising from the use of this software. | |
10 | |
11 Permission is hereby granted to anyone to use this software for any | |
12 purpose, including commercial applications, and to alter it and/or | |
13 redistribute it freely, subject to the following restrictions: | |
14 | |
15 1. The origin of this software must not be misrepresented; you must | |
16 not claim that you wrote the original software. If you use this | |
17 software in a product, an acknowledgment within documentation of | |
18 said product would be appreciated but is not required. | |
19 | |
20 2. Altered source versions must be plainly marked as such, and must | |
21 not be misrepresented as being the original software. | |
22 | |
23 3. This notice may not be removed or altered from any distribution | |
24 of the source. | |
25 | |
26 4. Derivative works are permitted, but they must carry this notice | |
27 in full and credit the original source. | |
28 | |
29 | |
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
31 | |
32 | |
33 @version Initial version, October 2004 | |
34 @author Kris | |
35 | |
36 Note that this package and documentation is built around the ICU | |
37 project (http://oss.software.ibm.com/icu/). Below is the license | |
38 statement as specified by that software: | |
39 | |
40 | |
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
42 | |
43 | |
44 ICU License - ICU 1.8.1 and later | |
45 | |
46 COPYRIGHT AND PERMISSION NOTICE | |
47 | |
48 Copyright (c) 1995-2003 International Business Machines Corporation and | |
49 others. | |
50 | |
51 All rights reserved. | |
52 | |
53 Permission is hereby granted, free of charge, to any person obtaining a | |
54 copy of this software and associated documentation files (the | |
55 "Software"), to deal in the Software without restriction, including | |
56 without limitation the rights to use, copy, modify, merge, publish, | |
57 distribute, and/or sell copies of the Software, and to permit persons | |
58 to whom the Software is furnished to do so, provided that the above | |
59 copyright notice(s) and this permission notice appear in all copies of | |
60 the Software and that both the above copyright notice(s) and this | |
61 permission notice appear in supporting documentation. | |
62 | |
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | |
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT | |
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR | |
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL | |
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING | |
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, | |
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION | |
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
72 | |
73 Except as contained in this notice, the name of a copyright holder | |
74 shall not be used in advertising or otherwise to promote the sale, use | |
75 or other dealings in this Software without prior written authorization | |
76 of the copyright holder. | |
77 | |
78 ---------------------------------------------------------------------- | |
79 | |
80 All trademarks and registered trademarks mentioned herein are the | |
81 property of their respective owners. | |
82 | |
83 *******************************************************************************/ | |
84 | |
85 module dwtx.dwthelper.mangoicu.UNormalize; | |
86 | |
87 private import dwtx.dwthelper.mangoicu.ICU, | |
88 dwtx.dwthelper.mangoicu.UString, | |
89 dwtx.dwthelper.mangoicu.ULocale; | |
90 | |
91 /******************************************************************************* | |
92 | |
93 UNormalize transforms Unicode text into an equivalent composed or | |
94 decomposed form, allowing for easier sorting and searching | |
95 of text. UNormalize supports the standard normalization forms | |
96 described in http://www.unicode.org/unicode/reports/tr15/ | |
97 | |
98 Characters with accents or other adornments can be encoded | |
99 in several different ways in Unicode. For example, take the | |
100 character A-acute. In Unicode, this can be encoded as a single | |
101 character (the "composed" form): | |
102 | |
103 00C1 LATIN CAPITAL LETTER A WITH ACUTE | |
104 | |
105 or as two separate characters (the "decomposed" form): | |
106 | |
107 0041 LATIN CAPITAL LETTER A 0301 COMBINING ACUTE ACCENT | |
108 | |
109 To a user of your program, however, both of these sequences | |
110 should be treated as the same "user-level" character "A with | |
111 acute accent". When you are searching or comparing text, you | |
112 must ensure that these two sequences are treated equivalently. | |
113 In addition, you must handle characters with more than one | |
114 accent. Sometimes the order of a character's combining accents | |
115 is significant, while in other cases accent sequences in different | |
116 orders are really equivalent. | |
117 | |
118 Similarly, the string "ffi" can be encoded as three separate | |
119 letters: | |
120 | |
121 0066 LATIN SMALL LETTER F 0066 LATIN SMALL LETTER F | |
122 0069 LATIN SMALL LETTER I | |
123 | |
124 or as the single character | |
125 | |
126 FB03 LATIN SMALL LIGATURE FFI | |
127 | |
128 The ffi ligature is not a distinct semantic character, and strictly | |
129 speaking it shouldn't be in Unicode at all, but it was included for | |
130 compatibility with existing character sets that already provided it. | |
131 The Unicode standard identifies such characters by giving them | |
132 "compatibility" decompositions into the corresponding semantic | |
133 characters. When sorting and searching, you will often want to use | |
134 these mappings. | |
135 | |
136 unorm_normalize helps solve these problems by transforming text into | |
137 the canonical composed and decomposed forms as shown in the first | |
138 example above. In addition, you can have it perform compatibility | |
139 decompositions so that you can treat compatibility characters the | |
140 same as their equivalents. Finally, UNormalize rearranges | |
141 accents into the proper canonical order, so that you do not have | |
142 to worry about accent rearrangement on your own. | |
143 | |
144 Form FCD, "Fast C or D", is also designed for collation. It lets | |
145 an algorithm that works under "canonical closure" (as collation | |
146 does) operate on strings that are not necessarily normalized, | |
147 i.e., the algorithm treats precomposed characters and their | |
148 decomposed equivalents the same. | |
149 | |
150 It is not a normalization form because it does not provide for | |
151 uniqueness of representation. Multiple strings may be canonically | |
152 equivalent (their NFDs are identical) and may all conform to FCD | |
153 without being identical themselves. | |
154 | |
155 The form is defined such that the "raw decomposition", the | |
156 recursive canonical decomposition of each character, results | |
157 in a string that is canonically ordered. This means that | |
158 precomposed characters are allowed for as long as their | |
159 decompositions do not need canonical reordering. | |
160 | |
161 Its advantage for a process like collation is that all NFD | |
162 and most NFC texts - and many unnormalized texts - already | |
163 conform to FCD and do not need to be normalized (NFD) for | |
164 such a process. The FCD quick check will return UNORM_YES | |
165 for most strings in practice. | |
166 | |
167 For more details on FCD see the collation design document: | |
168 http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm | |
169 | |
170 ICU collation performs either NFD or FCD normalization | |
171 automatically if normalization is turned on for the collator | |
172 object. Beyond collation and string search, normalized strings | |
173 may be useful for string equivalence comparisons, transliteration/ | |
174 transcription, unique representations, etc. | |
175 | |
176 The W3C generally recommends exchanging texts in NFC. Note also | |
177 that most legacy character encodings use only precomposed forms | |
178 and often do not encode any combining marks by themselves. For | |
179 conversion to such character encodings the Unicode text needs to | |
180 be normalized to NFC. For more usage examples, see the Unicode | |
181 Standard Annex. | |
182 | |
183 See <A HREF="http://oss.software.ibm.com/icu/apiref/unorm_8h.html"> | |
184 this page</A> for full details. | |
185 | |
186 | |
187 *******************************************************************************/ | |
188 | |
class UNormalize : ICU
{
        /***********************************************************************

                Normalization modes. The numeric values are handed to
                ICU unchanged as the "mode" argument, so they have to
                stay in step with ICU's UNormalizationMode constants.

        ***********************************************************************/

        enum Mode
        {
                None    = 1,
                NFD     = 2,
                NFKD    = 3,
                NFC     = 4,
                Default = NFC,
                NFKC    = 5,
                FCD     = 6,
                Count
        }

        /***********************************************************************

                Outcome of a quick check, see check(). The raw value
                returned by ICU is cast directly to this type.

        ***********************************************************************/

        enum Check
        {
                No,
                Yes,
                Maybe
        }

        /***********************************************************************

                Normalization option bits. Unicode32 corresponds to
                ICU's UNORM_UNICODE_3_2 flag.

        ***********************************************************************/

        enum Options
        {
                None      = 0x00,
                Unicode32 = 0x20
        }

        /***********************************************************************

                Bit position at which unorm_compare() expects the
                normalization option bits (ICU's
                UNORM_COMPARE_NORM_OPTIONS_SHIFT).

        ***********************************************************************/

        private enum : uint
        {
                CompareNormShift = 20
        }

        /***********************************************************************

                Normalize a string. The string will be normalized
                according to the specified normalization mode and
                options, and the result is placed in dst.

        ***********************************************************************/

        static void normalize (UText src, UString dst, Mode mode, Options o = Options.None)
        {
                // callback invoked by UString.format with the output buffer;
                // the buffer parameter is named 'p' (as in concatenate) so
                // that it does not shadow the 'dst' argument
                uint fmt (wchar* p, uint len, inout Error e)
                {
                        return unorm_normalize (src.get.ptr, src.len, mode, o, p, len, e);
                }

                dst.format (&fmt, "failed to normalize");
        }

        /***********************************************************************

                Performing quick check on a string, to quickly determine
                if the string is in a particular normalization format.

                Three types of result can be returned: Yes, No or Maybe.
                Result Yes indicates that the argument string is in the
                desired normalized format, No determines that argument
                string is not in the desired normalized format. A Maybe
                result indicates that a more thorough check is required,
                the user may have to put the string in its normalized
                form and compare the results.

                The ICU status is handed to testError() along with the
                message "failed to perform normalization check".

        ***********************************************************************/

        static Check check (UText t, Mode mode, Options o = Options.None)
        {
                Error e;

                Check c = cast(Check) unorm_quickCheckWithOptions (t.get.ptr, t.len, mode, o, e);
                testError (e, "failed to perform normalization check");
                return c;
        }

        /***********************************************************************

                Test if a string is in a given normalization form.

                Unlike check(), this function returns a definitive result,
                never a "maybe". For NFD, NFKD, and FCD, both functions
                work exactly the same. For NFC and NFKC where quickCheck
                may return "maybe", this function will perform further
                tests to arrive at a TRUE/FALSE result.

        ***********************************************************************/

        static bool isNormalized (UText t, Mode mode, Options o = Options.None)
        {
                Error e;

                // ICU reports the answer as a byte-sized boolean
                byte b = unorm_isNormalizedWithOptions (t.get.ptr, t.len, mode, o, e);
                testError (e, "failed to perform normalization test");
                return b != 0;
        }

        /***********************************************************************

                Concatenate normalized strings, making sure that the result
                is normalized as well. If both the left and the right strings
                are in the normalization form according to "mode/options",
                then the result will be

                        dest=normalize(left+right, mode, options)

                With the input strings already being normalized, this function
                will use unorm_next() and unorm_previous() to find the adjacent
                end pieces of the input strings. Only the concatenation of these
                end pieces will be normalized and then concatenated with the
                remaining parts of the input strings.

                It is allowed to have dst==left to avoid copying the entire
                left string.

        ***********************************************************************/

        static void concatenate (UText left, UText right, UString dst, Mode mode, Options o = Options.None)
        {
                // callback invoked by UString.format with the output buffer
                uint fmt (wchar* p, uint len, inout Error e)
                {
                        return unorm_concatenate (left.get.ptr, left.len, right.get.ptr, right.len, p, len, mode, o, e);
                }

                dst.format (&fmt, "failed to concatenate");
        }

        /***********************************************************************

                Compare two strings for canonical equivalence. Further
                options include case-insensitive comparison and code
                point order (as opposed to code unit order).

                Canonical equivalence between two strings is defined as
                their normalized forms (NFD or NFC) being identical.
                This function compares strings incrementally instead of
                normalizing (and optionally case-folding) both strings
                entirely, improving performance significantly.

                Bulk normalization is only necessary if the strings do
                not fulfill the FCD conditions. Only in this case, and
                only if the strings are relatively long, is memory
                allocated temporarily. For FCD strings and short non-FCD
                strings there is no memory allocation.

                Returns the signed value reported by unorm_compare();
                per the ICU documentation it is negative, zero or
                positive as left orders before, equal to or after right.

                Note: unorm_compare() keeps its own flags in the low
                bits and expects normalization options shifted left by
                CompareNormShift. Options.Unicode32 is therefore moved
                into place here; any other bits in 'o' are passed to
                ICU untouched.

        ***********************************************************************/

        static int compare (UText left, UText right, Options o = Options.None)
        {
                Error e;

                uint bits = cast(uint) o;
                uint norm = bits & cast(uint) Options.Unicode32;
                uint opts = (bits & ~norm) | (norm << CompareNormShift);

                int i = unorm_compare (left.get.ptr, left.len, right.get.ptr, right.len, opts, e);
                testError (e, "failed to compare");
                return i;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

                'library' holds the handle returned by FunctionLoader.bind
                so that it can be passed to FunctionLoader.unbind later.

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                Function pointers for the ICU entry points used above;
                they are filled in by FunctionLoader.bind via 'targets'.

                unorm_compare is declared as returning int because ICU
                returns a signed (int32_t) ordering value.

        ***********************************************************************/

        private static extern (C)
        {
                uint function (wchar*, uint, uint, uint, wchar*, uint, inout Error) unorm_normalize;
                uint function (wchar*, uint, uint, uint, inout Error) unorm_quickCheckWithOptions;
                byte function (wchar*, uint, uint, uint, inout Error) unorm_isNormalizedWithOptions;
                uint function (wchar*, uint, wchar*, uint, wchar*, uint, uint, uint, inout Error) unorm_concatenate;
                int  function (wchar*, uint, wchar*, uint, uint, inout Error) unorm_compare;
        }

        /***********************************************************************

                Table pairing each function pointer above with the name
                of the ICU symbol it is bound to.

        ***********************************************************************/

        static FunctionLoader.Bind[] targets =
                [
                {cast(void**) &unorm_normalize,               "unorm_normalize"},
                {cast(void**) &unorm_quickCheckWithOptions,   "unorm_quickCheckWithOptions"},
                {cast(void**) &unorm_isNormalizedWithOptions, "unorm_isNormalizedWithOptions"},
                {cast(void**) &unorm_concatenate,             "unorm_concatenate"},
                {cast(void**) &unorm_compare,                 "unorm_compare"},
                ];

        /***********************************************************************

                Bind the function pointers from the icuuc library when
                the module is initialized.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuuc, targets);
        }

        /***********************************************************************

                Release the library handle when the module is torn down.

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}