Mercurial > projects > dwt-addons
diff dwtx/dwtxhelper/mangoicu/UNormalize.d @ 89:040da1cb0d76
Add a local copy of the mango ICU binding to work out the utf8 usability. Will hopefully go back into mango.
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sun, 22 Jun 2008 22:57:31 +0200 |
parents | |
children | 11e8159caf7a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dwtx/dwtxhelper/mangoicu/UNormalize.d Sun Jun 22 22:57:31 2008 +0200 @@ -0,0 +1,391 @@ +/******************************************************************************* + + @file UNormalize.d + + Copyright (c) 2004 Kris Bell + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for damages + of any kind arising from the use of this software. + + Permission is hereby granted to anyone to use this software for any + purpose, including commercial applications, and to alter it and/or + redistribute it freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment within documentation of + said product would be appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any distribution + of the source. + + 4. Derivative works are permitted, but they must carry this notice + in full and credit the original source. + + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + @version Initial version, October 2004 + @author Kris + + Note that this package and documentation is built around the ICU + project (http://oss.software.ibm.com/icu/). Below is the license + statement as specified by that software: + + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + ICU License - ICU 1.8.1 and later + + COPYRIGHT AND PERMISSION NOTICE + + Copyright (c) 1995-2003 International Business Machines Corporation and + others. + + All rights reserved. 
+ + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, and/or sell copies of the Software, and to permit persons + to whom the Software is furnished to do so, provided that the above + copyright notice(s) and this permission notice appear in all copies of + the Software and that both the above copyright notice(s) and this + permission notice appear in supporting documentation. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL + INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING + FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION + WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + Except as contained in this notice, the name of a copyright holder + shall not be used in advertising or otherwise to promote the sale, use + or other dealings in this Software without prior written authorization + of the copyright holder. + + ---------------------------------------------------------------------- + + All trademarks and registered trademarks mentioned herein are the + property of their respective owners. 
+ +*******************************************************************************/ + +module dwtx.dwthelper.mangoicu.UNormalize; + +private import dwtx.dwthelper.mangoicu.ICU, + dwtx.dwthelper.mangoicu.UString, + dwtx.dwthelper.mangoicu.ULocale; + +/******************************************************************************* + + transforms Unicode text into an equivalent composed or + decomposed form, allowing for easier sorting and searching + of text. UNormalize supports the standard normalization forms + described in http://www.unicode.org/unicode/reports/tr15/ + + Characters with accents or other adornments can be encoded + in several different ways in Unicode. For example, take the + character A-acute. In Unicode, this can be encoded as a single + character (the "composed" form): + + 00C1 LATIN CAPITAL LETTER A WITH ACUTE + + or as two separate characters (the "decomposed" form): + + 0041 LATIN CAPITAL LETTER A 0301 COMBINING ACUTE ACCENT + + To a user of your program, however, both of these sequences + should be treated as the same "user-level" character "A with + acute accent". When you are searching or comparing text, you + must ensure that these two sequences are treated equivalently. + In addition, you must handle characters with more than one + accent. Sometimes the order of a character's combining accents + is significant, while in other cases accent sequences in different + orders are really equivalent. + + Similarly, the string "ffi" can be encoded as three separate + letters: + + 0066 LATIN SMALL LETTER F 0066 LATIN SMALL LETTER F + 0069 LATIN SMALL LETTER I + + or as the single character + + FB03 LATIN SMALL LIGATURE FFI + + The ffi ligature is not a distinct semantic character, and strictly + speaking it shouldn't be in Unicode at all, but it was included for + compatibility with existing character sets that already provided it. 
+ The Unicode standard identifies such characters by giving them + "compatibility" decompositions into the corresponding semantic + characters. When sorting and searching, you will often want to use + these mappings. + + unorm_normalize helps solve these problems by transforming text into + the canonical composed and decomposed forms as shown in the first + example above. In addition, you can have it perform compatibility + decompositions so that you can treat compatibility characters the + same as their equivalents. Finally, UNormalize rearranges + accents into the proper canonical order, so that you do not have + to worry about accent rearrangement on your own. + + Form FCD, "Fast C or D", is also designed for collation. It allows + working on strings that are not necessarily normalized with an + algorithm (like in collation) that works under "canonical closure", + i.e., it treats precomposed characters and their decomposed + equivalents the same. + + It is not a normalization form because it does not provide for + uniqueness of representation. Multiple strings may be canonically + equivalent (their NFDs are identical) and may all conform to FCD + without being identical themselves. + + The form is defined such that the "raw decomposition", the + recursive canonical decomposition of each character, results + in a string that is canonically ordered. This means that + precomposed characters are allowed as long as their + decompositions do not need canonical reordering. + + Its advantage for a process like collation is that all NFD + and most NFC texts - and many unnormalized texts - already + conform to FCD and do not need to be normalized (NFD) for + such a process. The FCD quick check will return UNORM_YES + for most strings in practice. 
+ + For more details on FCD see the collation design document: + http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm + + ICU collation performs either NFD or FCD normalization + automatically if normalization is turned on for the collator + object. Beyond collation and string search, normalized strings + may be useful for string equivalence comparisons, transliteration/ + transcription, unique representations, etc. + + The W3C generally recommends to exchange texts in NFC. Note also + that most legacy character encodings use only precomposed forms + and often do not encode any combining marks by themselves. For + conversion to such character encodings the Unicode text needs to + be normalized to NFC. For more usage examples, see the Unicode + Standard Annex. + + See <A HREF="http://oss.software.ibm.com/icu/apiref/unorm_8h.html"> + this page</A> for full details. + + +*******************************************************************************/ + +class UNormalize : ICU +{ + enum Mode + { + None = 1, + NFD = 2, + NFKD = 3, + NFC = 4, + Default = NFC, /* alias: same value as NFC */ + NFKC = 5, + FCD = 6, + Count /* no explicit value: takes the value following FCD */ + } + + enum Check /* check() casts the raw ICU quick-check result to this type */ + { + No, + Yes, + Maybe + } + + enum Options + { + None = 0x00, + Unicode32 = 0x20 /* presumably selects Unicode 3.2 behaviour - TODO confirm against the ICU headers */ + } + + /*********************************************************************** + + Normalize the string src according to the specified + normalization mode and options. The output is produced through + dst.format, which is handed the ICU call as a callback. + ***********************************************************************/ + + static void normalize (UText src, UString dst, Mode mode, Options o = Options.None) + { + uint fmt (wchar* dst, uint len, inout Error e) /* note: inside fmt, dst is this wchar* buffer and hides the UString parameter dst */ + { + return unorm_normalize (src.get.ptr, src.len, mode, o, dst, len, e); + } + + dst.format (&fmt, "failed to normalize"); /* here dst is the UString parameter again */ + } + + /*********************************************************************** + + Perform a quick check on a string, to determine rapidly + whether the string is in a particular normalization form. 
+ + Three types of result can be returned: Yes, No or Maybe. + Result Yes indicates that the argument string is in the + desired normalized format, No determines that argument + string is not in the desired normalized format. A Maybe + result indicates that a more thorough check is required, + the user may have to put the string in its normalized + form and compare the results. + + ***********************************************************************/ + + static Check check (UText t, Mode mode, Options o = Options.None) + { + Error e; + + Check c = cast(Check) unorm_quickCheckWithOptions (t.get.ptr, t.len, mode, o, e); /* raw ICU result reinterpreted as Check */ + testError (e, "failed to perform normalization check"); /* testError is defined outside this class body; presumably raises when e signals failure - confirm */ + return c; + } + + /*********************************************************************** + + Test if a string is in a given normalization form. + + Unlike check(), this function returns a definitive result, + never a "maybe". For NFD, NFKD, and FCD, both functions + work exactly the same. For NFC and NFKC where quickCheck + may return "maybe", this function will perform further + tests to arrive at a true/false result. + Returns true when the ICU routine yields a non-zero byte. + ***********************************************************************/ + + static bool isNormalized (UText t, Mode mode, Options o = Options.None) + { + Error e; + + byte b = unorm_isNormalizedWithOptions (t.get.ptr, t.len, mode, o, e); + testError (e, "failed to perform normalization test"); + return b != 0; + } + + /*********************************************************************** + + Concatenate normalized strings, making sure that the result + is normalized as well. If both the left and the right strings + are in the normalization form according to "mode/options", + then the result will be + + dest=normalize(left+right, mode, options) + + With the input strings already being normalized, this function + will use unorm_next() and unorm_previous() to find the adjacent + end pieces of the input strings. 
Only the concatenation of these + end pieces will be normalized and then concatenated with the + remaining parts of the input strings. + + It is allowed to have dst==left to avoid copying the entire + left string. + + ***********************************************************************/ + + static void concatenate (UText left, UText right, UString dst, Mode mode, Options o = Options.None) + { + uint fmt (wchar* p, uint len, inout Error e) + { + return unorm_concatenate (left.get.ptr, left.len, right.get.ptr, right.len, p, len, mode, o, e); + } + + dst.format (&fmt, "failed to concatenate"); /* same callback pattern as normalize() */ + } + + /*********************************************************************** + + Compare two strings for canonical equivalence. Further + options include case-insensitive comparison and code + point order (as opposed to code unit order). + + Canonical equivalence between two strings is defined as + their normalized forms (NFD or NFC) being identical. + This function compares strings incrementally instead of + normalizing (and optionally case-folding) both strings + entirely, improving performance significantly. + + Bulk normalization is only necessary if the strings do + not fulfill the FCD conditions. Only in this case, and + only if the strings are relatively long, is memory + allocated temporarily. For FCD strings and short non-FCD + strings there is no memory allocation. + NOTE(review): the Options enum in this class declares only None and Unicode32; the case-insensitive and code-point-order flags mentioned above have no named value here - confirm how callers are expected to pass them. + ***********************************************************************/ + + static int compare (UText left, UText right, Options o = Options.None) + { + Error e; + + int i = unorm_compare (left.get.ptr, left.len, right.get.ptr, right.len, o, e); /* the binding below is declared to return uint; the value is converted to int here */ + testError (e, "failed to compare"); + return i; + } + + + /*********************************************************************** + + Bind the ICU functions from a shared library. 
This is + complicated by the issues regarding D and DLLs on the + Windows platform + + ***********************************************************************/ + + private static void* library; + + /*********************************************************************** + C-linkage function pointers for the ICU unorm_* routines called by the methods of this class. + ***********************************************************************/ + + private static extern (C) + { + uint function (wchar*, uint, uint, uint, wchar*, uint, inout Error) unorm_normalize; + uint function (wchar*, uint, uint, uint, inout Error) unorm_quickCheckWithOptions; + byte function (wchar*, uint, uint, uint, inout Error) unorm_isNormalizedWithOptions; + uint function (wchar*, uint, wchar*, uint, wchar*, uint, uint, uint, inout Error) unorm_concatenate; + uint function (wchar*, uint, wchar*, uint, uint, inout Error) unorm_compare; + } + + /*********************************************************************** + Table pairing the address of each function pointer above with the symbol name handed to FunctionLoader.bind. + ***********************************************************************/ + + static FunctionLoader.Bind[] targets = + [ + {cast(void**) &unorm_normalize, "unorm_normalize"}, + {cast(void**) &unorm_quickCheckWithOptions, "unorm_quickCheckWithOptions"}, + {cast(void**) &unorm_isNormalizedWithOptions, "unorm_isNormalizedWithOptions"}, + {cast(void**) &unorm_concatenate, "unorm_concatenate"}, + {cast(void**) &unorm_compare, "unorm_compare"}, + ]; + + /*********************************************************************** + Static constructor: passes icuuc and targets to FunctionLoader.bind and keeps the returned handle in library. + ***********************************************************************/ + + static this () + { + library = FunctionLoader.bind (icuuc, targets); + } + + /*********************************************************************** + Static destructor: hands the stored library handle to FunctionLoader.unbind. + ***********************************************************************/ + + static ~this () + { + FunctionLoader.unbind (library); + } +}