Mercurial > projects > dwt-addons
diff dwtx/dwtxhelper/mangoicu/URegex.d @ 89:040da1cb0d76
Add a local copy of the mango ICU binding to work out the utf8 usability. Will hopefully go back into mango.
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sun, 22 Jun 2008 22:57:31 +0200 |
parents | |
children | 11e8159caf7a |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dwtx/dwtxhelper/mangoicu/URegex.d Sun Jun 22 22:57:31 2008 +0200 @@ -0,0 +1,700 @@ +/******************************************************************************* + + @file URegex.d + + Copyright (c) 2004 Kris Bell + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for damages + of any kind arising from the use of this software. + + Permission is hereby granted to anyone to use this software for any + purpose, including commercial applications, and to alter it and/or + redistribute it freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment within documentation of + said product would be appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any distribution + of the source. + + 4. Derivative works are permitted, but they must carry this notice + in full and credit the original source. + + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + @version Initial version, November 2004 + @author Kris + + Note that this package and documentation is built around the ICU + project (http://oss.software.ibm.com/icu/). Below is the license + statement as specified by that software: + + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + ICU License - ICU 1.8.1 and later + + COPYRIGHT AND PERMISSION NOTICE + + Copyright (c) 1995-2003 International Business Machines Corporation and + others. + + All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, and/or sell copies of the Software, and to permit persons + to whom the Software is furnished to do so, provided that the above + copyright notice(s) and this permission notice appear in all copies of + the Software and that both the above copyright notice(s) and this + permission notice appear in supporting documentation. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL + INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING + FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION + WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + Except as contained in this notice, the name of a copyright holder + shall not be used in advertising or otherwise to promote the sale, use + or other dealings in this Software without prior written authorization + of the copyright holder. + + ---------------------------------------------------------------------- + + All trademarks and registered trademarks mentioned herein are the + property of their respective owners. + +*******************************************************************************/ + +module dwtx.dwthelper.mangoicu.URegex; + +private import dwtx.dwthelper.mangoicu.ICU; + +public import dwtx.dwthelper.mangoicu.ULocale, + dwtx.dwthelper.mangoicu.UString, + dwtx.dwthelper.mangoicu.UCollator, + dwtx.dwthelper.mangoicu.UBreakIterator; + + +/******************************************************************************* + + Set of slices to return for group matching. See URegex.groups() + +*******************************************************************************/ + +class Groups : ICU +{ + public wchar[] g0, + g1, + g2, + g3, + g4, + g5, + g6, + g7, + g8, + g9; +} + +/******************************************************************************* + + Apis for an engine that provides regular-expression searching of + UTF16 strings. + + See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full + details. + +*******************************************************************************/ + +class URegex : Groups +{ + private Handle handle; + private UText theText; + + // Regex modes + public enum Flag + { + None = 0, + + // Enable case insensitive matching + CaseInsensitive = 2, + + // Allow white space and comments within patterns + Comments = 4, + + // Control behavior of "$" and "^" If set, recognize + // line terminators within string, otherwise, match + // only at start and end of input string. + MultiLine = 8, + + // If set, '.' matches line terminators, otherwise '.' + // matching stops at line end + DotAll = 32, + + // Forces normalization of pattern and strings + CanonEq = 128, + + // If set, uses the Unicode TR 29 definition of word + // boundaries. Warning: Unicode word boundaries are + // quite different from traditional regular expression + // word boundaries. See http://unicode.org/reports/tr29/#Word_Boundaries + UWord = 256, + } + + /*********************************************************************** + + Compiles the regular expression in string form into an + internal representation using the specified match mode + flags. The resulting regular expression handle can then + be used to perform various matching operations. + + ***********************************************************************/ + + this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null) + { + Error e; + + handle = uregex_open (pattern.ptr, pattern.length, flags, pe, e); + testError (e, "failed to open regex"); + uregex_setText (handle, "", 0, e); + } + + /*********************************************************************** + + Compiles the regular expression in string form into an + internal representation using the specified match mode + flags. The resulting regular expression handle can then + be used to perform various matching operations. + + ***********************************************************************/ + + this (UText pattern, Flag flags=Flag.None, ParseError* pe=null) + { + this (pattern.get, flags, pe); + } + + /*********************************************************************** + + Internal constructor; used for cloning + + ***********************************************************************/ + + private this (Handle handle) + { + Error e; + + this.handle = handle; + uregex_setText (handle, "", 0, e); + } + + /*********************************************************************** + + Close the regular expression, recovering all resources (memory) + it was holding + + ***********************************************************************/ + + ~this () + { + uregex_close (handle); + } + + /*********************************************************************** + + Cloning a regular expression is faster than opening a second + instance from the source form of the expression, and requires + less memory. + + Note that the current input string and the position of any + matched text within it are not cloned; only the pattern itself + and and the match mode flags are copied. + + Cloning can be particularly useful to threaded applications + that perform multiple match operations in parallel. Each + concurrent RE operation requires its own instance of a + URegularExpression. + + ***********************************************************************/ + + URegex clone () + { + Error e; + + Handle h = uregex_clone (handle, e); + testError (e, "failed to clone regex"); + return new URegex (h); + } + + /*********************************************************************** + + Return a copy of the source form of the pattern for this + regular expression + + ***********************************************************************/ + + UString getPattern () + { + Error e; + uint len; + + wchar* x = uregex_pattern (handle, len, e); + testError (e, "failed to extract regex pattern"); + return new UString (x[0..len]); + } + + /*********************************************************************** + + Get the match mode flags that were specified when compiling + this regular expression + + ***********************************************************************/ + + Flag getFlags () + { + Error e; + + Flag f = cast(Flag) uregex_flags (handle, e); + testError (e, "failed to get regex flags"); + return f; + } + + /*********************************************************************** + + Set the subject text string upon which the regular expression + will look for matches. + + This function may be called any number of times, allowing the + regular expression pattern to be applied to different strings. + + Regular expression matching operations work directly on the + application's string data. No copy is made. The subject string + data must not be altered after calling this function until after + all regular expression operations involving this string data are + completed. + + Zero length strings are permitted. In this case, no subsequent + match operation will dereference the text string pointer. + + ***********************************************************************/ + + void setText (UText t) + { + Error e; + + theText = t; + uregex_setText (handle, t.get.ptr, t.length, e); + testError (e, "failed to set regex text"); + } + + /*********************************************************************** + + Get the subject text that is currently associated with this + regular expression object. This simply returns whatever was + previously supplied via setText(). + + Note that this returns a read-only reference to the text. + + ***********************************************************************/ + + UText getText () + { + return theText; + } + + /*********************************************************************** + + Return a set of slices representing the parenthesised groups. + This can be used in the following manner: + + @code + wchar msg; + + if (regex.next()) + with (regex.groups()) + msg ~= g1 ~ ":" ~ g2 + @endcode + + Note that g0 represents the entire match, whereas g1 through + g9 represent the parenthesised expressions. + + ***********************************************************************/ + + Groups groups () + { + wchar[]* p = &g0; + uint count = groupCount(); + wchar[] content = theText.get(); + + if (count > 9) + count = 9; + for (uint i=0; i <= count; ++p, ++i) + *p = content [start(i)..end(i)]; + return this; + } + + /*********************************************************************** + + Extract the string for the specified matching expression or + subexpression. UString 's' is the destination for the match. + + Group #0 is the complete string of matched text. Group #1 is + the text matched by the first set of capturing parentheses. + + ***********************************************************************/ + + void group (UString s, uint index) + { + uint fmt (wchar* dst, uint length, inout Error e) + { + return uregex_group (handle, index, dst, length, e); + } + + s.format (&fmt, "failed to extract regex group text"); + } + + /*********************************************************************** + + Get the number of capturing groups in this regular + expression's pattern + + ***********************************************************************/ + + uint groupCount () + { + Error e; + + uint i = uregex_groupCount (handle, e); + testError (e, "failed to get regex group-count"); + return i; + } + + /*********************************************************************** + + Returns the index in the input string of the start of the + text matched by the specified capture group during the + previous match operation. + + Return -1 if the capture group was not part of the last + match. Group #0 refers to the complete range of matched + text. Group #1 refers to the text matched by the first + set of capturing parentheses + + ***********************************************************************/ + + uint start (uint index = 0) + { + Error e; + + uint i = uregex_start (handle, index, e); + testError (e, "failed to get regex start"); + return i; + } + + /*********************************************************************** + + Returns the index in the input string of the position + following the end of the text matched by the specified + capture group. + + Return -1 if the capture group was not part of the last + match. Group #0 refers to the complete range of matched + text. Group #1 refers to the text matched by the first + set of capturing parentheses. + + ***********************************************************************/ + + uint end (uint index = 0) + { + Error e; + + uint i = uregex_end (handle, index, e); + testError (e, "failed to get regex end"); + return i; + } + + /*********************************************************************** + + Reset any saved state from the previous match. + + Has the effect of causing uregex_findNext to begin at the + specified index, and causing uregex_start(), uregex_end() + and uregex_group() to return an error indicating that there + is no match information available. + + ***********************************************************************/ + + void reset (uint startIndex) + { + Error e; + + uregex_reset (handle, startIndex, e); + testError (e, "failed to set regex next-index"); + } + + /*********************************************************************** + + Attempts to match the input string, beginning at startIndex, + against the pattern. + + To succeed, the match must extend to the end of the input + string + + ***********************************************************************/ + + bool match (uint startIndex) + { + Error e; + + bool b = uregex_matches (handle, startIndex, e); + testError (e, "failed while matching regex"); + return b; + } + + /*********************************************************************** + + Attempts to match the input string, starting from the + specified index, against the pattern. + + The match may be of any length, and is not required to + extend to the end of the input string. Contrast with match() + + ***********************************************************************/ + + bool probe (uint startIndex) + { + Error e; + + bool b = uregex_lookingAt (handle, startIndex, e); + testError (e, "failed while looking at regex"); + return b; + } + + /*********************************************************************** + + Returns whether the text matches the search pattern, starting + from the current position. + + If startIndex is specified, the current position is moved to + the specified location before the seach is initiated. + + ***********************************************************************/ + + bool next (uint startIndex = uint.max) + { + Error e; + bool b; + + b = (startIndex == uint.max) ? uregex_findNext (handle, e) : + uregex_find (handle, startIndex, e); + + testError (e, "failed on next regex"); + return b; + } + + /*********************************************************************** + + Replaces every substring of the input that matches the pattern + with the given replacement string. + + This is a convenience function that provides a complete + find-and-replace-all operation. + + This method scans the input string looking for matches of + the pattern. Input that is not part of any match is copied + unchanged to the destination buffer. Matched regions are + replaced in the output buffer by the replacement string. + The replacement string may contain references to capture + groups; these take the form of $1, $2, etc. + + The provided 'result' will contain the results, and should + be set with a length sufficient to house the entire result. + Upon completion, the 'result' is shortened appropriately + and the total extent (length) of the operation is returned. + Set the initital length of 'result' using the UString method + truncate(). + + The returned extent should be checked to ensure it is not + longer than the length of 'result'. If it is longer, then + the result has been truncated. + + ***********************************************************************/ + + uint replaceAll (UText replace, UString result) + { + Error e; + + uint len = uregex_replaceAll (handle, replace.get.ptr, replace.length, result.get.ptr, result.length, e); + testError (e, "failed during regex replace"); + result.truncate (len); + return len; + } + + /*********************************************************************** + + Replaces the first substring of the input that matches the + pattern with the given replacement string. + + This is a convenience function that provides a complete + find-and-replace operation. + + This method scans the input string looking for a match of + the pattern. All input that is not part of the match is + copied unchanged to the destination buffer. The matched + region is replaced in the output buffer by the replacement + string. The replacement string may contain references to + capture groups; these take the form of $1, $2, etc + + The provided 'result' will contain the results, and should + be set with a length sufficient to house the entire result. + Upon completion, the 'result' is shortened appropriately + and the total extent (length) of the operation is returned. + Set the initital length of 'result' using the UString method + truncate(). + + The returned extent should be checked to ensure it is not + longer than the length of 'result'. If it is longer, then + the result has been truncated. + + ***********************************************************************/ + + uint replaceFirst (UText replace, UString result) + { + Error e; + + uint len = uregex_replaceFirst (handle, replace.get.ptr, replace.length, result.get.ptr, result.length, e); + testError (e, "failed during regex replace"); + result.truncate (len); + return len; + } + + /*********************************************************************** + + Split the text up into slices (fields), where each slice + represents the text situated between each pattern matched + within the text. The pattern is expected to represent one + or more slice delimiters. + + ***********************************************************************/ + + uint split (wchar[][] fields) + { + Error e; + uint pos, + count; + wchar[] content = theText.get; + + while (count < fields.length) + if (uregex_findNext (handle, e) && e == e.OK) + { + uint i = start(); + fields[count] = content[pos..i]; + pos = end (); + + // ignore leading delimiter + if (i) + ++count; + } + else + break; + + testError (e, "failed during split"); + return count; + } + + + /*********************************************************************** + + Bind the ICU functions from a shared library. This is + complicated by the issues regarding D and DLLs on the + Windows platform + + ***********************************************************************/ + + private static void* library; + + /*********************************************************************** + + ***********************************************************************/ + + private static extern (C) + { + Handle function (wchar*, uint, uint, ParseError*, inout Error) uregex_open; + void function (Handle) uregex_close; + Handle function (Handle, inout Error) uregex_clone; + wchar* function (Handle, inout uint, inout Error) uregex_pattern; + uint function (Handle, inout Error) uregex_flags; + void function (Handle, wchar*, uint, inout Error) uregex_setText; + wchar* function (Handle, inout uint, inout Error) uregex_getText; + uint function (Handle, uint, wchar*, uint, inout Error) uregex_group; + uint function (Handle, inout Error) uregex_groupCount; + uint function (Handle, uint, inout Error) uregex_start; + uint function (Handle, uint, inout Error) uregex_end; + void function (Handle, uint, inout Error) uregex_reset; + bool function (Handle, uint, inout Error) uregex_matches; + bool function (Handle, uint, inout Error) uregex_lookingAt; + bool function (Handle, uint, inout Error) uregex_find; + bool function (Handle, inout Error) uregex_findNext; + uint function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceAll; + uint function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceFirst; + } + + /*********************************************************************** + + ***********************************************************************/ + + static FunctionLoader.Bind[] targets = + [ + {cast(void**) &uregex_open, "uregex_open"}, + {cast(void**) &uregex_close, "uregex_close"}, + {cast(void**) &uregex_clone, "uregex_clone"}, + {cast(void**) &uregex_pattern, "uregex_pattern"}, + {cast(void**) &uregex_flags, "uregex_flags"}, + {cast(void**) &uregex_setText, "uregex_setText"}, + {cast(void**) &uregex_getText, "uregex_getText"}, + {cast(void**) &uregex_group, "uregex_group"}, + {cast(void**) &uregex_groupCount, "uregex_groupCount"}, + {cast(void**) &uregex_start, "uregex_start"}, + {cast(void**) &uregex_end, "uregex_end"}, + {cast(void**) &uregex_reset, "uregex_reset"}, + {cast(void**) &uregex_matches, "uregex_matches"}, + {cast(void**) &uregex_lookingAt, "uregex_lookingAt"}, + {cast(void**) &uregex_find, "uregex_find"}, + {cast(void**) &uregex_findNext, "uregex_findNext"}, + {cast(void**) &uregex_replaceAll, "uregex_replaceAll"}, + {cast(void**) &uregex_replaceFirst, "uregex_replaceFirst"}, + ]; + + /*********************************************************************** + + ***********************************************************************/ + + static this () + { + library = FunctionLoader.bind (icuin, targets); + } + + /*********************************************************************** + + ***********************************************************************/ + + static ~this () + { + FunctionLoader.unbind (library); + } +}