Mercurial > projects > dwt-addons
diff dwtx/dwtxhelper/mangoicu/URegex.d @ 92:f05207c07a98
changed filetype to unix
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Mon, 07 Jul 2008 15:54:03 +0200 |
parents | 11e8159caf7a |
children |
line wrap: on
line diff
--- a/dwtx/dwtxhelper/mangoicu/URegex.d Mon Jul 07 15:53:07 2008 +0200 +++ b/dwtx/dwtxhelper/mangoicu/URegex.d Mon Jul 07 15:54:03 2008 +0200 @@ -1,700 +1,700 @@ -/******************************************************************************* - - @file URegex.d - - Copyright (c) 2004 Kris Bell - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for damages - of any kind arising from the use of this software. - - Permission is hereby granted to anyone to use this software for any - purpose, including commercial applications, and to alter it and/or - redistribute it freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must - not claim that you wrote the original software. If you use this - software in a product, an acknowledgment within documentation of - said product would be appreciated but is not required. - - 2. Altered source versions must be plainly marked as such, and must - not be misrepresented as being the original software. - - 3. This notice may not be removed or altered from any distribution - of the source. - - 4. Derivative works are permitted, but they must carry this notice - in full and credit the original source. - - - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - @version Initial version, November 2004 - @author Kris - - Note that this package and documentation is built around the ICU - project (http://oss.software.ibm.com/icu/). Below is the license - statement as specified by that software: - - - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - ICU License - ICU 1.8.1 and later - - COPYRIGHT AND PERMISSION NOTICE - - Copyright (c) 1995-2003 International Business Machines Corporation and - others. - - All rights reserved. 
- - Permission is hereby granted, free of charge, to any person obtaining a - copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, and/or sell copies of the Software, and to permit persons - to whom the Software is furnished to do so, provided that the above - copyright notice(s) and this permission notice appear in all copies of - the Software and that both the above copyright notice(s) and this - permission notice appear in supporting documentation. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT - OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL - INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING - FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, - NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION - WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - Except as contained in this notice, the name of a copyright holder - shall not be used in advertising or otherwise to promote the sale, use - or other dealings in this Software without prior written authorization - of the copyright holder. - - ---------------------------------------------------------------------- - - All trademarks and registered trademarks mentioned herein are the - property of their respective owners. 
- -*******************************************************************************/ - -module dwtx.dwtxhelper.mangoicu.URegex; - -private import dwtx.dwtxhelper.mangoicu.ICU; - -public import dwtx.dwtxhelper.mangoicu.ULocale, - dwtx.dwtxhelper.mangoicu.UString, - dwtx.dwtxhelper.mangoicu.UCollator, - dwtx.dwtxhelper.mangoicu.UBreakIterator; - - -/******************************************************************************* - - Set of slices to return for group matching. See URegex.groups() - -*******************************************************************************/ - -class Groups : ICU -{ - public wchar[] g0, - g1, - g2, - g3, - g4, - g5, - g6, - g7, - g8, - g9; -} - -/******************************************************************************* - - Apis for an engine that provides regular-expression searching of - UTF16 strings. - - See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full - details. - -*******************************************************************************/ - -class URegex : Groups -{ - private Handle handle; - private UStringView theText; - - // Regex modes - public enum Flag - { - None = 0, - - // Enable case insensitive matching - CaseInsensitive = 2, - - // Allow white space and comments within patterns - Comments = 4, - - // Control behavior of "$" and "^" If set, recognize - // line terminators within string, otherwise, match - // only at start and end of input string. - MultiLine = 8, - - // If set, '.' matches line terminators, otherwise '.' - // matching stops at line end - DotAll = 32, - - // Forces normalization of pattern and strings - CanonEq = 128, - - // If set, uses the Unicode TR 29 definition of word - // boundaries. Warning: Unicode word boundaries are - // quite different from traditional regular expression - // word boundaries. 
See http://unicode.org/reports/tr29/#Word_Boundaries - UWord = 256, - } - - /*********************************************************************** - - Compiles the regular expression in string form into an - internal representation using the specified match mode - flags. The resulting regular expression handle can then - be used to perform various matching operations. - - ***********************************************************************/ - - this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null) - { - UErrorCode e; - - handle = uregex_open (pattern.ptr, pattern.length, flags, pe, e); - testError (e, "failed to open regex"); - uregex_setText (handle, "", 0, e); - } - - /*********************************************************************** - - Compiles the regular expression in string form into an - internal representation using the specified match mode - flags. The resulting regular expression handle can then - be used to perform various matching operations. - - ***********************************************************************/ - - this (UStringView pattern, Flag flags=Flag.None, ParseError* pe=null) - { - this (pattern.get, flags, pe); - } - - /*********************************************************************** - - Internal constructor; used for cloning - - ***********************************************************************/ - - private this (Handle handle) - { - UErrorCode e; - - this.handle = handle; - uregex_setText (handle, "", 0, e); - } - - /*********************************************************************** - - Close the regular expression, recovering all resources (memory) - it was holding - - ***********************************************************************/ - - ~this () - { - uregex_close (handle); - } - - /*********************************************************************** - - Cloning a regular expression is faster than opening a second - instance from the source form of the expression, and requires - less 
memory. - - Note that the current input string and the position of any - matched text within it are not cloned; only the pattern itself - and and the match mode flags are copied. - - Cloning can be particularly useful to threaded applications - that perform multiple match operations in parallel. Each - concurrent RE operation requires its own instance of a - URegularExpression. - - ***********************************************************************/ - - URegex clone () - { - UErrorCode e; - - Handle h = uregex_clone (handle, e); - testError (e, "failed to clone regex"); - return new URegex (h); - } - - /*********************************************************************** - - Return a copy of the source form of the pattern for this - regular expression - - ***********************************************************************/ - - UString getPattern () - { - UErrorCode e; - uint len; - - wchar* x = uregex_pattern (handle, len, e); - testError (e, "failed to extract regex pattern"); - return new UString (x[0..len]); - } - - /*********************************************************************** - - Get the match mode flags that were specified when compiling - this regular expression - - ***********************************************************************/ - - Flag getFlags () - { - UErrorCode e; - - Flag f = cast(Flag) uregex_flags (handle, e); - testError (e, "failed to get regex flags"); - return f; - } - - /*********************************************************************** - - Set the subject text string upon which the regular expression - will look for matches. - - This function may be called any number of times, allowing the - regular expression pattern to be applied to different strings. - - Regular expression matching operations work directly on the - application's string data. No copy is made. 
The subject string - data must not be altered after calling this function until after - all regular expression operations involving this string data are - completed. - - Zero length strings are permitted. In this case, no subsequent - match operation will dereference the text string pointer. - - ***********************************************************************/ - - void setText (UStringView t) - { - UErrorCode e; - - theText = t; - uregex_setText (handle, t.get.ptr, t.length, e); - testError (e, "failed to set regex text"); - } - - /*********************************************************************** - - Get the subject text that is currently associated with this - regular expression object. This simply returns whatever was - previously supplied via setText(). - - Note that this returns a read-only reference to the text. - - ***********************************************************************/ - - UStringView getText () - { - return theText; - } - - /*********************************************************************** - - Return a set of slices representing the parenthesised groups. - This can be used in the following manner: - - @code - wchar msg; - - if (regex.next()) - with (regex.groups()) - msg ~= g1 ~ ":" ~ g2 - @endcode - - Note that g0 represents the entire match, whereas g1 through - g9 represent the parenthesised expressions. - - ***********************************************************************/ - - Groups groups () - { - wchar[]* p = &g0; - uint count = groupCount(); - wchar[] content = theText.get(); - - if (count > 9) - count = 9; - for (uint i=0; i <= count; ++p, ++i) - *p = content [start(i)..end(i)]; - return this; - } - - /*********************************************************************** - - Extract the string for the specified matching expression or - subexpression. UString 's' is the destination for the match. - - Group #0 is the complete string of matched text. 
Group #1 is - the text matched by the first set of capturing parentheses. - - ***********************************************************************/ - - void group (UString s, uint index) - { - uint fmt (wchar* dst, uint length, inout UErrorCode e) - { - return uregex_group (handle, index, dst, length, e); - } - - s.format (&fmt, "failed to extract regex group text"); - } - - /*********************************************************************** - - Get the number of capturing groups in this regular - expression's pattern - - ***********************************************************************/ - - uint groupCount () - { - UErrorCode e; - - uint i = uregex_groupCount (handle, e); - testError (e, "failed to get regex group-count"); - return i; - } - - /*********************************************************************** - - Returns the index in the input string of the start of the - text matched by the specified capture group during the - previous match operation. - - Return -1 if the capture group was not part of the last - match. Group #0 refers to the complete range of matched - text. Group #1 refers to the text matched by the first - set of capturing parentheses - - ***********************************************************************/ - - uint start (uint index = 0) - { - UErrorCode e; - - uint i = uregex_start (handle, index, e); - testError (e, "failed to get regex start"); - return i; - } - - /*********************************************************************** - - Returns the index in the input string of the position - following the end of the text matched by the specified - capture group. - - Return -1 if the capture group was not part of the last - match. Group #0 refers to the complete range of matched - text. Group #1 refers to the text matched by the first - set of capturing parentheses. 
- - ***********************************************************************/ - - uint end (uint index = 0) - { - UErrorCode e; - - uint i = uregex_end (handle, index, e); - testError (e, "failed to get regex end"); - return i; - } - - /*********************************************************************** - - Reset any saved state from the previous match. - - Has the effect of causing uregex_findNext to begin at the - specified index, and causing uregex_start(), uregex_end() - and uregex_group() to return an error indicating that there - is no match information available. - - ***********************************************************************/ - - void reset (uint startIndex) - { - UErrorCode e; - - uregex_reset (handle, startIndex, e); - testError (e, "failed to set regex next-index"); - } - - /*********************************************************************** - - Attempts to match the input string, beginning at startIndex, - against the pattern. - - To succeed, the match must extend to the end of the input - string - - ***********************************************************************/ - - bool match (uint startIndex) - { - UErrorCode e; - - bool b = uregex_matches (handle, startIndex, e); - testError (e, "failed while matching regex"); - return b; - } - - /*********************************************************************** - - Attempts to match the input string, starting from the - specified index, against the pattern. - - The match may be of any length, and is not required to - extend to the end of the input string. 
Contrast with match() - - ***********************************************************************/ - - bool probe (uint startIndex) - { - UErrorCode e; - - bool b = uregex_lookingAt (handle, startIndex, e); - testError (e, "failed while looking at regex"); - return b; - } - - /*********************************************************************** - - Returns whether the text matches the search pattern, starting - from the current position. - - If startIndex is specified, the current position is moved to - the specified location before the seach is initiated. - - ***********************************************************************/ - - bool next (uint startIndex = uint.max) - { - UErrorCode e; - bool b; - - b = (startIndex == uint.max) ? uregex_findNext (handle, e) : - uregex_find (handle, startIndex, e); - - testError (e, "failed on next regex"); - return b; - } - - /*********************************************************************** - - Replaces every substring of the input that matches the pattern - with the given replacement string. - - This is a convenience function that provides a complete - find-and-replace-all operation. - - This method scans the input string looking for matches of - the pattern. Input that is not part of any match is copied - unchanged to the destination buffer. Matched regions are - replaced in the output buffer by the replacement string. - The replacement string may contain references to capture - groups; these take the form of $1, $2, etc. - - The provided 'result' will contain the results, and should - be set with a length sufficient to house the entire result. - Upon completion, the 'result' is shortened appropriately - and the total extent (length) of the operation is returned. - Set the initital length of 'result' using the UString method - truncate(). - - The returned extent should be checked to ensure it is not - longer than the length of 'result'. If it is longer, then - the result has been truncated. 
- - ***********************************************************************/ - - uint replaceAll (UStringView replace, UString result) - { - UErrorCode e; - - uint len = uregex_replaceAll (handle, replace.get.ptr, replace.length, result.get.ptr, result.length, e); - testError (e, "failed during regex replace"); - result.truncate (len); - return len; - } - - /*********************************************************************** - - Replaces the first substring of the input that matches the - pattern with the given replacement string. - - This is a convenience function that provides a complete - find-and-replace operation. - - This method scans the input string looking for a match of - the pattern. All input that is not part of the match is - copied unchanged to the destination buffer. The matched - region is replaced in the output buffer by the replacement - string. The replacement string may contain references to - capture groups; these take the form of $1, $2, etc - - The provided 'result' will contain the results, and should - be set with a length sufficient to house the entire result. - Upon completion, the 'result' is shortened appropriately - and the total extent (length) of the operation is returned. - Set the initital length of 'result' using the UString method - truncate(). - - The returned extent should be checked to ensure it is not - longer than the length of 'result'. If it is longer, then - the result has been truncated. 
- - ***********************************************************************/ - - uint replaceFirst (UStringView replace, UString result) - { - UErrorCode e; - - uint len = uregex_replaceFirst (handle, replace.get.ptr, replace.length, result.get.ptr, result.length, e); - testError (e, "failed during regex replace"); - result.truncate (len); - return len; - } - - /*********************************************************************** - - Split the text up into slices (fields), where each slice - represents the text situated between each pattern matched - within the text. The pattern is expected to represent one - or more slice delimiters. - - ***********************************************************************/ - - uint split (wchar[][] fields) - { - UErrorCode e; - uint pos, - count; - wchar[] content = theText.get; - - while (count < fields.length) - if (uregex_findNext (handle, e) && e == e.OK) - { - uint i = start(); - fields[count] = content[pos..i]; - pos = end (); - - // ignore leading delimiter - if (i) - ++count; - } - else - break; - - testError (e, "failed during split"); - return count; - } - - - /*********************************************************************** - - Bind the ICU functions from a shared library. 
This is - complicated by the issues regarding D and DLLs on the - Windows platform - - ***********************************************************************/ - - private static void* library; - - /*********************************************************************** - - ***********************************************************************/ - - private static extern (C) - { - Handle function (wchar*, uint, uint, ParseError*, inout UErrorCode) uregex_open; - void function (Handle) uregex_close; - Handle function (Handle, inout UErrorCode) uregex_clone; - wchar* function (Handle, inout uint, inout UErrorCode) uregex_pattern; - uint function (Handle, inout UErrorCode) uregex_flags; - void function (Handle, wchar*, uint, inout UErrorCode) uregex_setText; - wchar* function (Handle, inout uint, inout UErrorCode) uregex_getText; - uint function (Handle, uint, wchar*, uint, inout UErrorCode) uregex_group; - uint function (Handle, inout UErrorCode) uregex_groupCount; - uint function (Handle, uint, inout UErrorCode) uregex_start; - uint function (Handle, uint, inout UErrorCode) uregex_end; - void function (Handle, uint, inout UErrorCode) uregex_reset; - bool function (Handle, uint, inout UErrorCode) uregex_matches; - bool function (Handle, uint, inout UErrorCode) uregex_lookingAt; - bool function (Handle, uint, inout UErrorCode) uregex_find; - bool function (Handle, inout UErrorCode) uregex_findNext; - uint function (Handle, wchar*, uint, wchar*, uint, inout UErrorCode) uregex_replaceAll; - uint function (Handle, wchar*, uint, wchar*, uint, inout UErrorCode) uregex_replaceFirst; - } - - /*********************************************************************** - - ***********************************************************************/ - - static FunctionLoader.Bind[] targets = - [ - {cast(void**) &uregex_open, "uregex_open"}, - {cast(void**) &uregex_close, "uregex_close"}, - {cast(void**) &uregex_clone, "uregex_clone"}, - {cast(void**) &uregex_pattern, "uregex_pattern"}, 
- {cast(void**) &uregex_flags, "uregex_flags"}, - {cast(void**) &uregex_setText, "uregex_setText"}, - {cast(void**) &uregex_getText, "uregex_getText"}, - {cast(void**) &uregex_group, "uregex_group"}, - {cast(void**) &uregex_groupCount, "uregex_groupCount"}, - {cast(void**) &uregex_start, "uregex_start"}, - {cast(void**) &uregex_end, "uregex_end"}, - {cast(void**) &uregex_reset, "uregex_reset"}, - {cast(void**) &uregex_matches, "uregex_matches"}, - {cast(void**) &uregex_lookingAt, "uregex_lookingAt"}, - {cast(void**) &uregex_find, "uregex_find"}, - {cast(void**) &uregex_findNext, "uregex_findNext"}, - {cast(void**) &uregex_replaceAll, "uregex_replaceAll"}, - {cast(void**) &uregex_replaceFirst, "uregex_replaceFirst"}, - ]; - - /*********************************************************************** - - ***********************************************************************/ - - static this () - { - library = FunctionLoader.bind (icuin, targets); - } - - /*********************************************************************** - - ***********************************************************************/ - - static ~this () - { - FunctionLoader.unbind (library); - } -} +/******************************************************************************* + + @file URegex.d + + Copyright (c) 2004 Kris Bell + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for damages + of any kind arising from the use of this software. + + Permission is hereby granted to anyone to use this software for any + purpose, including commercial applications, and to alter it and/or + redistribute it freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment within documentation of + said product would be appreciated but is not required. + + 2. 
Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any distribution + of the source. + + 4. Derivative works are permitted, but they must carry this notice + in full and credit the original source. + + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + @version Initial version, November 2004 + @author Kris + + Note that this package and documentation is built around the ICU + project (http://oss.software.ibm.com/icu/). Below is the license + statement as specified by that software: + + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + ICU License - ICU 1.8.1 and later + + COPYRIGHT AND PERMISSION NOTICE + + Copyright (c) 1995-2003 International Business Machines Corporation and + others. + + All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, and/or sell copies of the Software, and to permit persons + to whom the Software is furnished to do so, provided that the above + copyright notice(s) and this permission notice appear in all copies of + the Software and that both the above copyright notice(s) and this + permission notice appear in supporting documentation. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL + INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING + FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION + WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + Except as contained in this notice, the name of a copyright holder + shall not be used in advertising or otherwise to promote the sale, use + or other dealings in this Software without prior written authorization + of the copyright holder. + + ---------------------------------------------------------------------- + + All trademarks and registered trademarks mentioned herein are the + property of their respective owners. + +*******************************************************************************/ + +module dwtx.dwtxhelper.mangoicu.URegex; + +private import dwtx.dwtxhelper.mangoicu.ICU; + +public import dwtx.dwtxhelper.mangoicu.ULocale, + dwtx.dwtxhelper.mangoicu.UString, + dwtx.dwtxhelper.mangoicu.UCollator, + dwtx.dwtxhelper.mangoicu.UBreakIterator; + + +/******************************************************************************* + + Set of slices to return for group matching. See URegex.groups() + +*******************************************************************************/ + +class Groups : ICU +{ + public wchar[] g0, + g1, + g2, + g3, + g4, + g5, + g6, + g7, + g8, + g9; +} + +/******************************************************************************* + + Apis for an engine that provides regular-expression searching of + UTF16 strings. + + See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full + details. 
+ +*******************************************************************************/ + +class URegex : Groups +{ + private Handle handle; + private UStringView theText; + + // Regex modes + public enum Flag + { + None = 0, + + // Enable case insensitive matching + CaseInsensitive = 2, + + // Allow white space and comments within patterns + Comments = 4, + + // Control behavior of "$" and "^" If set, recognize + // line terminators within string, otherwise, match + // only at start and end of input string. + MultiLine = 8, + + // If set, '.' matches line terminators, otherwise '.' + // matching stops at line end + DotAll = 32, + + // Forces normalization of pattern and strings + CanonEq = 128, + + // If set, uses the Unicode TR 29 definition of word + // boundaries. Warning: Unicode word boundaries are + // quite different from traditional regular expression + // word boundaries. See http://unicode.org/reports/tr29/#Word_Boundaries + UWord = 256, + } + + /*********************************************************************** + + Compiles the regular expression in string form into an + internal representation using the specified match mode + flags. The resulting regular expression handle can then + be used to perform various matching operations. + + ***********************************************************************/ + + this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null) + { + UErrorCode e; + + handle = uregex_open (pattern.ptr, pattern.length, flags, pe, e); + testError (e, "failed to open regex"); + uregex_setText (handle, "", 0, e); + } + + /*********************************************************************** + + Compiles the regular expression in string form into an + internal representation using the specified match mode + flags. The resulting regular expression handle can then + be used to perform various matching operations. 
+ + ***********************************************************************/ + + this (UStringView pattern, Flag flags=Flag.None, ParseError* pe=null) + { + this (pattern.get, flags, pe); + } + + /*********************************************************************** + + Internal constructor; used for cloning + + ***********************************************************************/ + + private this (Handle handle) + { + UErrorCode e; + + this.handle = handle; + uregex_setText (handle, "", 0, e); + } + + /*********************************************************************** + + Close the regular expression, recovering all resources (memory) + it was holding + + ***********************************************************************/ + + ~this () + { + uregex_close (handle); + } + + /*********************************************************************** + + Cloning a regular expression is faster than opening a second + instance from the source form of the expression, and requires + less memory. + + Note that the current input string and the position of any + matched text within it are not cloned; only the pattern itself + and and the match mode flags are copied. + + Cloning can be particularly useful to threaded applications + that perform multiple match operations in parallel. Each + concurrent RE operation requires its own instance of a + URegularExpression. 
+ + ***********************************************************************/ + + URegex clone () + { + UErrorCode e; + + Handle h = uregex_clone (handle, e); + testError (e, "failed to clone regex"); + return new URegex (h); + } + + /*********************************************************************** + + Return a copy of the source form of the pattern for this + regular expression + + ***********************************************************************/ + + UString getPattern () + { + UErrorCode e; + uint len; + + wchar* x = uregex_pattern (handle, len, e); + testError (e, "failed to extract regex pattern"); + return new UString (x[0..len]); + } + + /*********************************************************************** + + Get the match mode flags that were specified when compiling + this regular expression + + ***********************************************************************/ + + Flag getFlags () + { + UErrorCode e; + + Flag f = cast(Flag) uregex_flags (handle, e); + testError (e, "failed to get regex flags"); + return f; + } + + /*********************************************************************** + + Set the subject text string upon which the regular expression + will look for matches. + + This function may be called any number of times, allowing the + regular expression pattern to be applied to different strings. + + Regular expression matching operations work directly on the + application's string data. No copy is made. The subject string + data must not be altered after calling this function until after + all regular expression operations involving this string data are + completed. + + Zero length strings are permitted. In this case, no subsequent + match operation will dereference the text string pointer. 
+ + ***********************************************************************/ + + void setText (UStringView t) + { + UErrorCode e; + + theText = t; + uregex_setText (handle, t.get.ptr, t.length, e); + testError (e, "failed to set regex text"); + } + + /*********************************************************************** + + Get the subject text that is currently associated with this + regular expression object. This simply returns whatever was + previously supplied via setText(). + + Note that this returns a read-only reference to the text. + + ***********************************************************************/ + + UStringView getText () + { + return theText; + } + + /*********************************************************************** + + Return a set of slices representing the parenthesised groups. + This can be used in the following manner: + + @code + wchar msg; + + if (regex.next()) + with (regex.groups()) + msg ~= g1 ~ ":" ~ g2 + @endcode + + Note that g0 represents the entire match, whereas g1 through + g9 represent the parenthesised expressions. + + ***********************************************************************/ + + Groups groups () + { + wchar[]* p = &g0; + uint count = groupCount(); + wchar[] content = theText.get(); + + if (count > 9) + count = 9; + for (uint i=0; i <= count; ++p, ++i) + *p = content [start(i)..end(i)]; + return this; + } + + /*********************************************************************** + + Extract the string for the specified matching expression or + subexpression. UString 's' is the destination for the match. + + Group #0 is the complete string of matched text. Group #1 is + the text matched by the first set of capturing parentheses. 
***********************************************************************/

        void group (UString s, uint index)
        {
                // adapter handed to UString.format: fills the supplied
                // buffer with the text of the requested capture group
                uint extract (wchar* dst, uint length, inout UErrorCode e)
                {
                        return uregex_group (handle, index, dst, length, e);
                }

                s.format (&extract, "failed to extract regex group text");
        }

        /***********************************************************************

                Ask ICU how many capturing groups the pattern of this
                regular expression contains.

        ***********************************************************************/

        uint groupCount ()
        {
                UErrorCode status;

                uint total = uregex_groupCount (handle, status);
                testError (status, "failed to get regex group-count");
                return total;
        }

        /***********************************************************************

                Index, within the input string, at which the text matched
                by the given capture group begins, as of the previous
                match operation.

                A group that was not part of the last match is reported
                as -1; since the return type is uint, that value shows up
                as uint.max. Group #0 covers the complete range of matched
                text, group #1 the first set of capturing parentheses.

        ***********************************************************************/

        uint start (uint index = 0)
        {
                UErrorCode status;

                uint offset = uregex_start (handle, index, status);
                testError (status, "failed to get regex start");
                return offset;
        }

        /***********************************************************************

                Returns the index in the input string of the position
                following the end of the text matched by the specified
                capture group.

                Return -1 if the capture group was not part of the last
                match. Group #0 refers to the complete range of matched
                text. Group #1 refers to the text matched by the first
                set of capturing parentheses.
***********************************************************************/

        uint end (uint index = 0)
        {
                UErrorCode status;

                uint offset = uregex_end (handle, index, status);
                testError (status, "failed to get regex end");
                return offset;
        }

        /***********************************************************************

                Discard whatever state the previous match left behind.

                Afterwards uregex_findNext begins at the given index, and
                uregex_start(), uregex_end() and uregex_group() return an
                error indicating that no match information is available.

        ***********************************************************************/

        void reset (uint startIndex)
        {
                UErrorCode status;

                uregex_reset (handle, startIndex, status);
                testError (status, "failed to set regex next-index");
        }

        /***********************************************************************

                Try to match the input string against the pattern,
                beginning at startIndex.

                The match only succeeds when it extends all the way to
                the end of the input string.

        ***********************************************************************/

        bool match (uint startIndex)
        {
                UErrorCode status;

                bool matched = uregex_matches (handle, startIndex, status);
                testError (status, "failed while matching regex");
                return matched;
        }

        /***********************************************************************

                Attempts to match the input string, starting from the
                specified index, against the pattern.

                The match may be of any length, and is not required to
                extend to the end of the input string.
Contrast with match()

        ***********************************************************************/

        bool probe (uint startIndex)
        {
                UErrorCode status;

                bool matched = uregex_lookingAt (handle, startIndex, status);
                testError (status, "failed while looking at regex");
                return matched;
        }

        /***********************************************************************

                Search for the next occurrence of the pattern in the text
                and report whether one was found.

                With no argument (startIndex left at uint.max) the search
                continues from the current position via uregex_findNext.
                Any other startIndex moves the current position to that
                location before the search is initiated, via uregex_find.

        ***********************************************************************/

        bool next (uint startIndex = uint.max)
        {
                UErrorCode status;
                bool       found;

                if (startIndex == uint.max)
                    found = uregex_findNext (handle, status);
                else
                   found = uregex_find (handle, startIndex, status);

                testError (status, "failed on next regex");
                return found;
        }

        /***********************************************************************

                Replaces every substring of the input that matches the pattern
                with the given replacement string.

                This is a convenience function that provides a complete
                find-and-replace-all operation.

                This method scans the input string looking for matches of
                the pattern. Input that is not part of any match is copied
                unchanged to the destination buffer. Matched regions are
                replaced in the output buffer by the replacement string.
                The replacement string may contain references to capture
                groups; these take the form of $1, $2, etc.

                The provided 'result' will contain the results, and should
                be set with a length sufficient to house the entire result.
                Upon completion, the 'result' is shortened appropriately
                and the total extent (length) of the operation is returned.
                Set the initial length of 'result' using the UString method
                truncate().

                The returned extent should be checked to ensure it is not
                longer than the length of 'result'. If it is longer, then
                the result has been truncated.
***********************************************************************/

        uint replaceAll (UStringView replace, UString result)
        {
                UErrorCode status;

                wchar* replacement = replace.get.ptr;
                wchar* destination = result.get.ptr;

                // the current length of 'result' doubles as the capacity
                // of the destination buffer handed to ICU
                uint extent = uregex_replaceAll (handle, replacement, replace.length, destination, result.length, status);
                testError (status, "failed during regex replace");

                result.truncate (extent);
                return extent;
        }

        /***********************************************************************

                Replaces the first substring of the input that matches the
                pattern with the given replacement string.

                This is a convenience function that provides a complete
                find-and-replace operation.

                This method scans the input string looking for a match of
                the pattern. All input that is not part of the match is
                copied unchanged to the destination buffer. The matched
                region is replaced in the output buffer by the replacement
                string. The replacement string may contain references to
                capture groups; these take the form of $1, $2, etc

                The provided 'result' will contain the results, and should
                be set with a length sufficient to house the entire result.
                Upon completion, the 'result' is shortened appropriately
                and the total extent (length) of the operation is returned.
                Set the initial length of 'result' using the UString method
                truncate().

                The returned extent should be checked to ensure it is not
                longer than the length of 'result'. If it is longer, then
                the result has been truncated.
***********************************************************************/

        uint replaceFirst (UStringView replace, UString result)
        {
                UErrorCode status;

                wchar* replacement = replace.get.ptr;
                wchar* destination = result.get.ptr;

                // the current length of 'result' doubles as the capacity
                // of the destination buffer handed to ICU
                uint extent = uregex_replaceFirst (handle, replacement, replace.length, destination, result.length, status);
                testError (status, "failed during regex replace");

                result.truncate (extent);
                return extent;
        }

        /***********************************************************************

                Cut the subject text into slices (fields), using each
                match of the pattern as a delimiter. Each stored slice is
                the text lying between the end of the previous match (or
                the start of the text) and the start of the current one.

                Slices are written into 'fields' until it is full or no
                further match is found; the number of fields stored is
                returned. A delimiter sitting at index zero does not
                produce a field. Text following the final delimiter is
                not stored as a field.

        ***********************************************************************/

        uint split (wchar[][] fields)
        {
                UErrorCode status;
                uint       cursor,
                           stored;
                wchar[]    content = theText.get;

                while (stored < fields.length)
                      {
                      // stop on the first failed search or non-OK status
                      if (! uregex_findNext (handle, status) || status != status.OK)
                            break;

                      uint from = start();
                      fields[stored] = content[cursor .. from];
                      cursor = end ();

                      // ignore leading delimiter: the slot is kept (and
                      // later overwritten) when the match began at zero
                      if (from)
                          ++stored;
                      }

                testError (status, "failed during split");
                return stored;
        }


        /***********************************************************************

                Bind the ICU functions from a shared library.
This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        private static void* library;

        /***********************************************************************

                C-linkage function pointers for the ICU regex entry
                points. They are filled in through the 'targets' table
                when the static constructor binds the library.

        ***********************************************************************/

        private static extern (C)
        {
                Handle function (wchar*, uint, uint, ParseError*, inout UErrorCode)   uregex_open;
                void   function (Handle)                                              uregex_close;
                Handle function (Handle, inout UErrorCode)                            uregex_clone;
                wchar* function (Handle, inout uint, inout UErrorCode)                uregex_pattern;
                uint   function (Handle, inout UErrorCode)                            uregex_flags;
                void   function (Handle, wchar*, uint, inout UErrorCode)              uregex_setText;
                wchar* function (Handle, inout uint, inout UErrorCode)                uregex_getText;
                uint   function (Handle, uint, wchar*, uint, inout UErrorCode)        uregex_group;
                uint   function (Handle, inout UErrorCode)                            uregex_groupCount;
                uint   function (Handle, uint, inout UErrorCode)                      uregex_start;
                uint   function (Handle, uint, inout UErrorCode)                      uregex_end;
                void   function (Handle, uint, inout UErrorCode)                      uregex_reset;
                bool   function (Handle, uint, inout UErrorCode)                      uregex_matches;
                bool   function (Handle, uint, inout UErrorCode)                      uregex_lookingAt;
                bool   function (Handle, uint, inout UErrorCode)                      uregex_find;
                bool   function (Handle, inout UErrorCode)                            uregex_findNext;
                uint   function (Handle, wchar*, uint, wchar*, uint, inout UErrorCode) uregex_replaceAll;
                uint   function (Handle, wchar*, uint, wchar*, uint, inout UErrorCode) uregex_replaceFirst;
        }

        /***********************************************************************

                Binding table: pairs the address of each function pointer
                declared above with the name of the symbol to look up.

        ***********************************************************************/

        static FunctionLoader.Bind[] targets =
                [
                {cast(void**) &uregex_open,         "uregex_open"},
                {cast(void**) &uregex_close,        "uregex_close"},
                {cast(void**) &uregex_clone,        "uregex_clone"},
                {cast(void**) &uregex_pattern,      "uregex_pattern"},
                {cast(void**) &uregex_flags,        "uregex_flags"},
                {cast(void**) &uregex_setText,      "uregex_setText"},
                {cast(void**) &uregex_getText,      "uregex_getText"},
                {cast(void**) &uregex_group,        "uregex_group"},
                {cast(void**) &uregex_groupCount,   "uregex_groupCount"},
                {cast(void**) &uregex_start,        "uregex_start"},
                {cast(void**) &uregex_end,          "uregex_end"},
                {cast(void**) &uregex_reset,        "uregex_reset"},
                {cast(void**) &uregex_matches,      "uregex_matches"},
                {cast(void**) &uregex_lookingAt,    "uregex_lookingAt"},
                {cast(void**) &uregex_find,         "uregex_find"},
                {cast(void**) &uregex_findNext,     "uregex_findNext"},
                {cast(void**) &uregex_replaceAll,   "uregex_replaceAll"},
                {cast(void**) &uregex_replaceFirst, "uregex_replaceFirst"},
                ];

        /***********************************************************************

                Static constructor: binds every entry of 'targets'
                against the library identified by 'icuin' and keeps the
                value returned by the loader for the later unbind.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuin, targets);
        }

        /***********************************************************************

                Static destructor: hands the stored library reference
                back to the loader.

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}