Mercurial > projects > dwt-addons

diff dwtx/dwtxhelper/mangoicu/URegex.d @ 89:040da1cb0d76
Add a local copy of the mango ICU binding to work out the utf8 usability. Will hopefully go back into mango.
author: Frank Benoit <benoit@tionex.de>
date: Sun, 22 Jun 2008 22:57:31 +0200
children: 11e8159caf7a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dwtx/dwtxhelper/mangoicu/URegex.d	Sun Jun 22 22:57:31 2008 +0200
@@ -0,0 +1,700 @@
+/*******************************************************************************
+
+        @file URegex.d
+        
+        Copyright (c) 2004 Kris Bell
+        
+        This software is provided 'as-is', without any express or implied
+        warranty. In no event will the authors be held liable for damages
+        of any kind arising from the use of this software.
+        
+        Permission is hereby granted to anyone to use this software for any 
+        purpose, including commercial applications, and to alter it and/or 
+        redistribute it freely, subject to the following restrictions:
+        
+        1. The origin of this software must not be misrepresented; you must 
+           not claim that you wrote the original software. If you use this 
+           software in a product, an acknowledgment within documentation of 
+           said product would be appreciated but is not required.
+
+        2. Altered source versions must be plainly marked as such, and must 
+           not be misrepresented as being the original software.
+
+        3. This notice may not be removed or altered from any distribution
+           of the source.
+
+        4. Derivative works are permitted, but they must carry this notice
+           in full and credit the original source.
+
+
+                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+        @version        Initial version, November 2004      
+        @author         Kris
+
+        Note that this package and documentation is built around the ICU 
+        project (http://oss.software.ibm.com/icu/). Below is the license 
+        statement as specified by that software:
+
+
+                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+        ICU License - ICU 1.8.1 and later
+
+        COPYRIGHT AND PERMISSION NOTICE
+
+        Copyright (c) 1995-2003 International Business Machines Corporation and 
+        others.
+
+        All rights reserved.
+
+        Permission is hereby granted, free of charge, to any person obtaining a
+        copy of this software and associated documentation files (the
+        "Software"), to deal in the Software without restriction, including
+        without limitation the rights to use, copy, modify, merge, publish,
+        distribute, and/or sell copies of the Software, and to permit persons
+        to whom the Software is furnished to do so, provided that the above
+        copyright notice(s) and this permission notice appear in all copies of
+        the Software and that both the above copyright notice(s) and this
+        permission notice appear in supporting documentation.
+
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+        OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+        OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+        HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
+        INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
+        FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+        NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+        WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+        Except as contained in this notice, the name of a copyright holder
+        shall not be used in advertising or otherwise to promote the sale, use
+        or other dealings in this Software without prior written authorization
+        of the copyright holder.
+
+        ----------------------------------------------------------------------
+
+        All trademarks and registered trademarks mentioned herein are the 
+        property of their respective owners.
+
+*******************************************************************************/
+
+module dwtx.dwthelper.mangoicu.URegex;
+
+private import  dwtx.dwthelper.mangoicu.ICU;
+
+public  import  dwtx.dwthelper.mangoicu.ULocale,
+                dwtx.dwthelper.mangoicu.UString,
+                dwtx.dwthelper.mangoicu.UCollator,
+                dwtx.dwthelper.mangoicu.UBreakIterator;
+
+
+/*******************************************************************************
+
+        Set of slices to return for group matching. See URegex.groups()
+
+*******************************************************************************/
+
+class Groups : ICU
+{
+        // Slices filled in by URegex.groups(): g0 is the entire match and
+        // g1..g9 are the parenthesised sub-expressions. URegex.groups()
+        // takes the address of g0 and steps a pointer through the fields
+        // that follow, so they must stay declared together, in this order.
+        public  wchar[] g0,
+                        g1,
+                        g2,
+                        g3,
+                        g4,
+                        g5,
+                        g6,
+                        g7,
+                        g8,
+                        g9;
+}
+
+/*******************************************************************************
+
+        Apis for an engine that provides regular-expression searching of
+        UTF16 strings.
+
+        See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full
+        details.
+
+*******************************************************************************/
+
+class URegex : Groups
+{       
+        // native regex object, as returned by uregex_open() or uregex_clone();
+        // closed in the destructor
+        private Handle  handle;
+        // subject text most recently supplied through setText()
+        private UText   theText;
+
+        // Regex match-mode flags, passed to the constructor and reported
+        // back by getFlags()
+        public enum     Flag 
+                        {
+                        None            = 0,
+
+                        // Enable case insensitive matching
+                        CaseInsensitive = 2, 
+
+                        // Allow white space and comments within patterns
+                        Comments        = 4,
+
+                        // Control behavior of "$" and "^". If set, recognize 
+                        // line terminators within string, otherwise, match
+                        // only at start and end of input string.
+                        MultiLine       = 8,
+
+                        // If set, '.' matches line terminators, otherwise '.' 
+                        // matching stops at line end
+                        DotAll          = 32,
+                        
+                        // Forces normalization of pattern and strings
+                        CanonEq         = 128,  
+
+                        // If set, uses the Unicode TR 29 definition of word 
+                        // boundaries. Warning: Unicode word boundaries are 
+                        // quite different from traditional regular expression 
+                        // word boundaries. See http://unicode.org/reports/tr29/#Word_Boundaries
+                        UWord           = 256,
+                        }
+
+        /***********************************************************************
+
+                Compiles the regular expression in string form into an 
+                internal representation using the specified match mode 
+                flags. The resulting regular expression handle can then 
+                be used to perform various matching operations.
+
+        ***********************************************************************/
+
+        this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null)
+        {
+                Error status;
+
+                // compile the pattern; 'pe' is handed to ICU exactly as supplied
+                handle = uregex_open (pattern.ptr, pattern.length, flags, pe, status);
+                testError (status, "failed to open regex");
+
+                // begin with an empty subject string (the status of this
+                // call is not examined)
+                uregex_setText (handle, "", 0, status);
+        }
+
+        /***********************************************************************
+
+                Compiles the regular expression in string form into an 
+                internal representation using the specified match mode 
+                flags. The resulting regular expression handle can then 
+                be used to perform various matching operations.
+
+        ***********************************************************************/
+
+        this (UText pattern, Flag flags=Flag.None, ParseError* pe=null)
+        {
+                // forward to the wchar[] constructor using the content of 'pattern'
+                this (pattern.get, flags, pe);
+        }
+
+        /***********************************************************************
+
+                Internal constructor; used for cloning
+
+        ***********************************************************************/
+
+        private this (Handle handle)
+        {
+                Error status;
+
+                // adopt the supplied native handle (see clone())
+                this.handle = handle;
+
+                // begin with an empty subject string (the status of this
+                // call is not examined)
+                uregex_setText (this.handle, "", 0, status);
+        }
+
+        /***********************************************************************
+        
+                Close the regular expression, recovering all resources (memory) 
+                it was holding
+
+        ***********************************************************************/
+
+        ~this ()
+        {
+                // hand the native regex object back to ICU
+                // NOTE(review): the module destructor unbinds the ICU library;
+                // confirm no instance can be finalized after that has happened
+                uregex_close (handle);
+        }
+
+        /***********************************************************************
+        
+                Cloning a regular expression is faster than opening a second 
+                instance from the source form of the expression, and requires 
+                less memory.
+
+                Note that the current input string and the position of any 
+                matched text within it are not cloned; only the pattern itself 
+                and the match mode flags are copied.
+
+                Cloning can be particularly useful to threaded applications 
+                that perform multiple match operations in parallel. Each 
+                concurrent RE operation requires its own instance of a 
+                URegularExpression.
+
+        ***********************************************************************/
+
+        URegex clone ()
+        {
+                Error  status;
+                Handle copy = uregex_clone (handle, status);
+
+                testError (status, "failed to clone regex");
+
+                // wrap the duplicated native handle via the private constructor
+                return new URegex (copy);
+        }
+
+        /***********************************************************************
+
+                Return a copy of the source form of the pattern for this 
+                regular expression
+
+        ***********************************************************************/
+
+        UString getPattern ()
+        {
+                Error  status;
+                uint   extent;
+
+                // ICU hands back a pointer and reports the length via 'extent'
+                wchar* source = uregex_pattern (handle, extent, status);
+                testError (status, "failed to extract regex pattern");
+
+                wchar[] text = source [0..extent];
+                return new UString (text);
+        }
+
+        /***********************************************************************
+
+                Get the match mode flags that were specified when compiling 
+                this regular expression        
+
+        ***********************************************************************/
+
+        Flag getFlags ()
+        {
+                Error status;
+                uint  bits = uregex_flags (handle, status);
+
+                testError (status, "failed to get regex flags");
+
+                // reinterpret the raw bit set as our Flag enum
+                return cast(Flag) bits;
+        }
+
+        /***********************************************************************
+        
+                Set the subject text string upon which the regular expression 
+                will look for matches.
+
+                This function may be called any number of times, allowing the 
+                regular expression pattern to be applied to different strings.
+
+                Regular expression matching operations work directly on the 
+                application's string data. No copy is made. The subject string 
+                data must not be altered after calling this function until after 
+                all regular expression operations involving this string data are 
+                completed.
+
+                Zero length strings are permitted. In this case, no subsequent 
+                match operation will dereference the text string pointer.
+
+        ***********************************************************************/
+
+        void setText (UText t)
+        {
+                Error status;
+
+                // remember the text so getText(), groups() and split() can
+                // refer back to it
+                this.theText = t;
+
+                uregex_setText (handle, t.get.ptr, t.length, status);
+                testError (status, "failed to set regex text");
+        }
+
+        /***********************************************************************
+                
+                Get the subject text that is currently associated with this 
+                regular expression object. This simply returns whatever was
+                previously supplied via setText(). 
+
+                Note that this returns a read-only reference to the text.
+
+        ***********************************************************************/
+
+        UText getText ()
+        {
+                // the reference stored by setText() is returned as is
+                return this.theText;
+        }
+
+        /***********************************************************************
+
+                Return a set of slices representing the parenthesised groups.
+                This can be used in the following manner:               
+
+                @code
+                wchar[] msg;
+
+                if (regex.next())
+                    with (regex.groups())
+                          msg ~= g1 ~ ":" ~ g2;
+                @endcode
+
+                Note that g0 represents the entire match, whereas g1 through
+                g9 represent the parenthesised expressions.
+                
+        ***********************************************************************/
+
+        Groups groups ()
+        {
+                // the g0..g9 fields are filled by stepping a pointer from g0
+                wchar[]*        slot = &g0;
+                uint            count = groupCount();
+                wchar[]         content = theText.get();
+
+                // only g0..g9 exist, so capture groups past the ninth are ignored
+                if (count > 9)
+                    count = 9;
+
+                for (uint i=0; i <= count; ++slot, ++i)
+                    {
+                    uint from = start (i);
+                    uint to   = end (i);
+
+                    // uregex_start/uregex_end report -1 (uint.max through the
+                    // unsigned return type) for a group that took no part in
+                    // the match. Slicing with that value would be out of
+                    // bounds, so such groups are exposed as null slices.
+                    if (from == uint.max || to == uint.max)
+                        *slot = null;
+                    else
+                        *slot = content [from..to];
+                    }
+
+                // the slices live in this object (URegex derives from Groups)
+                return this;
+        }
+
+        /***********************************************************************
+
+                Extract the string for the specified matching expression or 
+                subexpression. UString 's' is the destination for the match.
+
+                Group #0 is the complete string of matched text. Group #1 is 
+                the text matched by the first set of capturing parentheses.
+        
+        ***********************************************************************/
+
+        void group (UString s, uint index)
+        {
+                // callback handed to s.format(): asks ICU to place the text of
+                // group 'index' into the buffer it is given
+                uint extract (wchar* buffer, uint capacity, inout Error status)
+                {
+                        return uregex_group (handle, index, buffer, capacity, status);
+                }
+
+                s.format (&extract, "failed to extract regex group text");
+        }
+
+        /***********************************************************************
+        
+                Get the number of capturing groups in this regular 
+                expression's pattern
+
+        ***********************************************************************/
+
+        uint groupCount ()
+        {
+                Error status;
+                uint  total = uregex_groupCount (handle, status);
+
+                testError (status, "failed to get regex group-count");
+                return total;
+        }
+
+        /***********************************************************************
+                
+                Returns the index in the input string of the start of the 
+                text matched by the specified capture group during the 
+                previous match operation.
+
+                Returns -1 (which appears as uint.max, because the return 
+                type is unsigned) if the capture group was not part of the 
+                last match. Group #0 refers to the complete range of matched 
+                text. Group #1 refers to the text matched by the first 
+                set of capturing parentheses.
+
+        ***********************************************************************/
+
+        uint start (uint index = 0)
+        {
+                Error status;
+                uint  offset = uregex_start (handle, index, status);
+
+                testError (status, "failed to get regex start");
+                return offset;
+        }
+
+        /***********************************************************************
+
+                Returns the index in the input string of the position 
+                following the end of the text matched by the specified 
+                capture group.
+
+                Returns -1 (which appears as uint.max, because the return 
+                type is unsigned) if the capture group was not part of the 
+                last match. Group #0 refers to the complete range of matched 
+                text. Group #1 refers to the text matched by the first 
+                set of capturing parentheses.
+        
+        ***********************************************************************/
+
+        uint end (uint index = 0)
+        {
+                Error status;
+                uint  offset = uregex_end (handle, index, status);
+
+                testError (status, "failed to get regex end");
+                return offset;
+        }
+
+        /***********************************************************************
+
+                Reset any saved state from the previous match.
+
+                Has the effect of causing uregex_findNext to begin at the 
+                specified index, and causing uregex_start(), uregex_end() 
+                and uregex_group() to return an error indicating that there 
+                is no match information available.
+        
+        ***********************************************************************/
+
+        void reset (uint startIndex)
+        {
+                Error status;
+
+                // discard saved match state; searching resumes at startIndex
+                uregex_reset (handle, startIndex, status);
+                testError (status, "failed to set regex next-index");
+        }
+
+        /***********************************************************************
+        
+                Attempts to match the input string, beginning at startIndex, 
+                against the pattern.
+
+                To succeed, the match must extend to the end of the input 
+                string
+
+        ***********************************************************************/
+
+        bool match (uint startIndex)
+        {
+                Error status;
+                bool  matched = uregex_matches (handle, startIndex, status);
+
+                testError (status, "failed while matching regex");
+                return matched;
+        }
+
+        /***********************************************************************
+
+                Attempts to match the input string, starting from the 
+                specified index, against the pattern.
+
+                The match may be of any length, and is not required to 
+                extend to the end of the input string. Contrast with match()        
+
+        ***********************************************************************/
+
+        bool probe (uint startIndex)
+        {
+                Error status;
+                bool  matched = uregex_lookingAt (handle, startIndex, status);
+
+                testError (status, "failed while looking at regex");
+                return matched;
+        }
+
+        /***********************************************************************
+                
+                Returns whether the text matches the search pattern, starting 
+                from the current position.
+
+                If startIndex is specified, the current position is moved to 
+                the specified location before the search is initiated.
+
+        ***********************************************************************/
+
+        bool next (uint startIndex = uint.max)
+        {
+                Error status;
+                bool  found;
+
+                // uint.max is the "no index given" marker: continue from the
+                // current position instead of repositioning first
+                if (startIndex == uint.max)
+                    found = uregex_findNext (handle, status);
+                else
+                    found = uregex_find (handle, startIndex, status);
+
+                testError (status, "failed on next regex");
+                return found;
+        }
+
+        /***********************************************************************
+        
+                Replaces every substring of the input that matches the pattern 
+                with the given replacement string.
+
+                This is a convenience function that provides a complete 
+                find-and-replace-all operation.
+
+                This method scans the input string looking for matches of 
+                the pattern. Input that is not part of any match is copied 
+                unchanged to the destination buffer. Matched regions are 
+                replaced in the output buffer by the replacement string. 
+                The replacement string may contain references to capture 
+                groups; these take the form of $1, $2, etc.
+
+                The provided 'result' will contain the results, and should
+                be set with a length sufficient to house the entire result.
+                Upon completion, the 'result' is shortened appropriately 
+                and the total extent (length) of the operation is returned. 
+                Set the initial length of 'result' using the UString method
+                truncate().
+
+                The returned extent should be checked to ensure it is not
+                longer than the length of 'result'. If it is longer, then
+                the result has been truncated.
+                
+        ***********************************************************************/
+
+        uint replaceAll (UText replace, UString result)
+        {
+                Error status;
+
+                // ICU writes into the storage already held by 'result'
+                uint extent = uregex_replaceAll (handle,
+                                                 replace.get.ptr, replace.length,
+                                                 result.get.ptr, result.length,
+                                                 status);
+                testError (status, "failed during regex replace");
+
+                // adjust 'result' to the extent ICU reported
+                result.truncate (extent);
+                return extent;
+        }
+
+        /***********************************************************************
+        
+                Replaces the first substring of the input that matches the 
+                pattern with the given replacement string.
+
+                This is a convenience function that provides a complete 
+                find-and-replace operation.
+
+                This method scans the input string looking for a match of 
+                the pattern. All input that is not part of the match is 
+                copied unchanged to the destination buffer. The matched 
+                region is replaced in the output buffer by the replacement 
+                string. The replacement string may contain references to 
+                capture groups; these take the form of $1, $2, etc
+
+                The provided 'result' will contain the results, and should
+                be set with a length sufficient to house the entire result.
+                Upon completion, the 'result' is shortened appropriately 
+                and the total extent (length) of the operation is returned. 
+                Set the initial length of 'result' using the UString method
+                truncate().
+
+                The returned extent should be checked to ensure it is not
+                longer than the length of 'result'. If it is longer, then
+                the result has been truncated.
+                
+        ***********************************************************************/
+
+        uint replaceFirst (UText replace, UString result)
+        {
+                Error status;
+
+                // ICU writes into the storage already held by 'result'
+                uint extent = uregex_replaceFirst (handle,
+                                                   replace.get.ptr, replace.length,
+                                                   result.get.ptr, result.length,
+                                                   status);
+                testError (status, "failed during regex replace");
+
+                // adjust 'result' to the extent ICU reported
+                result.truncate (extent);
+                return extent;
+        }
+
+        /***********************************************************************
+        
+                Split the text up into slices (fields), where each slice 
+                represents the text situated between each pattern matched
+                within the text. The pattern is expected to represent one
+                or more slice delimiters.
+
+        ***********************************************************************/
+
+        uint split (wchar[][] fields)
+        {
+                Error           status;
+                uint            cursor,         // where the pending field begins
+                                filled;         // number of fields stored so far
+                wchar[]         content = theText.get;
+
+                // NOTE(review): text following the last delimiter is never
+                // stored, and scanning continues from the regex's current
+                // position rather than from index 0 - confirm both are intended
+                for (;;)
+                    {
+                    if (filled >= fields.length)
+                        break;
+
+                    // stop at the first failed search or non-OK status
+                    if (! uregex_findNext (handle, status))
+                        break;
+                    if (status != status.OK)
+                        break;
+
+                    uint matchStart = start ();
+                    fields[filled] = content [cursor..matchStart];
+                    cursor = end ();
+
+                    // ignore leading delimiter: a match at index 0 leaves
+                    // the slot to be overwritten by the next field
+                    if (matchStart != 0)
+                        ++filled;
+                    }
+
+                testError (status, "failed during split");
+                return filled;
+        }
+
+
+        /***********************************************************************
+
+                Bind the ICU functions from a shared library. This is
+                complicated by the issues regarding D and DLLs on the
+                Windows platform
+        
+        ***********************************************************************/
+              
+        // value returned by FunctionLoader.bind(); passed back to
+        // FunctionLoader.unbind() in the static destructor
+        private static void* library;
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        // C-linkage function pointers, one per ICU uregex_* entry point used
+        // by this class. Their addresses are listed in 'targets' below and
+        // handed to FunctionLoader.bind() by the static constructor.
+        private static extern (C) 
+        {
+                Handle  function (wchar*, uint, uint, ParseError*, inout Error) uregex_open;
+                void    function (Handle) uregex_close;
+                Handle  function (Handle, inout Error) uregex_clone;
+                wchar*  function (Handle, inout uint, inout Error) uregex_pattern;
+                uint    function (Handle, inout Error) uregex_flags;
+                void    function (Handle, wchar*, uint, inout Error) uregex_setText;
+                wchar*  function (Handle, inout uint, inout Error) uregex_getText;
+                uint    function (Handle, uint, wchar*, uint, inout Error) uregex_group;
+                uint    function (Handle, inout Error) uregex_groupCount;
+                uint    function (Handle, uint, inout Error) uregex_start;
+                uint    function (Handle, uint, inout Error) uregex_end;
+                void    function (Handle, uint, inout Error) uregex_reset;
+                bool    function (Handle, uint, inout Error) uregex_matches;
+                bool    function (Handle, uint, inout Error) uregex_lookingAt;
+                bool    function (Handle, uint, inout Error) uregex_find;
+                bool    function (Handle, inout Error) uregex_findNext;
+                uint    function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceAll;
+                uint    function (Handle, wchar*, uint, wchar*, uint, inout Error) uregex_replaceFirst;
+        }
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        // Binding table given to FunctionLoader.bind(): each entry pairs the
+        // address of one function pointer declared above with a symbol name
+        // (presumably the name looked up in the ICU library - verify against
+        // FunctionLoader).
+        static  FunctionLoader.Bind[] targets = 
+                [
+                {cast(void**) &uregex_open,             "uregex_open"}, 
+                {cast(void**) &uregex_close,            "uregex_close"},
+                {cast(void**) &uregex_clone,            "uregex_clone"},
+                {cast(void**) &uregex_pattern,          "uregex_pattern"},
+                {cast(void**) &uregex_flags,            "uregex_flags"},
+                {cast(void**) &uregex_setText,          "uregex_setText"},
+                {cast(void**) &uregex_getText,          "uregex_getText"},
+                {cast(void**) &uregex_group,            "uregex_group"},
+                {cast(void**) &uregex_groupCount,       "uregex_groupCount"},
+                {cast(void**) &uregex_start,            "uregex_start"},
+                {cast(void**) &uregex_end,              "uregex_end"},
+                {cast(void**) &uregex_reset,            "uregex_reset"},
+                {cast(void**) &uregex_matches,          "uregex_matches"},
+                {cast(void**) &uregex_lookingAt,        "uregex_lookingAt"},
+                {cast(void**) &uregex_find,             "uregex_find"},
+                {cast(void**) &uregex_findNext,         "uregex_findNext"},
+                {cast(void**) &uregex_replaceAll,       "uregex_replaceAll"},
+                {cast(void**) &uregex_replaceFirst,     "uregex_replaceFirst"},
+                ];
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        static this ()
+        {
+                // bind every entry of 'targets' against the library identified
+                // by 'icuin', keeping the result for the static destructor
+                library = FunctionLoader.bind (icuin, targets);
+        }
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        static ~this ()
+        {
+                // release what the static constructor obtained from bind()
+                FunctionLoader.unbind (library);
+        }
+}
author	Frank Benoit <benoit@tionex.de>
date	Sun, 22 Jun 2008 22:57:31 +0200
parents
children	11e8159caf7a