Mercurial > projects > dwt2

diff com.ibm.icu/src/com/ibm/icu/mangoicu/UString.d @ 92:ebefa5c2eab4
moving ICU bindings to com.ibm.icu
author: Frank Benoit <benoit@tionex.de>
date: Sun, 19 Apr 2009 13:49:38 +0200
parents: base/src/java/mangoicu/UString.d@1bf55a6eb092
children: 536e43f63c81
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/com.ibm.icu/src/com/ibm/icu/mangoicu/UString.d	Sun Apr 19 13:49:38 2009 +0200
@@ -0,0 +1,1508 @@
+/*******************************************************************************
+
+        @file UString.d
+
+        Copyright (c) 2004 Kris Bell
+
+        This software is provided 'as-is', without any express or implied
+        warranty. In no event will the authors be held liable for damages
+        of any kind arising from the use of this software.
+
+        Permission is hereby granted to anyone to use this software for any
+        purpose, including commercial applications, and to alter it and/or
+        redistribute it freely, subject to the following restrictions:
+
+        1. The origin of this software must not be misrepresented; you must
+           not claim that you wrote the original software. If you use this
+           software in a product, an acknowledgment within documentation of
+           said product would be appreciated but is not required.
+
+        2. Altered source versions must be plainly marked as such, and must
+           not be misrepresented as being the original software.
+
+        3. This notice may not be removed or altered from any distribution
+           of the source.
+
+        4. Derivative works are permitted, but they must carry this notice
+           in full and credit the original source.
+
+
+                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+        @version        Initial version, October 2004
+        @author         Kris
+
+        Note that this package and documentation is built around the ICU
+        project (http://oss.software.ibm.com/icu/). Below is the license
+        statement as specified by that software:
+
+
+                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+        ICU License - ICU 1.8.1 and later
+
+        COPYRIGHT AND PERMISSION NOTICE
+
+        Copyright (c) 1995-2003 International Business Machines Corporation and
+        others.
+
+        All rights reserved.
+
+        Permission is hereby granted, free of charge, to any person obtaining a
+        copy of this software and associated documentation files (the
+        "Software"), to deal in the Software without restriction, including
+        without limitation the rights to use, copy, modify, merge, publish,
+        distribute, and/or sell copies of the Software, and to permit persons
+        to whom the Software is furnished to do so, provided that the above
+        copyright notice(s) and this permission notice appear in all copies of
+        the Software and that both the above copyright notice(s) and this
+        permission notice appear in supporting documentation.
+
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+        OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+        OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+        HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
+        INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
+        FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+        NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+        WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+        Except as contained in this notice, the name of a copyright holder
+        shall not be used in advertising or otherwise to promote the sale, use
+        or other dealings in this Software without prior written authorization
+        of the copyright holder.
+
+        ----------------------------------------------------------------------
+
+        All trademarks and registered trademarks mentioned herein are the
+        property of their respective owners.
+
+*******************************************************************************/
+
+module com.ibm.icu.mangoicu.UString;
+
+private import  com.ibm.icu.mangoicu.ICU,
+                com.ibm.icu.mangoicu.UChar,
+                com.ibm.icu.mangoicu.ULocale;
+import java.lang.util;
+/*******************************************************************************
+
+*******************************************************************************/
+
+// C runtime memmove: copies 'bytes' bytes from src to dst; the regions may
+// overlap. Declared with the C prototype (size_t length, void* result) so the
+// call matches the C ABI on targets where size_t is wider than uint.
+private extern (C) void* memmove (void* dst, void* src, size_t bytes);
+
+/*******************************************************************************
+
+        Bind to the IReadable and IWritable interfaces if we're building
+        along with the mango.io package
+
+*******************************************************************************/
+
+// Isolated is set unconditionally here, so the first branch below is the one
+// that gets compiled; the else-branch holds the mango.io bindings.
+version=Isolated;
+version (Isolated)
+        {
+        // empty placeholder interfaces: UStringView lists ITextOther and
+        // UString lists IStringOther as a base in either configuration
+        private interface ITextOther   {}
+        private interface IStringOther {}
+        }
+     else
+        {
+        private import  com.ibm.icu.mangoicu.UMango;
+
+        private import  mango.io.model.IReader,
+                        mango.io.model.IWriter;
+
+        // same two names, but bound to the mango.io writable/readable contracts
+        private interface ITextOther   : IWritable {}
+        private interface IStringOther : IReadable {}
+        }
+
+
+/*******************************************************************************
+
+        UString is a string class that stores Unicode characters directly
+        and provides similar functionality as the Java String class.
+
+        In ICU, a Unicode string consists of 16-bit Unicode code units.
+        A Unicode character may be stored with either one code unit --
+        which is the most common case -- or with a matched pair of
+        special code units ("surrogates"). The data type for code units
+        is UChar.
+
+        For single-character handling, a Unicode character code point is
+        a value in the range 0..0x10ffff. ICU uses the UChar32 type for
+        code points.
+
+        Indexes and offsets into and lengths of strings always count code
+        units, not code points. This is the same as with multi-byte char*
+        strings in traditional string handling. Operations on partial
+        strings typically do not test for code point boundaries. If necessary,
+        the user needs to take care of such boundaries by testing for the code
+        unit values or by using functions like getChar32Start()
+        and getChar32Limit()
+
+        UString methods are more lenient with regard to input parameter values
+        than other ICU APIs. In particular:
+
+        - If indexes are out of bounds for a UString object (< 0 or > length)
+          then they are "pinned" to the nearest boundary.
+
+        - If primitive string pointer values (e.g., const wchar* or char*) for
+          input strings are null, then those input string parameters are treated
+          as if they pointed to an empty string. However, this is not the case
+          for char* parameters for charset names or other IDs.
+
+*******************************************************************************/
+
+class UString : UStringView, IStringOther
+{
+        // method-style names for the operator overloads declared in this class:
+        // append() is opCat, setCharAt() is opIndexAssign
+        alias opCat             append;
+        alias opIndexAssign     setCharAt;
+
+        /***********************************************************************
+
+                Create an empty UString with the specified available space
+
+        ***********************************************************************/
+
+        this (uint space = 0)
+        {
+                // the buffer is sized here, so it may be written in place
+                this.mutable = true;
+                this.content.length = space;
+        }
+
+        /***********************************************************************
+
+                Create a UString upon the provided content. If said content
+                is immutable (read-only) then you might consider setting the
+                'mutable' parameter to false. Doing so will avoid allocating
+                heap-space for the content until it is modified.
+
+        ***********************************************************************/
+
+        this (CString16 content, bool mutable = true)
+        {
+                // setTo() decides whether the array is duplicated or aliased
+                this.setTo (content, mutable);
+        }
+
+        /***********************************************************************
+
+                Create a UString via the content of a UStringView. Note that the
+                default is to assume the content is immutable (read-only).
+
+        ***********************************************************************/
+
+        this (UStringView other, bool mutable = false)
+        {
+                // only the valid part (0..len) of the view is handed on
+                this (other.get (), mutable);
+        }
+
+        /***********************************************************************
+
+                Create a UString via the content of a UString. If said content
+                is immutable (read-only) then you might consider setting the
+                'mutable' parameter to false. Doing so will avoid allocating
+                heap-space for the content until it is modified via UString
+                methods.
+
+        ***********************************************************************/
+
+        this (UString other, bool mutable = true)
+        {
+                // only the valid part (0..len) of the other string is handed on
+                this (other.get (), mutable);
+        }
+
+        /***********************************************************************
+
+                Support for reading content via the IO system
+
+        ***********************************************************************/
+
+        version (Isolated){}
+        else
+        {
+                /***************************************************************
+
+                        Internal adapter to handle loading and conversion
+                        of UString content. Once constructed, this may be
+                        used as the target for an IReader. Alternatively,
+                        invoke the load() method with an IBuffer of choice.
+
+                ***************************************************************/
+
+                class UStringDecoder : StringDecoder16
+                {
+                        // the string that receives the decoded content
+                        private UString s;
+
+                        // construct a decoder on the given UString
+                        this (UConverter c, uint bytes, UString s)
+                        {
+                                super (c, bytes);
+                                this.s = s;
+                        }
+
+                        // IReadable adapter to perform the conversion
+                        protected void read (IReader r)
+                        {
+                                load (r.buffer);
+                        }
+
+                        // read from the provided buffer until we
+                        // either have all the content, or an eof
+                        // condition throws an exception. Decoded
+                        // content is written from offset zero of the
+                        // target, and its length is set on completion.
+                        package void load (IBuffer b)
+                        {
+                                uint produced = super.read (b, s.content);
+                                while (toGo)
+                                      {
+                                      // expand() measures free space from s.len, so
+                                      // publish the progress made so far; otherwise
+                                      // a full buffer is not grown and the next read
+                                      // is handed an empty slice
+                                      s.len = produced;
+                                      s.expand (toGo);
+                                      produced += super.read (b, s.content[produced..$]);
+                                      }
+                                s.len = produced;
+                        }
+                }
+
+                /***************************************************************
+
+                        Another constructor for loading known content length
+                        into a UString.
+
+                ***************************************************************/
+
+                this (IBuffer buffer, uint contentLength, UConverter cvt)
+                {
+                        // reserve the space first, then let a one-off decoder fill it
+                        this (contentLength);
+                        (new UStringDecoder (cvt, contentLength, this)).load (buffer);
+                }
+
+                /***************************************************************
+
+                        Read as many bytes from the input as is necessary
+                        to produce the expected number of wchar elements.
+                        This uses the default wchar handler, which can be
+                        altered by binding a StringDecoder to the IReader
+                        in use (see UMango for details).
+
+                        We're mutable, so ensure we don't mess with the
+                        IO buffers. Interestingly, changing the length
+                        of a D array will account for slice assignments
+                        (it checks the pointer to see if it's a starting
+                         point in the pool). Unfortunately, that doesn't
+                        catch the case where a slice starts at offset 0,
+                        which is where IBuffer slices may come from.
+
+                        To be safe, we ask the allocator in use whether
+                        the content it provided can be mutated or not.
+                        Note that this is not necessary for UStringView, since
+                        that is a read-only construct.
+
+                ***************************************************************/
+
+                void read (IReader r)
+                {
+                        // the reader supplies the array; everything else derives from it
+                        r.get (content);
+
+                        // ask the allocator whether that array may be written in place
+                        mutable = r.getAllocator.isMutable (content);
+                        len = content.length;
+                }
+
+                /***************************************************************
+
+                        Return a streaming decoder that can be used to
+                        populate this UString with a specified number of
+                        input bytes.
+
+                        This differs from the above read() method in the
+                        way content is read: in the above case, exactly
+                        the specified number of wchar elements will be
+                        converted from the input, whereas in this case
+                        a variable number of wchar elements are converted
+                        until 'bytes' have been read from the input. This
+                        is useful in those cases where the original number
+                        of elements has been lost, and only the resultant
+                        converted byte-count remains (a la HTTP).
+
+                        The returned StringDecoder is one-shot only. You may
+                        reuse it (both the converter and the byte count) via
+                        its reset() method.
+
+                        One applies the resultant converter directly with an
+                        IReader like so:
+
+                        @code
+                        UString s = ...;
+                        IReader r = ...;
+
+                        // r >> s.createDecoder(cvt, bytes);
+                        r.get (s.createDecoder(cvt, bytes));
+                        @endcode
+
+                        which will read the specified number of bytes from
+                        the input and convert them to an appropriate number
+                        of wchars within the UString.
+
+                ***************************************************************/
+
+                StringDecoder createDecoder (UConverter c, uint bytes)
+                {
+                        // each call yields a new decoder targeting this string
+                        UStringDecoder decoder = new UStringDecoder (c, bytes, this);
+                        return decoder;
+                }
+        }
+
+        /***********************************************************************
+
+                Append text to this UString
+
+        ***********************************************************************/
+
+        UString opCat (UStringView other)
+        {
+                // append only the valid part of the other text
+                wchar[] text = other.get;
+                return opCat (text);
+        }
+
+        /***********************************************************************
+
+                Append partial text to this UString
+
+        ***********************************************************************/
+
+        UString opCat (UStringView other, uint start, uint len=uint.max)
+        {
+                // the range is pinned against the *other* text, not this one
+                other.pinIndices (start, len);
+
+                wchar[] piece = other.content [start .. start + len];
+                return opCat (piece);
+        }
+
+        /***********************************************************************
+
+                Append a single character to this UString
+
+        ***********************************************************************/
+
+        UString opCat (wchar chr)
+        {
+                // treat the parameter as a one-element run of code units
+                wchar* single = &chr;
+                return opCat (single, 1);
+        }
+
+        /***********************************************************************
+
+                Append text to this UString
+
+        ***********************************************************************/
+
+        UString opCat (wchar[] chars)
+        {
+                // hand the array to the pointer/count appender
+                uint count = chars.length;
+                return opCat (chars.ptr, count);
+        }
+
+        /***********************************************************************
+
+                Converts a sequence of UTF-8 bytes to UChars (UTF-16)
+
+        ***********************************************************************/
+
+        UString opCat (char[] chars)
+        {
+                // formatter callback handed to format(): converts the UTF-8
+                // input into dst and returns the length that u_strFromUTF8
+                // reports through its out-parameter
+                uint utf8ToUtf16 (wchar* dst, uint len, inout UErrorCode e)
+                {
+                        uint produced;
+
+                        u_strFromUTF8 (dst, len, &produced, chars.ptr, chars.length, e);
+                        return produced;
+                }
+
+                // reserve one code unit per input byte before converting
+                expand (chars.length);
+                return format (&utf8ToUtf16, "failed to append UTF char[]");
+        }
+
+        /***********************************************************************
+
+                Set a section of this UString to the specified character
+
+        ***********************************************************************/
+
+        UString setTo (wchar chr, uint start=0, uint len=uint.max)
+        {
+                // note: the 'len' parameter hides the member of the same name
+                pinIndices (start, len);
+
+                // an aliased buffer is replaced by a private one before writing
+                if (mutable == false)
+                    realloc ();
+
+                uint end = start + len;
+                content [start .. end] = chr;
+                return this;
+        }
+
+        /***********************************************************************
+
+                Set the content to the provided array. Parameter 'mutable'
+                specifies whether the given array is likely to change. If
+                not, the array is aliased until such time this UString is
+                altered.
+
+        ***********************************************************************/
+
+        UString setTo (CString16 chars, bool mutable = true)
+        {
+                this.mutable = mutable;
+                len = chars.length;
+
+                // mutable: take a private copy; otherwise alias the caller's array
+                if (mutable)
+                    content = chars.dup;
+                else
+                   content = cast(wchar[])chars;
+
+                return this;
+        }
+
+        /***********************************************************************
+
+                Replace the content of this UString. If the new content
+                is immutable (read-only) then you might consider setting the
+                'mutable' parameter to false. Doing so will avoid allocating
+                heap-space for the content until it is modified via one of
+                these methods.
+
+        ***********************************************************************/
+
+        UString setTo (UStringView other, bool mutable = true)
+        {
+                // only the valid part of the other text becomes the new content
+                wchar[] text = other.get;
+                return setTo (text, mutable);
+        }
+
+        /***********************************************************************
+
+                Replace the content of this UString. If the new content
+                is immutable (read-only) then you might consider setting the
+                'mutable' parameter to false. Doing so will avoid allocating
+                heap-space for the content until it is modified via one of
+                these methods.
+
+        ***********************************************************************/
+
+        UString setTo (UStringView other, uint start, uint len, bool mutable = true)
+        {
+                // the range is pinned against the *other* text, not this one
+                other.pinIndices (start, len);
+
+                wchar[] piece = other.content [start .. start + len];
+                return setTo (piece, mutable);
+        }
+
+        /***********************************************************************
+
+                Replace the character at the specified location.
+
+        ***********************************************************************/
+
+        final UString opIndexAssign (wchar chr, uint index)
+        in {
+                // only positions below len hold valid content; anything from
+                // len up to content.length is spare capacity
+                if (index >= len)
+                    exception ("index out of bounds");
+           }
+        body
+        {
+                // an aliased buffer is replaced by a private one before writing
+                if (! mutable)
+                      realloc ();
+                content [index] = chr;
+                return this;
+        }
+
+        /***********************************************************************
+
+                Remove a piece of this UString.
+
+        ***********************************************************************/
+
+        UString remove (uint start, uint length=uint.max)
+        {
+                // bring start/length within the valid content
+                pinIndices (start, length);
+                if (length)
+                    if (start >= len)
+                        truncate (start);
+                    else
+                       {
+                       // an aliased buffer is replaced by a private one before writing
+                       if (! mutable)
+                             realloc ();
+
+                       uint i = start + length;
+
+                       // Use pointer arithmetic rather than &content[i]: when the
+                       // tail is removed and the buffer has no spare capacity, i
+                       // equals content.length and an indexing expression would
+                       // fail the array bounds check even though zero bytes move.
+                       memmove (content.ptr + start, content.ptr + i, (len-i) * wchar.sizeof);
+                       len -= length;
+                       }
+                return this;
+        }
+
+        /***********************************************************************
+
+                Truncate the length of this UString.
+
+        ***********************************************************************/
+
+        UString truncate (uint length=0)
+        {
+                // a request beyond the current length leaves the string untouched
+                if (length > len)
+                    return this;
+
+                // the buffer itself is kept; only the valid length shrinks
+                len = length;
+                return this;
+        }
+
+        /***********************************************************************
+
+                Insert leading spaces in this UString
+
+        ***********************************************************************/
+
+        UString padLeading (uint count, wchar padChar = 0x0020)
+        {
+                // make room for 'count' additional code units
+                expand  (count);
+
+                // Shift the existing content up by 'count'. Nothing needs to
+                // move for an empty string, and pointer arithmetic is used in
+                // place of &content[count] because count can equal
+                // content.length there, which would fail the bounds check.
+                if (len)
+                    memmove (content.ptr + count, content.ptr, len * wchar.sizeof);
+
+                len += count;
+
+                // fill the gap that opened up at the front
+                return setTo (padChar, 0, count);
+        }
+
+        /***********************************************************************
+
+                Append some trailing spaces to this UString.
+
+        ***********************************************************************/
+
+        UString padTrailing (uint length, wchar padChar = 0x0020)
+        {
+                expand (length);
+
+                // padding starts where the current content ends
+                uint start = len;
+                len += length;
+                return setTo (padChar, start, length);
+        }
+
+        /***********************************************************************
+
+                Check for available space within the buffer, and expand
+                as necessary.
+
+        ***********************************************************************/
+
+        package final void expand (uint count)
+        {
+                // Callers invoke this immediately before writing 'count' code
+                // units at offset len. Besides the plain capacity check, an
+                // aliased (immutable) buffer must be swapped for a private one
+                // even when it appears to have room (e.g. after truncate()),
+                // otherwise the write would land in the caller's array.
+                if (! mutable || (len + count) > content.length)
+                     realloc (count);
+        }
+
+        /***********************************************************************
+
+                Allocate memory due to a change in the content. We handle
+                the distinction between mutable and immutable here: a
+                mutable buffer is resized, whereas an immutable (aliased)
+                one is replaced by a newly allocated buffer holding a copy
+                of the valid content, after which this UString is mutable.
+
+                The new size is the current buffer length plus 'count',
+                rounded up to a multiple of 64 code units.
+
+        ***********************************************************************/
+
+        private final void realloc (uint count = 0)
+        {
+                uint size = (content.length + count + 63) & ~63;
+
+                if (mutable)
+                    content.length = size;
+                else
+                   {
+                   mutable = true;
+                   wchar[] x = content;
+                   content = new wchar [size];
+
+                   // copy just the valid part: the old array can be longer
+                   // than len, and an array copy needs equal lengths on
+                   // both sides
+                   if (len)
+                       content[0..len] = x[0..len];
+                   }
+        }
+
+        /***********************************************************************
+
+                Internal method to support UString appending
+
+        ***********************************************************************/
+
+        private final UString opCat (wchar* chars, uint count)
+        {
+                // make sure 'count' more code units fit behind the content
+                expand (count);
+
+                uint end = len + count;
+                content [len .. end] = chars [0 .. count];
+                len = end;
+                return this;
+        }
+
+        /***********************************************************************
+
+                Internal method to support formatting into this UString.
+                This is used by many of the ICU wrappers to append content
+                into a UString.
+
+        ***********************************************************************/
+
+        // Callback signature: write up to 'len' code units at 'dst', report
+        // status through 'e', and return a length. format() uses that length
+        // both as the amount of space to reserve after a BufferOverflow and
+        // as the number of code units appended on success.
+        typedef uint delegate (wchar* dst, uint len, inout UErrorCode e) Formatter;
+
+        package final UString format (Formatter format, CString msg)
+        {
+                UErrorCode   e;
+                uint    length;
+
+                // retry until the callback stops reporting a buffer overflow
+                while (true)
+                      {
+                      e = e.OK;
+
+                      // Pointer arithmetic instead of &content[len]: when the
+                      // buffer is exactly full (including a new, empty
+                      // UString) len equals content.length and indexing would
+                      // fail the bounds check before the callback could
+                      // report the space it needs.
+                      length = format (content.ptr + len, content.length - len, e);
+                      if (e == e.BufferOverflow)
+                          expand (length);
+                      else
+                         break;
+                      }
+
+                // any other failure is raised with the caller's message
+                if (isError (e))
+                    exception (msg);
+
+                len += length;
+                return this;
+        }
+}
+
+
+/*******************************************************************************
+
+        Immutable (read-only) text -- use UString for mutable strings.
+
+*******************************************************************************/
+
+class UStringView : ICU, ITextOther
+{
+        // charAt() is the method-style name for opIndex
+        alias opIndex   charAt;
+
+        // the core of the UStringView and UString attributes. The name 'len'
+        // is used rather than the more obvious 'length' since there is
+        // a collision with the silly array[length] syntactic sugar ...
+        //
+        // 'len' counts the valid code units; 'content' is the backing
+        // array, which may be longer than 'len'
+        package uint    len;
+        package wchar[] content;
+
+        // this should probably be in UString only, but there seems to
+        // be a compiler bug where it doesn't get initialised correctly,
+        // and it's perhaps useful to have here for when a UString is
+        // passed as a UStringView argument.
+        //
+        // UString's methods reallocate before writing when this is false
+        private bool    mutable;
+
+        // toFolded() argument
+        public enum     CaseOption
+                        {
+                        Default  = 0,
+                        SpecialI = 1
+                        }
+
+        /***********************************************************************
+
+                Hidden constructor
+
+        ***********************************************************************/
+
+        private this ()
+        {
+                // intentionally empty: no field is assigned here
+        }
+
+        /***********************************************************************
+
+                Construct read-only wrapper around the given content
+
+        ***********************************************************************/
+
+        this (wchar[] content)
+        {
+                // the array is referenced, not copied; all of it counts as valid
+                this.len = content.length;
+                this.content = content;
+        }
+
+        /***********************************************************************
+
+                Support for writing via the Mango IO subsystem
+
+        ***********************************************************************/
+
+        version (Isolated){}
+        else
+        {
+                void write (IWriter w)
+                {
+                        // emit only the valid part of the buffer
+                        wchar[] valid = get;
+                        w.put (valid);
+                }
+        }
+
+        /***********************************************************************
+
+                Return the valid content from this UStringView
+
+        ***********************************************************************/
+
+        final package wchar[] get ()
+        {
+                // a slice of the backing array, not a copy
+                wchar[] valid = content [0 .. len];
+                return valid;
+        }
+
+        /***********************************************************************
+
+                Is this UStringView equal to another?
+
+        ***********************************************************************/
+
+        final override equals_t opEquals (Object o)
+        {
+                UStringView other = cast(UStringView) o;
+
+                // null, or not a UStringView at all: never equal
+                if (other is null)
+                    return 0;
+
+                // identical object, or same content per compare()
+                return (other is this || compare (other) == 0);
+        }
+
+        /***********************************************************************
+
+                Compare this UStringView to another.
+
+        ***********************************************************************/
+
+        final override int opCmp (Object o)
+        {
+                UStringView other = cast(UStringView) o;
+
+                // same object: equal without looking at the content
+                if (other is this)
+                    return 0;
+
+                // null, or not a UStringView: this one is reported as greater
+                if (other is null)
+                    return 1;
+
+                return compare (other);
+        }
+
+        /***********************************************************************
+
+                Hash this UStringView
+
+        ***********************************************************************/
+
+        final override uint toHash ()
+        {
+                // hash only the valid part of the buffer, using the
+                // built-in hashing for wchar[]
+                wchar[] valid = content [0 .. len];
+                return typeid(wchar[]).getHash (&valid);
+        }
+
+        /***********************************************************************
+
+                Clone this UStringView into a UString
+
+        ***********************************************************************/
+
+        final UString copy ()
+        {
+                // Clone the valid content only. The backing array may be
+                // longer than len (UString grows it in blocks), so passing
+                // 'content' itself would give the copy a wrong length and
+                // trailing unused code units.
+                return new UString (get);
+        }
+
+        /***********************************************************************
+
+                Clone a section of this UStringView into a UString
+
+        ***********************************************************************/
+
+        final UString extract (uint start, uint len=uint.max)
+        {
+                // note: the 'len' parameter hides the member of the same name
+                pinIndices (start, len);
+
+                wchar[] piece = content [start .. start + len];
+                return new UString (piece);
+        }
+
+        /***********************************************************************
+
+                Count unicode code points in the length UChar code units of
+                the string. A code point may occupy either one or two UChar
+                code units. Counting code points involves reading all code
+                units.
+
+        ***********************************************************************/
+
+        final uint codePoints (uint start=0, uint length=uint.max)
+        {
+                // bring start/length within the valid content
+                pinIndices (start, length);
+
+                // Pointer arithmetic instead of &content[start]: start can
+                // equal content.length (an empty string, or a start pinned
+                // to the very end), where indexing would fail the bounds
+                // check even though no code units are read.
+                return u_countChar32 (content.ptr + start, length);
+        }
+
+        /***********************************************************************
+
+                Return an indication whether or not there are surrogate pairs
+                within the string.
+
+        ***********************************************************************/
+
+        final bool hasSurrogates (uint start=0, uint length=uint.max)
+        {
+                // pin first so 'length' is the real number of code units examined
+                pinIndices (start, length);
+
+                // a code point count that differs from the code unit count
+                // is reported as "has surrogates"
+                uint points = codePoints (start, length);
+                return points != length;
+        }
+
+        /***********************************************************************
+
+                Return the character at the specified position.
+
+        ***********************************************************************/
+
+        final wchar opIndex (uint index)
+        in {
+                // contract: the index must address valid content
+                if (len <= index)
+                    exception ("index of out bounds");
+           }
+        body
+        {
+                wchar unit = content [index];
+                return unit;
+        }
+
+        /***********************************************************************
+
+                Return the length of the valid content
+
+        ***********************************************************************/
+
+        final uint length ()
+        {
+                // number of valid code units, as tracked by the 'len' member
+                uint valid = len;
+                return valid;
+        }
+
+        /***********************************************************************
+
+                The comparison can be done in code unit order or in code
+                point order. They differ only in UTF-16 when comparing
+                supplementary code points (U+10000..U+10ffff) to BMP code
+                points near the end of the BMP (i.e., U+e000..U+ffff).
+
+                In code unit order, high BMP code points sort after
+                supplementary code points because they are stored as
+                pairs of surrogates which are at U+d800..U+dfff.
+
+        ***********************************************************************/
+
+        final int compare (UStringView other, bool codePointOrder=false)
+        {
+                // delegate to the wchar[] overload using the other view's
+                // get() result
+                wchar[] rhs = other.get;
+                return compare (rhs, codePointOrder);
+        }
+
+        /***********************************************************************
+
+                The comparison can be done in code unit order or in code
+                point order. They differ only in UTF-16 when comparing
+                supplementary code points (U+10000..U+10ffff) to BMP code
+                points near the end of the BMP (i.e., U+e000..U+ffff).
+
+                In code unit order, high BMP code points sort after
+                supplementary code points because they are stored as
+                pairs of surrogates which are at U+d800..U+dfff.
+
+        ***********************************************************************/
+
+        final int compare (wchar[] other, bool codePointOrder=false)
+        {
+                // left operand: the first 'len' units of our content;
+                // right operand: the whole of the supplied array
+                wchar* lhs = content.ptr;
+                wchar* rhs = other.ptr;
+
+                return u_strCompare (lhs, len, rhs, other.length, codePointOrder);
+        }
+
+        /***********************************************************************
+
+                The comparison can be done in UTF-16 code unit order or
+                in code point order. They differ only when comparing
+                supplementary code points (U+10000..U+10ffff) to BMP code
+                points near the end of the BMP (i.e., U+e000..U+ffff).
+
+                In code unit order, high BMP code points sort after
+                supplementary code points because they are stored as
+                pairs of surrogates which are at U+d800..U+dfff.
+
+        ***********************************************************************/
+
+        final int compareFolded (UStringView other, CaseOption option = CaseOption.Default)
+        {
+                // Use other.get, exactly as compare(), startsWith(), endsWith(),
+                // indexOf() and lastIndexOf() do for their UStringView
+                // overloads. Passing other.content handed over the entire
+                // backing array, which is not limited by the other view's
+                // 'len', while our own side is already taken from get.
+                // NOTE(review): presumes get yields only the valid content
+                // (content[0..len]) - confirm against its definition.
+                return compareFolded (other.get, option);
+        }
+
+        /***********************************************************************
+
+                The comparison can be done in UTF-16 code unit order or
+                in code point order. They differ only when comparing
+                supplementary code points (U+10000..U+10ffff) to BMP code
+                points near the end of the BMP (i.e., U+e000..U+ffff).
+
+                In code unit order, high BMP code points sort after
+                supplementary code points because they are stored as
+                pairs of surrogates which are at U+d800..U+dfff.
+
+        ***********************************************************************/
+
+        final int compareFolded (wchar[] other, CaseOption option = CaseOption.Default)
+        {
+                // our side of the comparison comes from get(); the work is
+                // done by the private two-array helper
+                wchar[] mine = get;
+                return compareFolded (mine, other, option);
+        }
+
+        /***********************************************************************
+
+                Does this UStringView start with specified string?
+
+        ***********************************************************************/
+
+        final bool startsWith (UStringView other)
+        {
+                // delegate to the wchar[] overload
+                wchar[] prefix = other.get;
+                return startsWith (prefix);
+        }
+
+        /***********************************************************************
+
+                Does this UStringView start with specified string?
+
+        ***********************************************************************/
+
+        final bool startsWith (wchar[] chars)
+        {
+                // a prefix longer than the valid content can never match
+                if (len < chars.length)
+                    return false;
+
+                // note: the leading slice is compared via compareFolded(),
+                // i.e. through u_strCaseCompare with the default CaseOption,
+                // rather than unit-for-unit
+                wchar[] head = content[0..chars.length];
+                return compareFolded (head, chars) == 0;
+        }
+
+        /***********************************************************************
+
+                Does this UStringView end with specified string?
+
+        ***********************************************************************/
+
+        final bool endsWith (UStringView other)
+        {
+                // delegate to the wchar[] overload
+                wchar[] suffix = other.get;
+                return endsWith (suffix);
+        }
+
+        /***********************************************************************
+
+                Does this UStringView end with specified string?
+
+        ***********************************************************************/
+
+        final bool endsWith (wchar[] chars)
+        {
+                // a suffix longer than the valid content can never match
+                if (len < chars.length)
+                    return false;
+
+                // note: the trailing slice is compared via compareFolded(),
+                // i.e. through u_strCaseCompare with the default CaseOption,
+                // rather than unit-for-unit
+                wchar[] tail = content[len-chars.length..len];
+                return compareFolded (tail, chars) == 0;
+        }
+
+        /***********************************************************************
+
+                Find the first occurrence of a BMP code point in a string.
+                A surrogate code point is found only if its match in the
+                text is not part of a surrogate pair.
+
+        ***********************************************************************/
+
+        final uint indexOf (wchar c, uint start=0)
+        {
+                // clamp the starting offset to at most len
+                pinIndex (start);
+
+                // pinIndex() permits start == len, where &content[start] would
+                // index one past the last valid unit and can trip the array
+                // bounds check; derive the pointer arithmetically instead. The
+                // count passed along is then zero.
+                wchar* s = u_memchr (content.ptr + start, c, len-start);
+
+                // translate the hit into an offset from the start of content;
+                // uint.max signals "not found"
+                if (s)
+                    return s - content.ptr;
+                return uint.max;
+        }
+
+        /***********************************************************************
+
+                Find the first occurrence of a substring in a string.
+
+                The substring is found at code point boundaries. That means
+                that if the substring begins with a trail surrogate or ends
+                with a lead surrogate, then it is found only if these
+                surrogates stand alone in the text. Otherwise, the substring
+                edge units would be matched against halves of surrogate pairs.
+
+        ***********************************************************************/
+
+        final uint indexOf (UStringView other, uint start=0)
+        {
+                // delegate to the wchar[] overload
+                wchar[] pattern = other.get;
+                return indexOf (pattern, start);
+        }
+
+        /***********************************************************************
+
+                Find the first occurrence of a substring in a string.
+
+                The substring is found at code point boundaries. That means
+                that if the substring begins with a trail surrogate or ends
+                with a lead surrogate, then it is found only if these
+                surrogates stand alone in the text. Otherwise, the substring
+                edge units would be matched against halves of surrogate pairs.
+
+        ***********************************************************************/
+
+        final uint indexOf (wchar[] chars, uint start=0)
+        {
+                // clamp the starting offset to at most len
+                pinIndex (start);
+
+                // pinIndex() permits start == len, where &content[start] would
+                // index one past the last valid unit and can trip the array
+                // bounds check; derive the pointer arithmetically instead. The
+                // searched length passed along is then zero.
+                wchar* s = u_strFindFirst (content.ptr + start, len-start, chars.ptr, chars.length);
+
+                // translate the hit into an offset from the start of content;
+                // uint.max signals "not found"
+                if (s)
+                    return s - content.ptr;
+                return uint.max;
+        }
+
+        /***********************************************************************
+
+                Find the last occurrence of a BMP code point in a string.
+                A surrogate code point is found only if its match in the
+                text is not part of a surrogate pair.
+
+        ***********************************************************************/
+
+        final uint lastIndexOf (wchar c, uint start=uint.max)
+        {
+                // 'start' is the number of leading units to search; the
+                // default of uint.max is clamped down to len
+                pinIndex (start);
+
+                wchar* hit = u_memrchr (content.ptr, c, start);
+
+                // uint.max signals "not found"
+                if (hit is null)
+                    return uint.max;
+                return hit - content.ptr;
+        }
+
+        /***********************************************************************
+
+                Find the last occurrence of a substring, given as another
+                UStringView, in a string. The search itself is carried out
+                by the wchar[] overload of lastIndexOf.
+
+        ***********************************************************************/
+
+        final uint lastIndexOf (UStringView other, uint start=uint.max)
+        {
+                // delegate to the wchar[] overload
+                wchar[] pattern = other.get;
+                return lastIndexOf (pattern, start);
+        }
+
+        /***********************************************************************
+
+                Find the last occurrence of a substring in a string.
+
+                The substring is found at code point boundaries. That means
+                that if the substring begins with a trail surrogate or ends
+                with a lead surrogate, then it is found only if these
+                surrogates stand alone in the text. Otherwise, the substring
+                edge units would be matched against halves of surrogate pairs.
+
+        ***********************************************************************/
+
+        final uint lastIndexOf (wchar[] chars, uint start=uint.max)
+        {
+                // 'start' is the number of leading units to search; the
+                // default of uint.max is clamped down to len
+                pinIndex (start);
+
+                wchar* hit = u_strFindLast (content.ptr, start, chars.ptr, chars.length);
+
+                // uint.max signals "not found"
+                if (hit is null)
+                    return uint.max;
+                return hit - content.ptr;
+        }
+
+        /***********************************************************************
+
+                Lowercase the characters into a separate UString.
+
+                Casing is locale-dependent and context-sensitive. The
+                result may be longer or shorter than the original.
+
+                Note that the return value refers to the provided destination
+                UString.
+
+        ***********************************************************************/
+
+        final UString toLower (UString dst)
+        {
+               // convenience form: lower-case using ULocale.Default
+               UString lowered = toLower (dst, ULocale.Default);
+               return lowered;
+        }
+
+        /***********************************************************************
+
+                Lowercase the characters into a separate UString.
+
+                Casing is locale-dependent and context-sensitive. The
+                result may be longer or shorter than the original.
+
+                Note that the return value refers to the provided destination
+                UString.
+
+        ***********************************************************************/
+
+        final UString toLower (UString dst, inout ULocale locale)
+        {
+                // callback handed to dst.format(): lower-cases the first
+                // 'len' units of our content into the supplied buffer
+                uint convert (wchar* target, uint capacity, inout UErrorCode status)
+                {
+                        return u_strToLower (target, capacity, content.ptr, len, ICU.toString(locale.name), status);
+                }
+
+                // size the destination from our length plus 32 units of
+                // slack before formatting into it
+                dst.expand (len + 32);
+                return dst.format (&convert, "toLower() failed");
+        }
+
+        /***********************************************************************
+
+                Uppercase the characters into a separate UString.
+
+                Casing is locale-dependent and context-sensitive. The
+                result may be longer or shorter than the original.
+
+                Note that the return value refers to the provided destination
+                UString.
+
+        ***********************************************************************/
+
+        final UString toUpper (UString dst)
+        {
+               // convenience form: upper-case using ULocale.Default
+               UString raised = toUpper (dst, ULocale.Default);
+               return raised;
+        }
+
+        /***********************************************************************
+
+                Uppercase the characters into a separate UString.
+
+                Casing is locale-dependent and context-sensitive. The
+                result may be longer or shorter than the original.
+
+                Note that the return value refers to the provided destination
+                UString.
+
+        ***********************************************************************/
+
+        final UString toUpper (UString dst, inout ULocale locale)
+        {
+                // callback handed to dst.format(): upper-cases the first
+                // 'len' units of our content into the supplied buffer
+                uint convert (wchar* target, uint capacity, inout UErrorCode status)
+                {
+                        return u_strToUpper (target, capacity, content.ptr, len, ICU.toString(locale.name), status);
+                }
+
+                // size the destination from our length plus 32 units of
+                // slack before formatting into it
+                dst.expand (len + 32);
+                return dst.format (&convert, "toUpper() failed");
+        }
+
+        /***********************************************************************
+
+                Case-fold the characters into a separate UString.
+
+                Case-folding is locale-independent and not context-sensitive,
+                but there is an option for whether to include or exclude
+                mappings for dotted I and dotless i that are marked with 'I'
+                in CaseFolding.txt. The result may be longer or shorter than
+                the original.
+
+                Note that the return value refers to the provided destination
+                UString.
+
+        ***********************************************************************/
+
+        final UString toFolded (UString dst, CaseOption option = CaseOption.Default)
+        {
+                // callback handed to dst.format(): case-folds the first
+                // 'len' units of our content into the supplied buffer
+                uint convert (wchar* target, uint capacity, inout UErrorCode status)
+                {
+                        return u_strFoldCase (target, capacity, content.ptr, len, option, status);
+                }
+
+                // size the destination from our length plus 32 units of
+                // slack before formatting into it
+                dst.expand (len + 32);
+                return dst.format (&convert, "toFolded() failed");
+        }
+
+        /***********************************************************************
+
+                Converts a sequence of wchar (UTF-16) to UTF-8 bytes. If
+                the output array is not provided, an array of appropriate
+                size will be allocated and returned. Where the output is
+                provided, it must be large enough to hold potentially four
+                bytes per character for surrogate-pairs or three bytes per
+                character for BMP only. Consider using UConverter where
+                streaming conversions are required.
+
+                Returns an array slice representing the valid UTF8 content.
+
+        ***********************************************************************/
+
+        final char[] toUtf8 (char[] dst = null)
+        {
+                UErrorCode   status;
+                uint    written;
+
+                // no caller-supplied buffer: allocate four bytes for every
+                // UTF-16 code unit of valid content
+                if (dst.ptr is null)
+                    dst = new char[len * 4];
+
+                // 'written' receives the number of UTF-8 bytes produced
+                u_strToUTF8 (dst.ptr, dst.length, &written, content.ptr, len, status);
+                testError (status, "failed to convert to UTF8");
+
+                // hand back only the portion that was filled in
+                return dst [0..written];
+        }
+
+        /***********************************************************************
+
+                Remove leading and trailing whitespace from this UStringView.
+                Note that we slice the content to remove leading space.
+
+        ***********************************************************************/
+
+        UStringView trim ()
+        {
+                wchar   c;
+                uint    i = len;
+
+                // cut off trailing white space by shrinking the valid length
+                while (i && ((c = charAt(i-1)) == 0x20 || UChar.isWhiteSpace (c)))
+                       --i;
+                len = i;
+
+                // now remove leading whitespace: count it, then slice it away
+                for (i=0; i < len && ((c = charAt(i)) == 0x20 || UChar.isWhiteSpace (c)); ++i) {}
+                if (i)
+                   {
+                   len -= i;
+
+                   // Drop the i leading units only. The previous slice,
+                   // content[i..$-i], also removed i units from the tail of
+                   // the array; when the array had no spare room beyond the
+                   // valid content (e.g. " abc") that left the array shorter
+                   // than len.
+                   content = content[i..$];
+                   }
+
+                // the view is modified in place and returned for chaining
+                return this;
+        }
+
+        /***********************************************************************
+
+                Unescape a string of characters and write the resulting
+                Unicode characters to the destination buffer.  The following
+                escape sequences are recognized:
+
+                  \\uhhhh       4 hex digits; h in [0-9A-Fa-f]
+                  \\Uhhhhhhhh   8 hex digits
+                  \\xhh         1-2 hex digits
+                  \\x{h...}     1-8 hex digits
+                  \\ooo         1-3 octal digits; o in [0-7]
+                  \\cX          control-X; X is masked with 0x1F
+
+                as well as the standard ANSI C escapes:
+
+                  \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
+                  \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
+                  \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
+
+                Anything else following a backslash is generically escaped.
+                For example, "[a\\-z]" returns "[a-z]".
+
+                If an escape sequence is ill-formed, this method returns an
+                empty string.  An example of an ill-formed sequence is "\\u"
+                followed by fewer than 4 hex digits.
+
+         ***********************************************************************/
+
+        final UString unEscape ()
+        {
+                UString result = new UString (len);
+                uint    pos = 0;
+
+                while (pos < len)
+                      {
+                      dchar unit = charAt (pos++);
+
+                      // 0x005C is the backslash that introduces an escape
+                      if (unit == 0x005C)
+                         {
+                         // decode the escape; u_unescapeAt() reads further
+                         // units through the _charAt callback and advances
+                         // 'pos' via the pointer it is given
+                         unit = u_unescapeAt (&_charAt, &pos, len, cast(void*) this);
+
+                         // 0xFFFFFFFF flags an invalid escape sequence:
+                         // discard what was collected and stop, so that an
+                         // empty string is returned
+                         if (unit == 0xFFFFFFFF)
+                            {
+                            result.truncate ();
+                            break;
+                            }
+                         }
+                      result.append (unit);
+                      }
+                return result;
+        }
+
+        /***********************************************************************
+
+                Is this code point a surrogate (U+d800..U+dfff)?
+
+        ***********************************************************************/
+
+        final static bool isSurrogate (wchar c)
+        {
+                // range form of the mask test (c & 0xfffff800) == 0xd800
+                return c >= 0xd800 && c <= 0xdfff;
+        }
+
+        /***********************************************************************
+
+                Is this code unit a lead surrogate (U+d800..U+dbff)?
+
+        ***********************************************************************/
+
+        final static bool isLeading (wchar c)
+        {
+                // range form of the mask test (c & 0xfffffc00) == 0xd800
+                return c >= 0xd800 && c <= 0xdbff;
+        }
+
+        /***********************************************************************
+
+                Is this code unit a trail surrogate (U+dc00..U+dfff)?
+
+        ***********************************************************************/
+
+        final static bool isTrailing (wchar c)
+        {
+                // range form of the mask test (c & 0xfffffc00) == 0xdc00
+                return c >= 0xdc00 && c <= 0xdfff;
+        }
+
+        /***********************************************************************
+
+                Adjust a random-access offset to a code point boundary
+                at the start of a code point. If the offset points to
+                the trail surrogate of a surrogate pair, then the offset
+                is decremented. Otherwise, it is not modified.
+
+        ***********************************************************************/
+
+        final uint getCharStart (uint i)
+        in {
+                // contract: the offset must address valid content
+                if (len <= i)
+                    exception ("index of out bounds");
+           }
+        body
+        {
+                // checks run in this order: the unit at i, then i != 0, then
+                // the unit before it. Only the trail half of a complete
+                // surrogate pair moves the offset back.
+                if (isTrailing (content[i]))
+                    if (i && isLeading (content[i-1]))
+                        return i - 1;
+                return i;
+        }
+
+        /***********************************************************************
+
+                Adjust a random-access offset to a code point boundary
+                after a code point. If the offset is behind the lead
+                surrogate of a surrogate pair, then the offset is
+                incremented. Otherwise, it is not modified.
+
+        ***********************************************************************/
+
+        final uint getCharLimit (uint i)
+        in {
+                // contract: the offset must address valid content
+                if (len <= i)
+                    exception ("index of out bounds");
+           }
+        body
+        {
+                // offset zero has no preceding unit, so it cannot sit in the
+                // middle of a surrogate pair
+                if (i == 0)
+                    return i;
+
+                // between a lead and its trail surrogate: step past the trail
+                if (isLeading (content[i-1]) && isTrailing (content[i]))
+                    return i + 1;
+                return i;
+        }
+
+        /***********************************************************************
+
+                Callback for C unescapeAt() function
+
+        ***********************************************************************/
+
+        extern (C)
+        {
+                // signature of the callback through which u_unescapeAt()
+                // fetches the code unit at a given offset
+                typedef wchar function (uint offset, void* context) CharAt;
+
+                // Trampoline handed to u_unescapeAt() by unEscape(). The
+                // context is the 'this' reference that unEscape() passed as
+                // a void*, which is a UStringView and not necessarily a
+                // UString, so it is cast back to UStringView (the type that
+                // unEscape() itself calls charAt() on) rather than to the
+                // UString subclass.
+                private static wchar _charAt (uint offset, void* context)
+                {
+                        return (cast(UStringView) context).charAt (offset);
+                }
+        }
+
+        /***********************************************************************
+
+                Pin the given index to a valid position.
+
+        ***********************************************************************/
+
+        final private void pinIndex (inout uint x)
+        {
+                // clamp in place: anything beyond len becomes len, so len
+                // itself (one past the last valid unit) is an allowed result
+                x = (x > len) ? len : x;
+        }
+
+        /***********************************************************************
+
+                Pin the given index and length to a valid position.
+
+        ***********************************************************************/
+
+        final private void pinIndices (inout uint start, inout uint length)
+        {
+                // clamp the start offset to at most len ...
+                if (len < start)
+                    start = len;
+
+                // ... and the length to whatever remains after it
+                uint remaining = len - start;
+                if (remaining < length)
+                    length = remaining;
+        }
+
+        /***********************************************************************
+
+                Helper for comparison methods
+
+        ***********************************************************************/
+
+        final private int compareFolded (wchar[] s1, wchar[] s2, CaseOption option = CaseOption.Default)
+        {
+                UErrorCode status;
+
+                // both operands are passed with their full array lengths
+                int order = u_strCaseCompare (s1.ptr, s1.length, s2.ptr, s2.length, option, status);
+
+                // check the ICU status before trusting the result
+                testError (status, "compareFolded failed");
+                return order;
+        }
+
+
+        /***********************************************************************
+
+                Bind the ICU functions from a shared library. This is
+                complicated by the issues regarding D and DLLs on the
+                Windows platform
+
+        ***********************************************************************/
+
+        // value returned by FunctionLoader.bind() in the static constructor;
+        // handed back to FunctionLoader.unbind() in the static destructor
+        private static void* library;
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        // C-linkage function pointers, one per ICU entry point used here.
+        // Each is filled in by name through the 'targets' table, which the
+        // static constructor passes to FunctionLoader.bind().
+        private static extern (C)
+        {
+                // searching
+                wchar* function (wchar*, uint, wchar*, uint) u_strFindFirst;
+                wchar* function (wchar*, uint, wchar*, uint) u_strFindLast;
+                wchar* function (wchar*, wchar, uint) u_memchr;
+                wchar* function (wchar*, wchar, uint) u_memrchr;
+                // comparison
+                int    function (wchar*, uint, wchar*, uint, bool) u_strCompare;
+                int    function (wchar*, uint, wchar*, uint, uint, inout UErrorCode) u_strCaseCompare;
+                // unescaping (takes the CharAt callback) and code point counting
+                dchar  function (CharAt, uint*, uint, void*) u_unescapeAt;
+                uint   function (wchar*, uint) u_countChar32;
+                // case mapping; the char* argument receives the locale name
+                uint   function (wchar*, uint, wchar*, uint, char*, inout UErrorCode) u_strToUpper;
+                uint   function (wchar*, uint, wchar*, uint, char*, inout UErrorCode) u_strToLower;
+                uint   function (wchar*, uint, wchar*, uint, uint, inout UErrorCode) u_strFoldCase;
+                // UTF-8 conversion (u_strFromUTF8 is not called in this part of the file)
+                wchar* function (wchar*, uint, uint*, char*, uint, inout UErrorCode) u_strFromUTF8;
+                char*  function (char*, uint, uint*, wchar*, uint, inout UErrorCode) u_strToUTF8;
+        }
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        // Binding table given to FunctionLoader.bind(): each entry pairs the
+        // address of one of the function pointers declared above with the
+        // symbol name to look up for it.
+        static  FunctionLoader.Bind[] targets =
+                [
+                {cast(void**) &u_strFindFirst,      "u_strFindFirst"},
+                {cast(void**) &u_strFindLast,       "u_strFindLast"},
+                {cast(void**) &u_memchr,            "u_memchr"},
+                {cast(void**) &u_memrchr,           "u_memrchr"},
+                {cast(void**) &u_strCompare,        "u_strCompare"},
+                {cast(void**) &u_strCaseCompare,    "u_strCaseCompare"},
+                {cast(void**) &u_unescapeAt,        "u_unescapeAt"},
+                {cast(void**) &u_countChar32,       "u_countChar32"},
+                {cast(void**) &u_strToUpper,        "u_strToUpper"},
+                {cast(void**) &u_strToLower,        "u_strToLower"},
+                {cast(void**) &u_strFoldCase,       "u_strFoldCase"},
+                {cast(void**) &u_strFromUTF8,       "u_strFromUTF8"},
+                {cast(void**) &u_strToUTF8,         "u_strToUTF8"},
+                ];
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        // Static constructor: binds every entry of 'targets' against the
+        // library identified by 'icuuc' and keeps the value returned by
+        // FunctionLoader.bind() in 'library' for the static destructor.
+        static this ()
+        {
+                library = FunctionLoader.bind (icuuc, targets);
+                // disabled self-test hook; the test() routine is commented out
+                //test ();
+        }
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        // Static destructor: hands the value obtained from
+        // FunctionLoader.bind() in the static constructor back to
+        // FunctionLoader.unbind().
+        static ~this ()
+        {
+                FunctionLoader.unbind (library);
+        }
+
+        /***********************************************************************
+
+        ***********************************************************************/
+
+        //private static void test()
+        //{
+        //        UString s = new UString (r"aaaqw \uabcd eaaa");
+        //        CString16 x = "dssfsdff";
+        //        s ~ x ~ x;
+        //        wchar c = s[3];
+        //        s[3] = 'Q';
+        //        int y = s.indexOf ("qwe");
+        //        s.unEscape ();
+        //        s.toUpper (new UString);
+        //        s.padLeading(2).padTrailing(2).trim();
+        //}
+}
author	Frank Benoit <benoit@tionex.de>
date	Sun, 19 Apr 2009 13:49:38 +0200
parents	base/src/java/mangoicu/UString.d@1bf55a6eb092
children	536e43f63c81