Mercurial > projects > ldc

diff tango/tango/text/convert/Utf.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!! Initial commit after moving to Tango instead of Phobos. Lots of bugfixes... This build is not suitable for most things.
author: lindquist
date: Fri, 11 Jan 2008 17:57:40 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tango/tango/text/convert/Utf.d	Fri Jan 11 17:57:40 2008 +0100
@@ -0,0 +1,532 @@
+/*******************************************************************************
+
+        copyright:      Copyright (c) 2004 Kris Bell. All rights reserved
+
+        license:        BSD style: $(LICENSE)
+
+        version:        Initial release: Oct 2004
+
+        authors:        Kris
+
+        Fast Unicode transcoders. These are particularly sensitive to
+        minor changes on 32bit x86 devices, because the register set of
+        those devices is so small. Beware of subtle changes which might
+        extend the execution-period by as much as 200%. Because of this,
+        three of the six transcoders might read past the end of input by
+        one, two, or three bytes before arresting themselves. Note that
+        support for streaming adds a 15% overhead to the dchar => char
+        conversion, but has little effect on the others.
+
+        These routines were tuned on an Intel P4; other devices may work
+        more efficiently with a slightly different approach, though this
+        is likely to be reasonably optimal on AMD x86 CPUs also. These
+        algorithms would benefit significantly from those extra AMD64
+        registers. On a 3GHz P4, the dchar/char conversions take around
+        2500ns to process an array of 1000 ASCII elements. Invoking the
+        memory manager doubles that period, and quadruples the time for
+        arrays of 100 elements. Memory allocation can slow down notably
+        in a multi-threaded environment, so avoid that where possible.
+
+        Surrogate-pairs are dealt with in a non-optimal fashion when
+        transcoding between utf16 and utf8. Such cases are considered
+        to be boundary-conditions for this module.
+
+        There are three common cases where the input may be incomplete,
+        including each 'widening' case of utf8 => utf16, utf8 => utf32,
+        and utf16 => utf32. An edge-case is utf16 => utf8, if surrogate
+        pairs are present. Such cases will throw an exception, unless
+        streaming-mode is enabled ~ in the latter mode, an additional
+        integer is returned indicating how many elements of the input
+        have been consumed. In all cases, a correct slice of the output
+        is returned.
+
+        For details on Unicode processing see:
+        $(UL $(LINK http://www.utf-8.com/))
+        $(UL $(LINK http://www.hackcraft.net/xmlUnicode/))
+        $(UL $(LINK http://www.azillionmonkeys.com/qed/unicode.html/))
+        $(UL $(LINK http://icu.sourceforge.net/docs/papers/forms_of_unicode/))
+
+*******************************************************************************/
+
+module tango.text.convert.Utf;
+
+public extern (C) void onUnicodeError (char[] msg, size_t idx = 0);
+
+
+/*******************************************************************************
+
+        Encode Utf8 up to a maximum of 4 bytes long (five & six byte
+        variations are not supported).
+
+        If the output is provided off the stack, it should be large
+        enough to encompass the entire transcoding; failing to do
+        so will cause the output to be moved onto the heap instead.
+
+        Returns a slice of the output buffer, corresponding to the
+        converted characters. For optimum performance, the returned
+        buffer should be specified as 'output' on subsequent calls.
+        For example:
+
+        ---
+        char[] output;
+
+        char[] result = toString (input, output);
+
+        // reset output after a realloc
+        if (result.length > output.length)
+            output = result;
+        ---
+
+*******************************************************************************/
+
+char[] toString (wchar[] input, char[] output=null, uint* ate=null)
+{
+        if (ate)
+            *ate = input.length;
+        else
+           {
+           // potentially reallocate output
+           int estimate = input.length * 2 + 3;
+           if (output.length < estimate)
+               output.length = estimate;
+           }
+
+        char* pOut = output.ptr;
+        char* pMax = pOut + output.length - 3;
+
+        foreach (int eaten, wchar b; input)
+                {
+                // about to overflow the output?
+                if (pOut > pMax)
+                   {
+                   // if streaming, just return the unused input
+                   if (ate)
+                      {
+                      *ate = eaten;
+                      break;
+                      }
+
+                   // reallocate the output buffer
+                   int len = pOut - output.ptr;
+                   output.length = len + len / 2;
+                   pOut = output.ptr + len;
+                   pMax = output.ptr + output.length - 3;
+                   }
+
+                if (b < 0x80)
+                    *pOut++ = b;
+                else
+                   if (b < 0x0800)
+                      {
+                      pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
+                      pOut[1] = 0x80 | (b & 0x3f);
+                      pOut += 2;
+                      }
+                   else
+                      if (b < 0xd800 || b > 0xdfff)
+                         {
+                         pOut[0] = 0xe0 | ((b >> 12) & 0x3f);
+                         pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
+                         pOut[2] = 0x80 | (b & 0x3f);
+                         pOut += 3;
+                         }
+                      else
+                         // deal with surrogate-pairs
+                         return toString (toString32(input, null, ate), output);
+                }
+
+        // return the produced output
+        return output [0..(pOut - output.ptr)];
+}
+
+/*******************************************************************************
+
+        Decode Utf8 produced by the above toString() method.
+
+        If the output is provided off the stack, it should be large
+        enough to encompass the entire transcoding; failing to do
+        so will cause the output to be moved onto the heap instead.
+
+        Returns a slice of the output buffer, corresponding to the
+        converted characters. For optimum performance, the returned
+        buffer should be specified as 'output' on subsequent calls.
+
+*******************************************************************************/
+
+wchar[] toString16 (char[] input, wchar[] output=null, uint* ate=null)
+{
+        int     produced;
+        char*   pIn = input.ptr;
+        char*   pMax = pIn + input.length;
+        char*   pValid;
+
+        if (ate is null)
+            if (input.length > output.length)
+                output.length = input.length;
+
+        if (input.length)
+        foreach (inout wchar d; output)
+                {
+                pValid = pIn;
+                wchar b = cast(wchar) *pIn;
+
+                if (b & 0x80)
+                    if (b < 0xe0)
+                       {
+                       b &= 0x1f;
+                       b = (b << 6) | (*++pIn & 0x3f);
+                       }
+                    else
+                       if (b < 0xf0)
+                          {
+                          b &= 0x0f;
+                          b = (b << 6) | (pIn[1] & 0x3f);
+                          b = (b << 6) | (pIn[2] & 0x3f);
+                          pIn += 2;
+                          }
+                       else
+                          // deal with surrogate-pairs
+                          return toString16 (toString32(input, null, ate), output);
+
+                d = b;
+                ++produced;
+
+                // did we read past the end of the input?
+                if (++pIn >= pMax)
+                    if (pIn > pMax)
+                       {
+                       // yep ~ return tail or throw error?
+                       if (ate)
+                          {
+                          pIn = pValid;
+                          --produced;
+                          break;
+                          }
+                       onUnicodeError ("Unicode.toString16 : incomplete utf8 input", pIn - input.ptr);
+                       }
+                    else
+                       break;
+                }
+
+        // do we still have some input left?
+        if (ate)
+            *ate = pIn - input.ptr;
+        else
+           if (pIn < pMax)
+               // this should never happen!
+               onUnicodeError ("Unicode.toString16 : utf8 overflow", pIn - input.ptr);
+
+        // return the produced output
+        return output [0..produced];
+}
+
+
+/*******************************************************************************
+
+        Encode Utf8 up to a maximum of 4 bytes long (five & six
+        byte variations are not supported). Throws an exception
+        where the input dchar is greater than 0x10ffff.
+
+        If the output is provided off the stack, it should be large
+        enough to encompass the entire transcoding; failing to do
+        so will cause the output to be moved onto the heap instead.
+
+        Returns a slice of the output buffer, corresponding to the
+        converted characters. For optimum performance, the returned
+        buffer should be specified as 'output' on subsequent calls.
+
+*******************************************************************************/
+
+char[] toString (dchar[] input, char[] output=null, uint* ate=null)
+{
+        if (ate)
+            *ate = input.length;
+        else
+           {
+           // potentially reallocate output
+           int estimate = input.length * 2 + 4;
+           if (output.length < estimate)
+               output.length = estimate;
+           }
+
+        char* pOut = output.ptr;
+        char* pMax = pOut + output.length - 4;
+
+        foreach (int eaten, dchar b; input)
+                {
+                // about to overflow the output?
+                if (pOut > pMax)
+                   {
+                   // if streaming, just return the unused input
+                   if (ate)
+                      {
+                      *ate = eaten;
+                      break;
+                      }
+
+                   // reallocate the output buffer
+                   int len = pOut - output.ptr;
+                   output.length = len + len / 2;
+                   pOut = output.ptr + len;
+                   pMax = output.ptr + output.length - 4;
+                   }
+
+                if (b < 0x80)
+                    *pOut++ = b;
+                else
+                   if (b < 0x0800)
+                      {
+                      pOut[0] = 0xc0 | ((b >> 6) & 0x3f);
+                      pOut[1] = 0x80 | (b & 0x3f);
+                      pOut += 2;
+                      }
+                   else
+                      if (b < 0x10000)
+                         {
+                         pOut[0] = 0xe0 | ((b >> 12) & 0x3f);
+                         pOut[1] = 0x80 | ((b >> 6)  & 0x3f);
+                         pOut[2] = 0x80 | (b & 0x3f);
+                         pOut += 3;
+                         }
+                      else
+                         if (b < 0x110000)
+                            {
+                            pOut[0] = 0xf0 | ((b >> 18) & 0x3f);
+                            pOut[1] = 0x80 | ((b >> 12) & 0x3f);
+                            pOut[2] = 0x80 | ((b >> 6)  & 0x3f);
+                            pOut[3] = 0x80 | (b & 0x3f);
+                            pOut += 4;
+                            }
+                         else
+                            onUnicodeError ("Unicode.toString : invalid dchar", eaten);
+                }
+
+        // return the produced output
+        return output [0..(pOut - output.ptr)];
+}
+
+
+/*******************************************************************************
+
+        Decode Utf8 produced by the above toString() method.
+
+        If the output is provided off the stack, it should be large
+        enough to encompass the entire transcoding; failing to do
+        so will cause the output to be moved onto the heap instead.
+
+        Returns a slice of the output buffer, corresponding to the
+        converted characters. For optimum performance, the returned
+        buffer should be specified as 'output' on subsequent calls.
+
+*******************************************************************************/
+
+dchar[] toString32 (char[] input, dchar[] output=null, uint* ate=null)
+{
+        int     produced;
+        char*   pIn = input.ptr;
+        char*   pMax = pIn + input.length;
+        char*   pValid;
+
+        if (ate is null)
+            if (input.length > output.length)
+                output.length = input.length;
+
+        if (input.length)
+        foreach (inout dchar d; output)
+                {
+                pValid = pIn;
+                dchar b = cast(dchar) *pIn;
+
+                if (b & 0x80)
+                    if (b < 0xe0)
+                       {
+                       b &= 0x1f;
+                       b = (b << 6) | (*++pIn & 0x3f);
+                       }
+                    else
+                       if (b < 0xf0)
+                          {
+                          b &= 0x0f;
+                          b = (b << 6) | (pIn[1] & 0x3f);
+                          b = (b << 6) | (pIn[2] & 0x3f);
+                          pIn += 2;
+                          }
+                       else
+                          {
+                          b &= 0x07;
+                          b = (b << 6) | (pIn[1] & 0x3f);
+                          b = (b << 6) | (pIn[2] & 0x3f);
+                          b = (b << 6) | (pIn[3] & 0x3f);
+
+                          if (b >= 0x110000)
+                              onUnicodeError ("Unicode.toString32 : invalid utf8 input", pIn - input.ptr);
+                          pIn += 3;
+                          }
+
+                d = b;
+                ++produced;
+
+                // did we read past the end of the input?
+                if (++pIn >= pMax)
+                    if (pIn > pMax)
+                       {
+                       // yep ~ return tail or throw error?
+                       if (ate)
+                          {
+                          pIn = pValid;
+                          --produced;
+                          break;
+                          }
+                       onUnicodeError ("Unicode.toString32 : incomplete utf8 input", pIn - input.ptr);
+                       }
+                    else
+                       break;
+                }
+
+        // do we still have some input left?
+        if (ate)
+            *ate = pIn - input.ptr;
+        else
+           if (pIn < pMax)
+               // this should never happen!
+               onUnicodeError ("Unicode.toString32 : utf8 overflow", pIn - input.ptr);
+
+        // return the produced output
+        return output [0..produced];
+}
+
+/*******************************************************************************
+
+        Encode Utf16 up to a maximum of 2 bytes long. Throws an exception
+        where the input dchar is greater than 0x10ffff.
+
+        If the output is provided off the stack, it should be large
+        enough to encompass the entire transcoding; failing to do
+        so will cause the output to be moved onto the heap instead.
+
+        Returns a slice of the output buffer, corresponding to the
+        converted characters. For optimum performance, the returned
+        buffer should be specified as 'output' on subsequent calls.
+
+*******************************************************************************/
+
+wchar[] toString16 (dchar[] input, wchar[] output=null, uint* ate=null)
+{
+        if (ate)
+            *ate = input.length;
+        else
+           {
+           int estimate = input.length * 2 + 2;
+           if (output.length < estimate)
+               output.length = estimate;
+           }
+
+        wchar* pOut = output.ptr;
+        wchar* pMax = pOut + output.length - 2;
+
+        foreach (int eaten, dchar b; input)
+                {
+                // about to overflow the output?
+                if (pOut > pMax)
+                   {
+                   // if streaming, just return the unused input
+                   if (ate)
+                      {
+                      *ate = eaten;
+                      break;
+                      }
+
+                   // reallocate the output buffer
+                   int len = pOut - output.ptr;
+                   output.length = len + len / 2;
+                   pOut = output.ptr + len;
+                   pMax = output.ptr + output.length - 2;
+                   }
+
+                if (b < 0x10000)
+                    *pOut++ = b;
+                else
+                   if (b < 0x110000)
+                      {
+                      pOut[0] = 0xd800 | (((b - 0x10000) >> 10) & 0x3ff);
+                      pOut[1] = 0xdc00 | ((b - 0x10000) & 0x3ff);
+                      pOut += 2;
+                      }
+                   else
+                      onUnicodeError ("Unicode.toString16 : invalid dchar", eaten);
+                }
+
+        // return the produced output
+        return output [0..(pOut - output.ptr)];
+}
+
+/*******************************************************************************
+
+        Decode Utf16 produced by the above toString16() method.
+
+        If the output is provided off the stack, it should be large
+        enough to encompass the entire transcoding; failing to do
+        so will cause the output to be moved onto the heap instead.
+
+        Returns a slice of the output buffer, corresponding to the
+        converted characters. For optimum performance, the returned
+        buffer should be specified as 'output' on subsequent calls.
+
+*******************************************************************************/
+
+dchar[] toString32 (wchar[] input, dchar[] output=null, uint* ate=null)
+{
+        int     produced;
+        wchar*  pIn = input.ptr;
+        wchar*  pMax = pIn + input.length;
+        wchar*  pValid;
+
+        if (ate is null)
+            if (input.length > output.length)
+                output.length = input.length;
+
+        if (input.length)
+        foreach (inout dchar d; output)
+                {
+                pValid = pIn;
+                dchar b = cast(dchar) *pIn;
+
+                // simple conversion ~ see http://www.unicode.org/faq/utf_bom.html#35
+                if (b >= 0xd800 && b <= 0xdfff)
+                    b = ((b - 0xd7c0) << 10) + (*++pIn - 0xdc00);
+
+                if (b >= 0x110000)
+                    onUnicodeError ("Unicode.toString32 : invalid utf16 input", pIn - input.ptr);
+
+                d = b;
+                ++produced;
+
+                if (++pIn >= pMax)
+                    if (pIn > pMax)
+                       {
+                       // yep ~ return tail or throw error?
+                       if (ate)
+                          {
+                          pIn = pValid;
+                          --produced;
+                          break;
+                          }
+                       onUnicodeError ("Unicode.toString32 : incomplete utf16 input", pIn - input.ptr);
+                       }
+                    else
+                       break;
+                }
+
+        // do we still have some input left?
+        if (ate)
+            *ate = pIn - input.ptr;
+        else
+           if (pIn < pMax)
+               // this should never happen!
+               onUnicodeError ("Unicode.toString32 : utf16 overflow", pIn - input.ptr);
+
+        // return the produced output
+        return output [0..produced];
+}
+
+
author	lindquist
date	Fri, 11 Jan 2008 17:57:40 +0100
parents
children