Mercurial > projects > dil

--- a/trunk/src/util/utf.d	Mon Feb 25 02:56:22 2008 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,975 +0,0 @@
-// utf.d
-
-/*
- *  Copyright (C) 2003-2004 by Digital Mars, www.digitalmars.com
- *  Written by Walter Bright
- *
- *  This software is provided 'as-is', without any express or implied
- *  warranty. In no event will the authors be held liable for any damages
- *  arising from the use of this software.
- *
- *  Permission is granted to anyone to use this software for any purpose,
- *  including commercial applications, and to alter it and redistribute it
- *  freely, subject to the following restrictions:
- *
- *  o  The origin of this software must not be misrepresented; you must not
- *     claim that you wrote the original software. If you use this software
- *     in a product, an acknowledgment in the product documentation would be
- *     appreciated but is not required.
- *  o  Altered source versions must be plainly marked as such, and must not
- *     be misrepresented as being the original software.
- *  o  This notice may not be removed or altered from any source
- *     distribution.
- */
-
-/********************************************
- * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
- *
- * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
- * wchar type.
- * For linux systems, the C wchar_t type is UTF-32 and corresponds to
- * the D utf.dchar type.
- *
- * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
- *
- * See_Also:
- *	$(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
- *	$(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
- *	$(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
- * Macros:
- *	WIKI = Phobos/StdUtf
- */
-
-/*
-  Note: this is not the original file!
-  Modified by Aziz Köksal:
-    Only commented out deprecated class UtfError.
-*/
-
-module util.utf;
-
-// private import std.stdio;
-
-//debug=utf;		// uncomment to turn on debugging printf's
-/+
-deprecated class UtfError : Error
-{
-    size_t idx;	// index in string of where error occurred
-
-    this(char[] s, size_t i)
-    {
-	idx = i;
-	super(s);
-    }
-}
-+/
-/**********************************
- * Exception class that is thrown upon any errors.
- */
-
-class UtfException : Exception
-{
-    size_t idx;	/// index in string of where error occurred
-
-    this(char[] s, size_t i)
-    {
-	idx = i;
-	super(s);
-    }
-}
-
-/*******************************
- * Test if c is a valid UTF-32 character.
- *
- * \uFFFE and \uFFFF are considered valid by this function,
- * as they are permitted for internal use by an application,
- * but they are not allowed for interchange by the Unicode standard.
- *
- * Returns: true if it is, false if not.
- */
-
-bool isValidDchar(dchar c)
-{
-    /* Note: FFFE and FFFF are specifically permitted by the
-     * Unicode standard for application internal use, but are not
-     * allowed for interchange.
-     * (thanks to Arcane Jill)
-     */
-
-    return c < 0xD800 ||
-	(c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
-}
-
-unittest
-{
-    debug(utf) printf("utf.isValidDchar.unittest\n");
-    assert(isValidDchar(cast(dchar)'a') == true);
-    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
-}
-
-
-ubyte[256] UTF8stride =
-[
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
-];
-
-/**
- * stride() returns the length of a UTF-8 sequence starting at index i
- * in string s.
- * Returns:
- *	The number of bytes in the UTF-8 sequence or
- *	0xFF meaning s[i] is not the start of of UTF-8 sequence.
- */
-
-uint stride(char[] s, size_t i)
-{
-    return UTF8stride[s[i]];
-}
-
-/**
- * stride() returns the length of a UTF-16 sequence starting at index i
- * in string s.
- */
-
-uint stride(wchar[] s, size_t i)
-{   uint u = s[i];
-    return 1 + (u >= 0xD800 && u <= 0xDBFF);
-}
-
-/**
- * stride() returns the length of a UTF-32 sequence starting at index i
- * in string s.
- * Returns: The return value will always be 1.
- */
-
-uint stride(dchar[] s, size_t i)
-{
-    return 1;
-}
-
-/*******************************************
- * Given an index i into an array of characters s[],
- * and assuming that index i is at the start of a UTF character,
- * determine the number of UCS characters up to that index i.
- */
-
-size_t toUCSindex(char[] s, size_t i)
-{
-    size_t n;
-    size_t j;
-    size_t stride;
-
-    for (j = 0; j < i; j += stride)
-    {
-	stride = UTF8stride[s[j]];
-	if (stride == 0xFF)
-	    goto Lerr;
-	n++;
-    }
-    if (j > i)
-    {
-      Lerr:
-	throw new UtfException("1invalid UTF-8 sequence", j);
-    }
-    return n;
-}
-
-/** ditto */
-
-size_t toUCSindex(wchar[] s, size_t i)
-{
-    size_t n;
-    size_t j;
-
-    for (j = 0; j < i; )
-    {	uint u = s[j];
-
-	j += 1 + (u >= 0xD800 && u <= 0xDBFF);
-	n++;
-    }
-    if (j > i)
-    {
-      Lerr:
-	throw new UtfException("2invalid UTF-16 sequence", j);
-    }
-    return n;
-}
-
-/** ditto */
-
-size_t toUCSindex(dchar[] s, size_t i)
-{
-    return i;
-}
-
-/******************************************
- * Given a UCS index n into an array of characters s[], return the UTF index.
- */
-
-size_t toUTFindex(char[] s, size_t n)
-{
-    size_t i;
-
-    while (n--)
-    {
-	uint j = UTF8stride[s[i]];
-	if (j == 0xFF)
-	    throw new UtfException("3invalid UTF-8 sequence", i);
-	i += j;
-    }
-    return i;
-}
-
-/** ditto */
-
-size_t toUTFindex(wchar[] s, size_t n)
-{
-    size_t i;
-
-    while (n--)
-    {	wchar u = s[i];
-
-	i += 1 + (u >= 0xD800 && u <= 0xDBFF);
-    }
-    return i;
-}
-
-/** ditto */
-
-size_t toUTFindex(dchar[] s, size_t n)
-{
-    return n;
-}
-
-/* =================== Decode ======================= */
-
-/***************
- * Decodes and returns character starting at s[idx]. idx is advanced past the
- * decoded character. If the character is not well formed, a UtfException is
- * thrown and idx remains unchanged.
- */
-
-dchar decode(char[] s, inout size_t idx)
-    in
-    {
-	assert(idx >= 0 && idx < s.length);
-    }
-    out (result)
-    {
-	assert(isValidDchar(result));
-    }
-    body
-    {
-	size_t len = s.length;
-	dchar V;
-	size_t i = idx;
-	char u = s[i];
-
-	if (u & 0x80)
-	{   uint n;
-	    char u2;
-
-	    /* The following encodings are valid, except for the 5 and 6 byte
-	     * combinations:
-	     *	0xxxxxxx
-	     *	110xxxxx 10xxxxxx
-	     *	1110xxxx 10xxxxxx 10xxxxxx
-	     *	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-	     *	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-	     *	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-	     */
-	    for (n = 1; ; n++)
-	    {
-		if (n > 4)
-		    goto Lerr;		// only do the first 4 of 6 encodings
-		if (((u << n) & 0x80) == 0)
-		{
-		    if (n == 1)
-			goto Lerr;
-		    break;
-		}
-	    }
-
-	    // Pick off (7 - n) significant bits of B from first byte of octet
-	    V = cast(dchar)(u & ((1 << (7 - n)) - 1));
-
-	    if (i + (n - 1) >= len)
-		goto Lerr;			// off end of string
-
-	    /* The following combinations are overlong, and illegal:
-	     *	1100000x (10xxxxxx)
-	     *	11100000 100xxxxx (10xxxxxx)
-	     *	11110000 1000xxxx (10xxxxxx 10xxxxxx)
-	     *	11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
-	     *	11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
-	     */
-	    u2 = s[i + 1];
-	    if ((u & 0xFE) == 0xC0 ||
-		(u == 0xE0 && (u2 & 0xE0) == 0x80) ||
-		(u == 0xF0 && (u2 & 0xF0) == 0x80) ||
-		(u == 0xF8 && (u2 & 0xF8) == 0x80) ||
-		(u == 0xFC && (u2 & 0xFC) == 0x80))
-		goto Lerr;			// overlong combination
-
-	    for (uint j = 1; j != n; j++)
-	    {
-		u = s[i + j];
-		if ((u & 0xC0) != 0x80)
-		    goto Lerr;			// trailing bytes are 10xxxxxx
-		V = (V << 6) | (u & 0x3F);
-	    }
-	    if (!isValidDchar(V))
-		goto Lerr;
-	    i += n;
-	}
-	else
-	{
-	    V = cast(dchar) u;
-	    i++;
-	}
-
-	idx = i;
-	return V;
-
-      Lerr:
-	//printf("\ndecode: idx = %d, i = %d, length = %d s = \n'%.*s'\n%x\n'%.*s'\n", idx, i, s.length, s, s[i], s[i .. length]);
-	throw new UtfException("4invalid UTF-8 sequence", i);
-    }
-
-unittest
-{   size_t i;
-    dchar c;
-
-    debug(utf) printf("utf.decode.unittest\n");
-
-    static char[] s1 = "abcd";
-    i = 0;
-    c = decode(s1, i);
-    assert(c == cast(dchar)'a');
-    assert(i == 1);
-    c = decode(s1, i);
-    assert(c == cast(dchar)'b');
-    assert(i == 2);
-
-    static char[] s2 = "\xC2\xA9";
-    i = 0;
-    c = decode(s2, i);
-    assert(c == cast(dchar)'\u00A9');
-    assert(i == 2);
-
-    static char[] s3 = "\xE2\x89\xA0";
-    i = 0;
-    c = decode(s3, i);
-    assert(c == cast(dchar)'\u2260');
-    assert(i == 3);
-
-    static char[][] s4 =
-    [	"\xE2\x89",		// too short
-	"\xC0\x8A",
-	"\xE0\x80\x8A",
-	"\xF0\x80\x80\x8A",
-	"\xF8\x80\x80\x80\x8A",
-	"\xFC\x80\x80\x80\x80\x8A",
-    ];
-
-    for (int j = 0; j < s4.length; j++)
-    {
-	try
-	{
-	    i = 0;
-	    c = decode(s4[j], i);
-	    assert(0);
-	}
-	catch (UtfException u)
-	{
-	    i = 23;
-	    delete u;
-	}
-	assert(i == 23);
-    }
-}
-
-/** ditto */
-
-dchar decode(wchar[] s, inout size_t idx)
-    in
-    {
-	assert(idx >= 0 && idx < s.length);
-    }
-    out (result)
-    {
-	assert(isValidDchar(result));
-    }
-    body
-    {
-	char[] msg;
-	dchar V;
-	size_t i = idx;
-	uint u = s[i];
-
-	if (u & ~0x7F)
-	{   if (u >= 0xD800 && u <= 0xDBFF)
-	    {   uint u2;
-
-		if (i + 1 == s.length)
-		{   msg = "surrogate UTF-16 high value past end of string";
-		    goto Lerr;
-		}
-		u2 = s[i + 1];
-		if (u2 < 0xDC00 || u2 > 0xDFFF)
-		{   msg = "surrogate UTF-16 low value out of range";
-		    goto Lerr;
-		}
-		u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
-		i += 2;
-	    }
-	    else if (u >= 0xDC00 && u <= 0xDFFF)
-	    {   msg = "unpaired surrogate UTF-16 value";
-		goto Lerr;
-	    }
-	    else if (u == 0xFFFE || u == 0xFFFF)
-	    {   msg = "illegal UTF-16 value";
-		goto Lerr;
-	    }
-	    else
-		i++;
-	}
-	else
-	{
-	    i++;
-	}
-
-	idx = i;
-	return cast(dchar)u;
-
-      Lerr:
-	throw new UtfException(msg, i);
-    }
-
-/** ditto */
-
-dchar decode(dchar[] s, inout size_t idx)
-    in
-    {
-	assert(idx >= 0 && idx < s.length);
-    }
-    body
-    {
-	size_t i = idx;
-	dchar c = s[i];
-
-	if (!isValidDchar(c))
-	    goto Lerr;
-	idx = i + 1;
-	return c;
-
-      Lerr:
-	throw new UtfException("5invalid UTF-32 value", i);
-    }
-
-
-/* =================== Encode ======================= */
-
-/*******************************
- * Encodes character c and appends it to array s[].
- */
-
-void encode(inout char[] s, dchar c)
-    in
-    {
-	assert(isValidDchar(c));
-    }
-    body
-    {
-	char[] r = s;
-
-	if (c <= 0x7F)
-	{
-	    r ~= cast(char) c;
-	}
-	else
-	{
-	    char[4] buf;
-	    uint L;
-
-	    if (c <= 0x7FF)
-	    {
-		buf[0] = cast(char)(0xC0 | (c >> 6));
-		buf[1] = cast(char)(0x80 | (c & 0x3F));
-		L = 2;
-	    }
-	    else if (c <= 0xFFFF)
-	    {
-		buf[0] = cast(char)(0xE0 | (c >> 12));
-		buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
-		buf[2] = cast(char)(0x80 | (c & 0x3F));
-		L = 3;
-	    }
-	    else if (c <= 0x10FFFF)
-	    {
-		buf[0] = cast(char)(0xF0 | (c >> 18));
-		buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
-		buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
-		buf[3] = cast(char)(0x80 | (c & 0x3F));
-		L = 4;
-	    }
-	    else
-	    {
-		assert(0);
-	    }
-	    r ~= buf[0 .. L];
-	}
-	s = r;
-    }
-
-unittest
-{
-    debug(utf) printf("utf.encode.unittest\n");
-
-    char[] s = "abcd";
-    encode(s, cast(dchar)'a');
-    assert(s.length == 5);
-    assert(s == "abcda");
-
-    encode(s, cast(dchar)'\u00A9');
-    assert(s.length == 7);
-    assert(s == "abcda\xC2\xA9");
-    //assert(s == "abcda\u00A9");	// BUG: fix compiler
-
-    encode(s, cast(dchar)'\u2260');
-    assert(s.length == 10);
-    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
-}
-
-/** ditto */
-
-void encode(inout wchar[] s, dchar c)
-    in
-    {
-	assert(isValidDchar(c));
-    }
-    body
-    {
-	wchar[] r = s;
-
-	if (c <= 0xFFFF)
-	{
-	    r ~= cast(wchar) c;
-	}
-	else
-	{
-	    wchar[2] buf;
-
-	    buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
-	    buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
-	    r ~= buf;
-	}
-	s = r;
-    }
-
-/** ditto */
-
-void encode(inout dchar[] s, dchar c)
-    in
-    {
-	assert(isValidDchar(c));
-    }
-    body
-    {
-	s ~= c;
-    }
-
-/* =================== Validation ======================= */
-
-/***********************************
- * Checks to see if string is well formed or not. Throws a UtfException if it is
- * not. Use to check all untrusted input for correctness.
- */
-
-void validate(char[] s)
-{
-    size_t len = s.length;
-    size_t i;
-
-    for (i = 0; i < len; )
-    {
-	decode(s, i);
-    }
-}
-
-/** ditto */
-
-void validate(wchar[] s)
-{
-    size_t len = s.length;
-    size_t i;
-
-    for (i = 0; i < len; )
-    {
-	decode(s, i);
-    }
-}
-
-/** ditto */
-
-void validate(dchar[] s)
-{
-    size_t len = s.length;
-    size_t i;
-
-    for (i = 0; i < len; )
-    {
-	decode(s, i);
-    }
-}
-
-/* =================== Conversion to UTF8 ======================= */
-
-char[] toUTF8(char[4] buf, dchar c)
-    in
-    {
-	assert(isValidDchar(c));
-    }
-    body
-    {
-	if (c <= 0x7F)
-	{
-	    buf[0] = cast(char) c;
-	    return buf[0 .. 1];
-	}
-	else if (c <= 0x7FF)
-	{
-	    buf[0] = cast(char)(0xC0 | (c >> 6));
-	    buf[1] = cast(char)(0x80 | (c & 0x3F));
-	    return buf[0 .. 2];
-	}
-	else if (c <= 0xFFFF)
-	{
-	    buf[0] = cast(char)(0xE0 | (c >> 12));
-	    buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
-	    buf[2] = cast(char)(0x80 | (c & 0x3F));
-	    return buf[0 .. 3];
-	}
-	else if (c <= 0x10FFFF)
-	{
-	    buf[0] = cast(char)(0xF0 | (c >> 18));
-	    buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
-	    buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
-	    buf[3] = cast(char)(0x80 | (c & 0x3F));
-	    return buf[0 .. 4];
-	}
-	assert(0);
-    }
-
-/*******************
- * Encodes string s into UTF-8 and returns the encoded string.
- */
-
-char[] toUTF8(char[] s)
-    in
-    {
-	validate(s);
-    }
-    body
-    {
-	return s;
-    }
-
-/** ditto */
-
-char[] toUTF8(wchar[] s)
-{
-    char[] r;
-    size_t i;
-    size_t slen = s.length;
-
-    r.length = slen;
-
-    for (i = 0; i < slen; i++)
-    {	wchar c = s[i];
-
-	if (c <= 0x7F)
-	    r[i] = cast(char)c;		// fast path for ascii
-	else
-	{
-	    r.length = i;
-	    foreach (dchar c; s[i .. slen])
-	    {
-		encode(r, c);
-	    }
-	    break;
-	}
-    }
-    return r;
-}
-
-/** ditto */
-
-char[] toUTF8(dchar[] s)
-{
-    char[] r;
-    size_t i;
-    size_t slen = s.length;
-
-    r.length = slen;
-
-    for (i = 0; i < slen; i++)
-    {	dchar c = s[i];
-
-	if (c <= 0x7F)
-	    r[i] = cast(char)c;		// fast path for ascii
-	else
-	{
-	    r.length = i;
-	    foreach (dchar d; s[i .. slen])
-	    {
-		encode(r, d);
-	    }
-	    break;
-	}
-    }
-    return r;
-}
-
-/* =================== Conversion to UTF16 ======================= */
-
-wchar[] toUTF16(wchar[2] buf, dchar c)
-    in
-    {
-	assert(isValidDchar(c));
-    }
-    body
-    {
-	if (c <= 0xFFFF)
-	{
-	    buf[0] = cast(wchar) c;
-	    return buf[0 .. 1];
-	}
-	else
-	{
-	    buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
-	    buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
-	    return buf[0 .. 2];
-	}
-    }
-
-/****************
- * Encodes string s into UTF-16 and returns the encoded string.
- * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
- * an LPWSTR or LPCWSTR argument.
- */
-
-wchar[] toUTF16(char[] s)
-{
-    wchar[] r;
-    size_t slen = s.length;
-
-    r.length = slen;
-    r.length = 0;
-    for (size_t i = 0; i < slen; )
-    {
-	dchar c = s[i];
-	if (c <= 0x7F)
-	{
-	    i++;
-	    r ~= cast(wchar)c;
-	}
-	else
-	{
-	    c = decode(s, i);
-	    encode(r, c);
-	}
-    }
-    return r;
-}
-
-/** ditto */
-
-wchar* toUTF16z(char[] s)
-{
-    wchar[] r;
-    size_t slen = s.length;
-
-    r.length = slen + 1;
-    r.length = 0;
-    for (size_t i = 0; i < slen; )
-    {
-	dchar c = s[i];
-	if (c <= 0x7F)
-	{
-	    i++;
-	    r ~= cast(wchar)c;
-	}
-	else
-	{
-	    c = decode(s, i);
-	    encode(r, c);
-	}
-    }
-    r ~= "\000";
-    return r.ptr;
-}
-
-/** ditto */
-
-wchar[] toUTF16(wchar[] s)
-    in
-    {
-	validate(s);
-    }
-    body
-    {
-	return s;
-    }
-
-/** ditto */
-
-wchar[] toUTF16(dchar[] s)
-{
-    wchar[] r;
-    size_t slen = s.length;
-
-    r.length = slen;
-    r.length = 0;
-    for (size_t i = 0; i < slen; i++)
-    {
-	encode(r, s[i]);
-    }
-    return r;
-}
-
-/* =================== Conversion to UTF32 ======================= */
-
-/*****
- * Encodes string s into UTF-32 and returns the encoded string.
- */
-
-dchar[] toUTF32(char[] s)
-{
-    dchar[] r;
-    size_t slen = s.length;
-    size_t j = 0;
-
-    r.length = slen;		// r[] will never be longer than s[]
-    for (size_t i = 0; i < slen; )
-    {
-	dchar c = s[i];
-	if (c >= 0x80)
-	    c = decode(s, i);
-	else
-	    i++;		// c is ascii, no need for decode
-	r[j++] = c;
-    }
-    return r[0 .. j];
-}
-
-/** ditto */
-
-dchar[] toUTF32(wchar[] s)
-{
-    dchar[] r;
-    size_t slen = s.length;
-    size_t j = 0;
-
-    r.length = slen;		// r[] will never be longer than s[]
-    for (size_t i = 0; i < slen; )
-    {
-	dchar c = s[i];
-	if (c >= 0x80)
-	    c = decode(s, i);
-	else
-	    i++;		// c is ascii, no need for decode
-	r[j++] = c;
-    }
-    return r[0 .. j];
-}
-
-/** ditto */
-
-dchar[] toUTF32(dchar[] s)
-    in
-    {
-	validate(s);
-    }
-    body
-    {
-	return s;
-    }
-
-/* ================================ tests ================================== */
-
-unittest
-{
-    debug(utf) printf("utf.toUTF.unittest\n");
-
-    char[] c;
-    wchar[] w;
-    dchar[] d;
-
-    c = "hello";
-    w = toUTF16(c);
-    assert(w == "hello");
-    d = toUTF32(c);
-    assert(d == "hello");
-
-    c = toUTF8(w);
-    assert(c == "hello");
-    d = toUTF32(w);
-    assert(d == "hello");
-
-    c = toUTF8(d);
-    assert(c == "hello");
-    w = toUTF16(d);
-    assert(w == "hello");
-
-
-    c = "hel\u1234o";
-    w = toUTF16(c);
-    assert(w == "hel\u1234o");
-    d = toUTF32(c);
-    assert(d == "hel\u1234o");
-
-    c = toUTF8(w);
-    assert(c == "hel\u1234o");
-    d = toUTF32(w);
-    assert(d == "hel\u1234o");
-
-    c = toUTF8(d);
-    assert(c == "hel\u1234o");
-    w = toUTF16(d);
-    assert(w == "hel\u1234o");
-
-
-    c = "he\U0010AAAAllo";
-    w = toUTF16(c);
-    //foreach (wchar c; w) printf("c = x%x\n", c);
-    //foreach (wchar c; cast(wchar[])"he\U0010AAAAllo") printf("c = x%x\n", c);
-    assert(w == "he\U0010AAAAllo");
-    d = toUTF32(c);
-    assert(d == "he\U0010AAAAllo");
-
-    c = toUTF8(w);
-    assert(c == "he\U0010AAAAllo");
-    d = toUTF32(w);
-    assert(d == "he\U0010AAAAllo");
-
-    c = toUTF8(d);
-    assert(c == "he\U0010AAAAllo");
-    w = toUTF16(d);
-    assert(w == "he\U0010AAAAllo");
-}