view dmd2/utf.c @ 945:03d7c4aac654

SWITCHED TO LLVM 2.5 ! Applied patch from ticket #129 to compile against latest LLVM. Thanks Frits van Bommel. Fixed implicit return by asm block at the end of a function on x86-32. Other architectures will produce an error at the moment. Adding support for new targets is fairly simple. Fixed return calling convention for complex numbers, ST and ST(1) were switched around. Added some testcases. I've run a dstress test and there are no regressions. However, the runtime does not seem to compile with symbolic debug information. -O3 -release -inline works well and is what I used for the dstress run. Tango does not compile, a small workaround is needed in tango.io.digest.Digest.Digest.hexDigest. See ticket #206 .
author Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date Sun, 08 Feb 2009 05:26:54 +0100
parents f04dde6e882c
children
line wrap: on
line source

// utf.c
// Copyright (c) 2003 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// License for redistribution is by either the Artistic License
// in artistic.txt, or the GNU General Public License in gnu.txt.
// See the included readme.txt for details.

// Description of UTF-8 at:
// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8

#include <stdio.h>
#include <assert.h>

#include "utf.h"

/********************************************
 * Test whether c is acceptable as a decoded character.
 * Accepted values are those below 0xD800, plus those above 0xDFFF
 * up to and including 0x10FFFF, with 0xFFFE and 0xFFFF excluded.
 * Returns:
 *	1	c is acceptable
 *	0	c is rejected
 */

int utf_isValidDchar(dchar_t c)
{
    // Everything below the 0xD800..0xDFFF block is fine
    if (c < 0xD800)
	return 1;

    // Reject the 0xD800..0xDFFF block and anything past 0x10FFFF
    if (c <= 0xDFFF || c > 0x10FFFF)
	return 0;

    // What remains is valid unless it is one of the two excluded values
    return c != 0xFFFE && c != 0xFFFF;
}

/********************************************
 * Decode a single UTF-8 character sequence.
 * Params:
 *	s	buffer holding the UTF-8 bytes
 *	len	number of bytes in s
 *	pidx	on entry, index of the first byte of the sequence
 *		(must be < len); on success, set to the index just
 *		past the sequence; on error, set to the entry index + 1
 *	presult	on success, receives the decoded character; on error,
 *		receives the byte at the entry index
 * Returns:
 *	NULL	success
 *	!=NULL	error message string
 */

const char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult)
{
    dchar_t V;
    size_t i = *pidx;
    unsigned char u;

    // Check the index before s[i] is read. i is unsigned, so only the
    // upper bound can be violated.
    assert(i < len);
    u = s[i];

    if (u & 0x80)
    {   unsigned n;			// total number of bytes in the sequence
	unsigned char u2;

	/* The following encodings are valid, except for the 5 and 6 byte
	 * combinations:
	 *	0xxxxxxx
	 *	110xxxxx 10xxxxxx
	 *	1110xxxx 10xxxxxx 10xxxxxx
	 *	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	 *	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	 *	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	 * A lead byte of the form 10xxxxxx is a stray trailing byte and
	 * is rejected as well.
	 */
	if ((u & 0xE0) == 0xC0)
	    n = 2;
	else if ((u & 0xF0) == 0xE0)
	    n = 3;
	else if ((u & 0xF8) == 0xF0)
	    n = 4;
	else
	    goto Lerr;			// 10xxxxxx, or a 5/6 byte (or longer) lead

	// Pick off (7 - n) significant bits from the first byte
	V = (dchar_t)(u & ((1 << (7 - n)) - 1));

	if (i + (n - 1) >= len)
	    goto Lerr;			// off end of string

	/* The following combinations are overlong, and illegal:
	 *	1100000x (10xxxxxx)
	 *	11100000 100xxxxx (10xxxxxx)
	 *	11110000 1000xxxx (10xxxxxx 10xxxxxx)
	 * The 5 and 6 byte overlong forms need no test here because
	 * their lead bytes were already rejected above.
	 */
	u2 = s[i + 1];			// in range: n >= 2 and i + n - 1 < len
	if ((u & 0xFE) == 0xC0 ||
	    (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
	    (u == 0xF0 && (u2 & 0xF0) == 0x80))
	    goto Lerr;			// overlong combination

	// Fold in 6 payload bits from each trailing byte
	for (unsigned j = 1; j != n; j++)
	{
	    unsigned char t = s[i + j];

	    if ((t & 0xC0) != 0x80)
		goto Lerr;		// trailing bytes are 10xxxxxx
	    V = (V << 6) | (t & 0x3F);
	}
	if (!utf_isValidDchar(V))
	    goto Lerr;
	i += n;
    }
    else
    {
	// Single byte 0xxxxxxx: the byte is the character
	V = (dchar_t) u;
	i++;
    }

    assert(utf_isValidDchar(V));
    *pidx = i;
    *presult = V;
    return NULL;

  Lerr:
    // i still holds the entry index here; report the offending byte and
    // step past it so the caller can resume with the next byte.
    *presult = (dchar_t) s[i];
    *pidx = i + 1;
    return "invalid UTF-8 sequence";
}

/***************************************************
 * Validate a UTF-8 string by decoding it one character
 * sequence at a time with utf_decodeChar(), stopping at
 * the first sequence that fails to decode.
 * Params:
 *	s	buffer holding the UTF-8 bytes
 *	len	number of bytes in s
 * Returns:
 *	NULL	success (also returned when len is 0)
 *	!=NULL	error message string from utf_decodeChar()
 */

const char *utf_validateString(unsigned char *s, size_t len)
{
    size_t pos = 0;

    while (pos < len)
    {
	dchar_t unused;		// decoded value is not needed here
	const char *msg = utf_decodeChar(s, len, &pos, &unused);

	if (msg)
	    return msg;
    }
    return NULL;
}


/********************************************
 * Decode a single UTF-16 character sequence.
 * Params:
 *	s	buffer holding the UTF-16 code units
 *	len	number of code units in s
 *	pidx	on entry, index of the first code unit of the sequence
 *		(must be < len); on success, set to the index just
 *		past the sequence; on error, set to the entry index + 1
 *	presult	on success, receives the decoded character; on error,
 *		receives the code unit at the entry index
 * Returns:
 *	NULL	success
 *	!=NULL	error message string
 */

const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t *presult)
{
    const char *msg;
    size_t i = *pidx;
    unsigned u;

    // Check the index before s[i] is read. i is unsigned, so only the
    // upper bound can be violated.
    assert(i < len);
    u = s[i];

    if (u >= 0xD800 && u <= 0xDBFF)
    {   unsigned u2;

	// High surrogate: a low surrogate must follow
	if (i + 1 == len)
	{   msg = "surrogate UTF-16 high value past end of string";
	    goto Lerr;
	}
	u2 = s[i + 1];
	if (u2 < 0xDC00 || u2 > 0xDFFF)
	{   msg = "surrogate UTF-16 low value out of range";
	    goto Lerr;
	}
	// Combine the pair; 0xD7C0 is 0xD800 - (0x10000 >> 10), so the
	// 0x10000 offset is folded into the subtraction.
	u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
	i += 2;
    }
    else if (u >= 0xDC00 && u <= 0xDFFF)
    {   msg = "unpaired surrogate UTF-16 value";
	goto Lerr;
    }
    else if (u == 0xFFFE || u == 0xFFFF)
    {   msg = "illegal UTF-16 value";
	goto Lerr;
    }
    else
    {
	// Any other code unit stands for itself
	i++;
    }

    assert(utf_isValidDchar(u));
    *pidx = i;
    *presult = (dchar_t)u;
    return NULL;

  Lerr:
    // i still holds the entry index here; report the offending code unit
    // and step past it so the caller can resume with the next one.
    *presult = (dchar_t)s[i];
    *pidx = i + 1;
    return msg;
}