Mercurial > projects > ldc

diff dmd2/utf.c @ 758:f04dde6e882c
Added initial D2 support, D2 frontend and changes to codegen to make things compile.
author: Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date: Tue, 11 Nov 2008 01:38:48 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dmd2/utf.c	Tue Nov 11 01:38:48 2008 +0100
@@ -0,0 +1,193 @@
+// utf.c
+// Copyright (c) 2003 by Digital Mars
+// All Rights Reserved
+// written by Walter Bright
+// http://www.digitalmars.com
+// License for redistribution is by either the Artistic License
+// in artistic.txt, or the GNU General Public License in gnu.txt.
+// See the included readme.txt for details.
+
+// Description of UTF-8 at:
+// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "utf.h"
+
+int utf_isValidDchar(dchar_t c)
+{
+    return c < 0xD800 ||
+	(c > 0xDFFF && c <= 0x10FFFF && c != 0xFFFE && c != 0xFFFF);
+}
+
+/********************************************
+ * Decode a single UTF-8 character sequence.
+ * Returns:
+ *	NULL	success
+ *	!=NULL	error message string
+ */
+
+const char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult)
+{
+    dchar_t V;
+    size_t i = *pidx;
+    unsigned char u = s[i];
+
+    assert(i >= 0 && i < len);
+
+    if (u & 0x80)
+    {   unsigned n;
+	unsigned char u2;
+
+	/* The following encodings are valid, except for the 5 and 6 byte
+	 * combinations:
+	 *	0xxxxxxx
+	 *	110xxxxx 10xxxxxx
+	 *	1110xxxx 10xxxxxx 10xxxxxx
+	 *	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	 *	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+	 *	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+	 */
+	for (n = 1; ; n++)
+	{
+	    if (n > 4)
+		goto Lerr;		// only do the first 4 of 6 encodings
+	    if (((u << n) & 0x80) == 0)
+	    {
+		if (n == 1)
+		    goto Lerr;
+		break;
+	    }
+	}
+
+	// Pick off (7 - n) significant bits of B from first byte of octet
+	V = (dchar_t)(u & ((1 << (7 - n)) - 1));
+
+	if (i + (n - 1) >= len)
+	    goto Lerr;			// off end of string
+
+	/* The following combinations are overlong, and illegal:
+	 *	1100000x (10xxxxxx)
+	 *	11100000 100xxxxx (10xxxxxx)
+	 *	11110000 1000xxxx (10xxxxxx 10xxxxxx)
+	 *	11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
+	 *	11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
+	 */
+	u2 = s[i + 1];
+	if ((u & 0xFE) == 0xC0 ||
+	    (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
+	    (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
+	    (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
+	    (u == 0xFC && (u2 & 0xFC) == 0x80))
+	    goto Lerr;			// overlong combination
+
+	for (unsigned j = 1; j != n; j++)
+	{
+	    u = s[i + j];
+	    if ((u & 0xC0) != 0x80)
+		goto Lerr;			// trailing bytes are 10xxxxxx
+	    V = (V << 6) | (u & 0x3F);
+	}
+	if (!utf_isValidDchar(V))
+	    goto Lerr;
+	i += n;
+    }
+    else
+    {
+	V = (dchar_t) u;
+	i++;
+    }
+
+    assert(utf_isValidDchar(V));
+    *pidx = i;
+    *presult = V;
+    return NULL;
+
+  Lerr:
+    *presult = (dchar_t) s[i];
+    *pidx = i + 1;
+    return "invalid UTF-8 sequence";
+}
+
+/***************************************************
+ * Validate a UTF-8 string.
+ * Returns:
+ *	NULL	success
+ *	!=NULL	error message string
+ */
+
+const char *utf_validateString(unsigned char *s, size_t len)
+{
+    size_t idx;
+    const char *err = NULL;
+    dchar_t dc;
+
+    for (idx = 0; idx < len; )
+    {
+	err = utf_decodeChar(s, len, &idx, &dc);
+	if (err)
+	    break;
+    }
+    return err;
+}
+
+
+/********************************************
+ * Decode a single UTF-16 character sequence.
+ * Returns:
+ *	NULL	success
+ *	!=NULL	error message string
+ */
+
+
+const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t *presult)
+{
+    const char *msg;
+    size_t i = *pidx;
+    unsigned u = s[i];
+
+    assert(i >= 0 && i < len);
+    if (u & ~0x7F)
+    {   if (u >= 0xD800 && u <= 0xDBFF)
+	{   unsigned u2;
+
+	    if (i + 1 == len)
+	    {   msg = "surrogate UTF-16 high value past end of string";
+		goto Lerr;
+	    }
+	    u2 = s[i + 1];
+	    if (u2 < 0xDC00 || u2 > 0xDFFF)
+	    {   msg = "surrogate UTF-16 low value out of range";
+		goto Lerr;
+	    }
+	    u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
+	    i += 2;
+	}
+	else if (u >= 0xDC00 && u <= 0xDFFF)
+	{   msg = "unpaired surrogate UTF-16 value";
+	    goto Lerr;
+	}
+	else if (u == 0xFFFE || u == 0xFFFF)
+	{   msg = "illegal UTF-16 value";
+	    goto Lerr;
+	}
+	else
+	    i++;
+    }
+    else
+    {
+	i++;
+    }
+
+    assert(utf_isValidDchar(u));
+    *pidx = i;
+    *presult = (dchar_t)u;
+    return NULL;
+
+  Lerr:
+    *presult = (dchar_t)s[i];
+    *pidx = i + 1;
+    return msg;
+}
+
author	Tomas Lindquist Olsen <tomas.l.olsen@gmail.com>
date	Tue, 11 Nov 2008 01:38:48 +0100
parents
children