1
|
1 // utf.c
|
|
2 // Copyright (c) 2003 by Digital Mars
|
|
3 // All Rights Reserved
|
|
4 // written by Walter Bright
|
|
5 // http://www.digitalmars.com
|
|
6 // License for redistribution is by either the Artistic License
|
|
7 // in artistic.txt, or the GNU General Public License in gnu.txt.
|
|
8 // See the included readme.txt for details.
|
|
9
|
|
10 // Description of UTF-8 at:
|
|
11 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
|
|
12
|
|
13 #include <stdio.h>
|
|
14 #include <assert.h>
|
|
15
|
|
16 #include "utf.h"
|
|
17
|
|
18 int utf_isValidDchar(dchar_t c)
|
|
19 {
|
|
20 return c < 0xD800 ||
|
|
21 (c > 0xDFFF && c <= 0x10FFFF && c != 0xFFFE && c != 0xFFFF);
|
|
22 }
|
|
23
|
|
24 /********************************************
|
|
25 * Decode a single UTF-8 character sequence.
|
|
26 * Returns:
|
|
27 * NULL success
|
|
28 * !=NULL error message string
|
|
29 */
|
|
30
|
|
31 char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult)
|
|
32 {
|
|
33 dchar_t V;
|
|
34 size_t i = *pidx;
|
|
35 unsigned char u = s[i];
|
|
36
|
|
37 assert(i >= 0 && i < len);
|
|
38
|
|
39 if (u & 0x80)
|
|
40 { unsigned n;
|
|
41 unsigned char u2;
|
|
42
|
|
43 /* The following encodings are valid, except for the 5 and 6 byte
|
|
44 * combinations:
|
|
45 * 0xxxxxxx
|
|
46 * 110xxxxx 10xxxxxx
|
|
47 * 1110xxxx 10xxxxxx 10xxxxxx
|
|
48 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
49 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
50 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
51 */
|
|
52 for (n = 1; ; n++)
|
|
53 {
|
|
54 if (n > 4)
|
|
55 goto Lerr; // only do the first 4 of 6 encodings
|
|
56 if (((u << n) & 0x80) == 0)
|
|
57 {
|
|
58 if (n == 1)
|
|
59 goto Lerr;
|
|
60 break;
|
|
61 }
|
|
62 }
|
|
63
|
|
64 // Pick off (7 - n) significant bits of B from first byte of octet
|
|
65 V = (dchar_t)(u & ((1 << (7 - n)) - 1));
|
|
66
|
|
67 if (i + (n - 1) >= len)
|
|
68 goto Lerr; // off end of string
|
|
69
|
|
70 /* The following combinations are overlong, and illegal:
|
|
71 * 1100000x (10xxxxxx)
|
|
72 * 11100000 100xxxxx (10xxxxxx)
|
|
73 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
|
|
74 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
|
|
75 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
76 */
|
|
77 u2 = s[i + 1];
|
|
78 if ((u & 0xFE) == 0xC0 ||
|
|
79 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
|
|
80 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
|
|
81 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
|
|
82 (u == 0xFC && (u2 & 0xFC) == 0x80))
|
|
83 goto Lerr; // overlong combination
|
|
84
|
|
85 for (unsigned j = 1; j != n; j++)
|
|
86 {
|
|
87 u = s[i + j];
|
|
88 if ((u & 0xC0) != 0x80)
|
|
89 goto Lerr; // trailing bytes are 10xxxxxx
|
|
90 V = (V << 6) | (u & 0x3F);
|
|
91 }
|
|
92 if (!utf_isValidDchar(V))
|
|
93 goto Lerr;
|
|
94 i += n;
|
|
95 }
|
|
96 else
|
|
97 {
|
|
98 V = (dchar_t) u;
|
|
99 i++;
|
|
100 }
|
|
101
|
|
102 assert(utf_isValidDchar(V));
|
|
103 *pidx = i;
|
|
104 *presult = V;
|
|
105 return NULL;
|
|
106
|
|
107 Lerr:
|
|
108 *presult = (dchar_t) s[i];
|
|
109 *pidx = i + 1;
|
|
110 return "invalid UTF-8 sequence";
|
|
111 }
|
|
112
|
|
113 /***************************************************
|
|
114 * Validate a UTF-8 string.
|
|
115 * Returns:
|
|
116 * NULL success
|
|
117 * !=NULL error message string
|
|
118 */
|
|
119
|
|
120 char *utf_validateString(unsigned char *s, size_t len)
|
|
121 {
|
|
122 size_t idx;
|
|
123 char *err = NULL;
|
|
124 dchar_t dc;
|
|
125
|
|
126 for (idx = 0; idx < len; )
|
|
127 {
|
|
128 err = utf_decodeChar(s, len, &idx, &dc);
|
|
129 if (err)
|
|
130 break;
|
|
131 }
|
|
132 return err;
|
|
133 }
|
|
134
|
|
135
|
|
136 /********************************************
|
|
137 * Decode a single UTF-16 character sequence.
|
|
138 * Returns:
|
|
139 * NULL success
|
|
140 * !=NULL error message string
|
|
141 */
|
|
142
|
|
143
|
|
144 char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t *presult)
|
|
145 {
|
|
146 char *msg;
|
|
147 size_t i = *pidx;
|
|
148 unsigned u = s[i];
|
|
149
|
|
150 assert(i >= 0 && i < len);
|
|
151 if (u & ~0x7F)
|
|
152 { if (u >= 0xD800 && u <= 0xDBFF)
|
|
153 { unsigned u2;
|
|
154
|
|
155 if (i + 1 == len)
|
|
156 { msg = "surrogate UTF-16 high value past end of string";
|
|
157 goto Lerr;
|
|
158 }
|
|
159 u2 = s[i + 1];
|
|
160 if (u2 < 0xDC00 || u2 > 0xDFFF)
|
|
161 { msg = "surrogate UTF-16 low value out of range";
|
|
162 goto Lerr;
|
|
163 }
|
|
164 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
|
|
165 i += 2;
|
|
166 }
|
|
167 else if (u >= 0xDC00 && u <= 0xDFFF)
|
|
168 { msg = "unpaired surrogate UTF-16 value";
|
|
169 goto Lerr;
|
|
170 }
|
|
171 else if (u == 0xFFFE || u == 0xFFFF)
|
|
172 { msg = "illegal UTF-16 value";
|
|
173 goto Lerr;
|
|
174 }
|
|
175 else
|
|
176 i++;
|
|
177 }
|
|
178 else
|
|
179 {
|
|
180 i++;
|
|
181 }
|
|
182
|
|
183 assert(utf_isValidDchar(u));
|
|
184 *pidx = i;
|
|
185 *presult = (dchar_t)u;
|
|
186 return NULL;
|
|
187
|
|
188 Lerr:
|
|
189 *presult = (dchar_t)s[i];
|
|
190 *pidx = i + 1;
|
|
191 return msg;
|
|
192 }
|
|
193
|