Mercurial > projects > ldc
comparison dmd2/utf.c @ 758:f04dde6e882c
Added initial D2 support, D2 frontend and changes to codegen to make things compile.
author | Tomas Lindquist Olsen <tomas.l.olsen@gmail.com> |
---|---|
date | Tue, 11 Nov 2008 01:38:48 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
757:2c730d530c98 | 758:f04dde6e882c |
---|---|
1 // utf.c | |
2 // Copyright (c) 2003 by Digital Mars | |
3 // All Rights Reserved | |
4 // written by Walter Bright | |
5 // http://www.digitalmars.com | |
6 // License for redistribution is by either the Artistic License | |
7 // in artistic.txt, or the GNU General Public License in gnu.txt. | |
8 // See the included readme.txt for details. | |
9 | |
10 // Description of UTF-8 at: | |
11 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 | |
12 | |
13 #include <stdio.h> | |
14 #include <assert.h> | |
15 | |
16 #include "utf.h" | |
17 | |
18 int utf_isValidDchar(dchar_t c) | |
19 { | |
20 return c < 0xD800 || | |
21 (c > 0xDFFF && c <= 0x10FFFF && c != 0xFFFE && c != 0xFFFF); | |
22 } | |
23 | |
24 /******************************************** | |
25 * Decode a single UTF-8 character sequence. | |
26 * Returns: | |
27 * NULL success | |
28 * !=NULL error message string | |
29 */ | |
30 | |
31 const char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult) | |
32 { | |
33 dchar_t V; | |
34 size_t i = *pidx; | |
35 unsigned char u = s[i]; | |
36 | |
37 assert(i >= 0 && i < len); | |
38 | |
39 if (u & 0x80) | |
40 { unsigned n; | |
41 unsigned char u2; | |
42 | |
43 /* The following encodings are valid, except for the 5 and 6 byte | |
44 * combinations: | |
45 * 0xxxxxxx | |
46 * 110xxxxx 10xxxxxx | |
47 * 1110xxxx 10xxxxxx 10xxxxxx | |
48 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | |
49 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | |
50 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | |
51 */ | |
52 for (n = 1; ; n++) | |
53 { | |
54 if (n > 4) | |
55 goto Lerr; // only do the first 4 of 6 encodings | |
56 if (((u << n) & 0x80) == 0) | |
57 { | |
58 if (n == 1) | |
59 goto Lerr; | |
60 break; | |
61 } | |
62 } | |
63 | |
64 // Pick off (7 - n) significant bits of B from first byte of octet | |
65 V = (dchar_t)(u & ((1 << (7 - n)) - 1)); | |
66 | |
67 if (i + (n - 1) >= len) | |
68 goto Lerr; // off end of string | |
69 | |
70 /* The following combinations are overlong, and illegal: | |
71 * 1100000x (10xxxxxx) | |
72 * 11100000 100xxxxx (10xxxxxx) | |
73 * 11110000 1000xxxx (10xxxxxx 10xxxxxx) | |
74 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) | |
75 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) | |
76 */ | |
77 u2 = s[i + 1]; | |
78 if ((u & 0xFE) == 0xC0 || | |
79 (u == 0xE0 && (u2 & 0xE0) == 0x80) || | |
80 (u == 0xF0 && (u2 & 0xF0) == 0x80) || | |
81 (u == 0xF8 && (u2 & 0xF8) == 0x80) || | |
82 (u == 0xFC && (u2 & 0xFC) == 0x80)) | |
83 goto Lerr; // overlong combination | |
84 | |
85 for (unsigned j = 1; j != n; j++) | |
86 { | |
87 u = s[i + j]; | |
88 if ((u & 0xC0) != 0x80) | |
89 goto Lerr; // trailing bytes are 10xxxxxx | |
90 V = (V << 6) | (u & 0x3F); | |
91 } | |
92 if (!utf_isValidDchar(V)) | |
93 goto Lerr; | |
94 i += n; | |
95 } | |
96 else | |
97 { | |
98 V = (dchar_t) u; | |
99 i++; | |
100 } | |
101 | |
102 assert(utf_isValidDchar(V)); | |
103 *pidx = i; | |
104 *presult = V; | |
105 return NULL; | |
106 | |
107 Lerr: | |
108 *presult = (dchar_t) s[i]; | |
109 *pidx = i + 1; | |
110 return "invalid UTF-8 sequence"; | |
111 } | |
112 | |
113 /*************************************************** | |
114 * Validate a UTF-8 string. | |
115 * Returns: | |
116 * NULL success | |
117 * !=NULL error message string | |
118 */ | |
119 | |
120 const char *utf_validateString(unsigned char *s, size_t len) | |
121 { | |
122 size_t idx; | |
123 const char *err = NULL; | |
124 dchar_t dc; | |
125 | |
126 for (idx = 0; idx < len; ) | |
127 { | |
128 err = utf_decodeChar(s, len, &idx, &dc); | |
129 if (err) | |
130 break; | |
131 } | |
132 return err; | |
133 } | |
134 | |
135 | |
136 /******************************************** | |
137 * Decode a single UTF-16 character sequence. | |
138 * Returns: | |
139 * NULL success | |
140 * !=NULL error message string | |
141 */ | |
142 | |
143 | |
144 const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t *presult) | |
145 { | |
146 const char *msg; | |
147 size_t i = *pidx; | |
148 unsigned u = s[i]; | |
149 | |
150 assert(i >= 0 && i < len); | |
151 if (u & ~0x7F) | |
152 { if (u >= 0xD800 && u <= 0xDBFF) | |
153 { unsigned u2; | |
154 | |
155 if (i + 1 == len) | |
156 { msg = "surrogate UTF-16 high value past end of string"; | |
157 goto Lerr; | |
158 } | |
159 u2 = s[i + 1]; | |
160 if (u2 < 0xDC00 || u2 > 0xDFFF) | |
161 { msg = "surrogate UTF-16 low value out of range"; | |
162 goto Lerr; | |
163 } | |
164 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); | |
165 i += 2; | |
166 } | |
167 else if (u >= 0xDC00 && u <= 0xDFFF) | |
168 { msg = "unpaired surrogate UTF-16 value"; | |
169 goto Lerr; | |
170 } | |
171 else if (u == 0xFFFE || u == 0xFFFF) | |
172 { msg = "illegal UTF-16 value"; | |
173 goto Lerr; | |
174 } | |
175 else | |
176 i++; | |
177 } | |
178 else | |
179 { | |
180 i++; | |
181 } | |
182 | |
183 assert(utf_isValidDchar(u)); | |
184 *pidx = i; | |
185 *presult = (dchar_t)u; | |
186 return NULL; | |
187 | |
188 Lerr: | |
189 *presult = (dchar_t)s[i]; | |
190 *pidx = i + 1; | |
191 return msg; | |
192 } | |
193 |