Mercurial > projects > dil
annotate trunk/src/dil/Unicode.d @ 629:d050e211402b
Moved files in src/std/ to src/util/.
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Fri, 11 Jan 2008 20:03:46 +0100 |
parents | 8f86bb9ef715 |
children | ceaac6a24258 |
rev | line source |
---|---|
510 | 1 /++ |
2 Author: Aziz Köksal | |
3 License: GPL3 | |
4 +/ | |
5 module dil.Unicode; | |
629
d050e211402b
Moved files in src/std/ to src/util/.
Aziz Köksal <aziz.koeksal@gmail.com>
parents:
518
diff
changeset
|
6 public import util.uni : isUniAlpha; |
510 | 7 |
8 /// U+FFFD = �. Used to replace invalid Unicode characters. | |
9 const dchar REPLACEMENT_CHAR = '\uFFFD'; | |
10 /// Invalid character, returned on errors. | |
11 const dchar ERROR_CHAR = 0xD800; | |
12 | |
/++
  Tells whether d is a valid Unicode character, i.e. a code point that
  lies outside the surrogate range 0xD800..0xDFFF and is not
  greater than 0x10FFFF.
+/
bool isValidChar(dchar d)
{
  // Everything below the surrogate range is valid.
  if (d < 0xD800)
    return true;
  // Otherwise it must be above the surrogates and within the Unicode range.
  return 0xDFFF < d && d <= 0x10FFFF;
}
21 | |
/++
  Returns true if this is one of the Unicode noncharacters.
  There are a total of 66 noncharacters:
  the 32 code points 0xFDD0..0xFDEF, and the 34 code points
  up to 0x10FFFF whose lower 16 bits are 0xFFFE or 0xFFFF.
  See_also: Chapter 16.7 Noncharacters in Unicode 5.0
+/
bool isNoncharacter(dchar d)
{
  // The contiguous block 0xFDD0..0xFDEF (32 code points).
  if (0xFDD0 <= d && d <= 0xFDEF)
    return true;
  // The last two code points of every 0x10000-sized plane (34 code points).
  return d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE;
}
32 | |
/// Returns true if this is a trail byte of a UTF-8 sequence,
/// i.e. its two most significant bits are 10 (10xx_xxxx).
bool isTrailByte(ubyte b)
{
  return (b >> 6) == 0b10;
}
38 | |
/// Returns true if this is a lead byte of a UTF-8 sequence,
/// i.e. its two most significant bits are both set (11xx_xxxx).
bool isLeadByte(ubyte b)
{
  return (b >> 6) == 0b11;
}
44 | |
/++
  Returns a decoded character from a UTF-8 sequence.
  In case of an error in the sequence ERROR_CHAR is returned
  and index is left unchanged.
  Errors are: index at or past the end of str, a truncated sequence,
  a missing trail byte, an overlong encoding, a 5 or 6 byte sequence
  and a decoded value that is not accepted by isValidChar().
  Params:
    str   = the UTF-8 sequence.
    index = where to start from; advanced past the decoded
            character on success.
+/
dchar decode(char[] str, ref size_t index)
in { assert(str.length); }
// ERROR_CHAR is a legitimate return value, so the contract must allow it.
out(c) { assert(c == ERROR_CHAR || isValidChar(c)); }
body
{
  char* p = str.ptr + index;
  char* end = str.ptr + str.length;

  // Make sure there is a byte to read before dereferencing p.
  if (!(p < end))
    return ERROR_CHAR;

  dchar c = *p;

  if (c < 0x80)
  {
    // Single byte (ASCII).
    ++index;
    return c;
  }

  ++p; // Move to second byte.
  if (!(p < end))
    return ERROR_CHAR;

  // Error if second byte is not a trail byte.
  if (!isTrailByte(*p))
    return ERROR_CHAR;

  // Check for overlong sequences.
  switch (c)
  {
  case 0xE0, // 11100000 100xxxxx
       0xF0, // 11110000 1000xxxx
       0xF8, // 11111000 10000xxx
       0xFC: // 11111100 100000xx
    if ((*p & c) == 0x80)
      return ERROR_CHAR;
    break; // None of these lead bytes can match the 1100000x test below.
  default:
    if ((c & 0xFE) == 0xC0) // 1100000x
      return ERROR_CHAR;
  }

  // Advances to the next byte; fails if the sequence is truncated
  // or if the byte is not a trail byte.
  const char[] checkNextByte = "if (!(++p < end && isTrailByte(*p)))" ~
                               "  return ERROR_CHAR;";
  // Shifts the six payload bits of the current trail byte into c.
  const char[] appendSixBits = "c = (c << 6) | *p & 0b0011_1111;";

  auto next_index = index;
  // Decode
  if ((c & 0b1110_0000) == 0b1100_0000)
  {
    // 110xxxxx 10xxxxxx
    c &= 0b0001_1111;
    mixin(appendSixBits);
    next_index += 2;
  }
  else if ((c & 0b1111_0000) == 0b1110_0000)
  {
    // 1110xxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_1111;
    mixin(appendSixBits ~
          checkNextByte ~ appendSixBits);
    next_index += 3;
  }
  else if ((c & 0b1111_1000) == 0b1111_0000)
  {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_0111;
    mixin(appendSixBits ~
          checkNextByte ~ appendSixBits ~
          checkNextByte ~ appendSixBits);
    next_index += 4;
  }
  else
    // 5 and 6 byte UTF-8 sequences are not allowed yet.
    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    return ERROR_CHAR;

  assert(isTrailByte(*p));

  // Rejects surrogates and values above 0x10FFFF.
  if (!isValidChar(c))
    return ERROR_CHAR;
  index = next_index;
  return c;
}
128 | |
/++
  Encodes a character as UTF-8 and appends it to str.
  The character must be valid (see isValidChar()); this is only
  checked with an assertion.
  Params:
    str = the string to append to.
    c   = the character to encode.
+/
void encode(ref char[] str, dchar c)
{
  assert(isValidChar(c), "check if character is valid before calling encode().");

  char[6] b = void;
  if (c < 0x80)
    str ~= c; // 0xxxxxxx
  // This must be an else-branch; otherwise an ASCII character would
  // additionally be appended as a two byte sequence.
  else if (c < 0x800)
  {
    // 110xxxxx 10xxxxxx
    b[0] = 0xC0 | (c >> 6);
    b[1] = 0x80 | (c & 0x3F);
    str ~= b[0..2];
  }
  else if (c < 0x10000)
  {
    // 1110xxxx 10xxxxxx 10xxxxxx
    b[0] = 0xE0 | (c >> 12);
    b[1] = 0x80 | ((c >> 6) & 0x3F);
    b[2] = 0x80 | (c & 0x3F);
    str ~= b[0..3];
  }
  else if (c < 0x200000)
  {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    b[0] = 0xF0 | (c >> 18);
    b[1] = 0x80 | ((c >> 12) & 0x3F);
    b[2] = 0x80 | ((c >> 6) & 0x3F);
    b[3] = 0x80 | (c & 0x3F);
    str ~= b[0..4];
  }
  /+ // There are no 5 and 6 byte UTF-8 sequences yet.
  else if (c < 0x4000000)
  {
    b[0] = 0xF8 | (c >> 24);
    b[1] = 0x80 | ((c >> 18) & 0x3F);
    b[2] = 0x80 | ((c >> 12) & 0x3F);
    b[3] = 0x80 | ((c >> 6) & 0x3F);
    b[4] = 0x80 | (c & 0x3F);
    str ~= b[0..5];
  }
  else if (c < 0x80000000)
  {
    b[0] = 0xFC | (c >> 30);
    b[1] = 0x80 | ((c >> 24) & 0x3F);
    b[2] = 0x80 | ((c >> 18) & 0x3F);
    b[3] = 0x80 | ((c >> 12) & 0x3F);
    b[4] = 0x80 | ((c >> 6) & 0x3F);
    b[5] = 0x80 | (c & 0x3F);
    str ~= b[0..6];
  }
  +/
  else
    assert(0);
}
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
510
diff
changeset
|
182 |
8f86bb9ef715
Added module dil.Converter and dil.FileBOM.
Aziz K?ksal <aziz.koeksal@gmail.com>
parents:
510
diff
changeset
|
/++
  Encodes a character as UTF-16 and appends it to str.
  Characters below 0x10000 are appended as a single code unit,
  all others as a surrogate pair.
+/
void encode(ref wchar[] str, dchar c)
in { assert(isValidChar(c)); }
body
{
  if (c >= 0x10000)
  {
    // Encode with surrogate pair.
    uint rest = c - 0x10000; // c'
    wchar[2] pair = void;
    // 0b1101_10xx_xxxx_xxxx | higher10bits(c')
    pair[0] = cast(wchar)(0xD800 | (rest >> 10));
    // 0b1101_11yy_yyyy_yyyy | lower10bits(c')
    pair[1] = cast(wchar)(0xDC00 | (rest & 0x3FF));
    str ~= pair;
    return;
  }
  str ~= cast(wchar)c;
}
202 | |
/++
  Decodes one character from a UTF-16 sequence.
  ERROR_CHAR (0xD800) is returned if the sequence is malformed;
  index is not changed in that case.
  Params:
    str   = the UTF-16 sequence.
    index = where to start from; advanced by one or two
            code units on success.
+/
dchar decode(wchar[] str, ref size_t index)
{
  assert(str.length && index < str.length);
  dchar c = str[index];
  bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
  if (!isSurrogate)
  {
    index += 1;
    return c;
  }
  // A low surrogate can't start a pair, and a high surrogate
  // needs a following code unit.
  if (c > 0xDBFF || index+1 == str.length)
    return ERROR_CHAR;
  wchar low = str[index+1];
  if (low < 0xDC00 || 0xDFFF < low)
    return ERROR_CHAR;
  index += 2;
  // (c - 0xD800) << 10 + 0x10000 ->
  // (c - 0xD800 + 0x40) << 10 ->
  // (c - 0xD7C0) << 10
  return cast(dchar)(((c - 0xD7C0) << 10) | (low & 0x3FF));
}
234 | |
/++
  Decodes one character from a UTF-16 sequence given by a pointer range.
  ERROR_CHAR (0xD800) is returned if the sequence is malformed;
  p is not changed in that case.
  Params:
    p   = start of the UTF-16 sequence; advanced by one or two
          code units on success.
    end = one past the end of the sequence.
+/
dchar decode(ref wchar* p, wchar* end)
{
  assert(p && p < end);
  dchar c = *p;
  bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
  if (!isSurrogate)
  {
    p += 1;
    return c;
  }
  // A low surrogate can't start a pair, and a high surrogate
  // needs a following code unit.
  if (c > 0xDBFF || p+1 == end)
    return ERROR_CHAR;
  wchar low = p[1];
  if (low < 0xDC00 || 0xDFFF < low)
    return ERROR_CHAR;
  p += 2;
  // Same as ((c - 0xD800) << 10) + 0x10000, combined with the low ten bits.
  return cast(dchar)(((c - 0xD7C0) << 10) | (low & 0x3FF));
}
264 | |
/++
  Decodes a character from a zero-terminated UTF-16 string.
  ERROR_CHAR is returned if the sequence is malformed;
  p is not changed in that case.
  Params:
    p = start of the UTF-16 sequence; advanced by one or two
        code units on success.
+/
dchar decode(ref wchar* p)
{
  assert(p);
  dchar c = *p;
  bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
  if (!isSurrogate)
  {
    p += 1;
    return c;
  }
  // A low surrogate can't start a pair.
  if (c > 0xDBFF)
    return ERROR_CHAR;
  // c is a high surrogate and thus not the terminator, so p[1] can be read.
  wchar low = p[1];
  if (low < 0xDC00 || 0xDFFF < low)
    return ERROR_CHAR;
  p += 2;
  // Same as ((c - 0xD800) << 10) + 0x10000, combined with the low ten bits.
  return cast(dchar)(((c - 0xD7C0) << 10) | (low & 0x3FF));
}