comparison src/dil/Unicode.d @ 806:bcb74c9b895c

Moved out files in the trunk folder to the root.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sun, 09 Mar 2008 00:12:19 +0100
parents trunk/src/dil/Unicode.d@c1d5cfd7aa44
children
comparison
equal deleted inserted replaced
805:a3fab8b74a7d 806:bcb74c9b895c
1 /++
2 Author: Aziz Köksal
3 License: GPL3
4 +/
5 module dil.Unicode;
6 public import util.uni : isUniAlpha;
7
/// U+FFFD = �. Used to replace invalid Unicode characters.
const dchar REPLACEMENT_CHAR = '\uFFFD';
/// Ditto. The same character, stored as a 3-element char array.
const char[3] REPLACEMENT_STR = \uFFFD; /// Ditto
/// Invalid character, returned on errors.
/// The value 0xD800 lies in the range that isValidChar() rejects.
const dchar ERROR_CHAR = 0xD800;
13
/// Checks whether d may be used as a character value.
/// Returns: true if d is outside the surrogate range 0xD800..0xDFFF
/// and does not exceed 0x10FFFF.
bool isValidChar(dchar d)
{
  // Anything beyond the last code point is rejected outright.
  if (d > 0x10FFFF)
    return false;
  // What remains is valid unless it is a surrogate code point.
  return !(0xD800 <= d && d <= 0xDFFF);
}
20
/// Tests d against the 66 code points designated as noncharacters.
/// Returns: true if d is one of them.
/// See_also: Chapter 16.7 Noncharacters in Unicode 5.0
bool isNoncharacter(dchar d)
{
  // The contiguous range 0xFDD0..0xFDEF accounts for 32 of them.
  if (d >= 0xFDD0 && d <= 0xFDEF)
    return true;
  // The remaining 34 are the values ending in 0xFFFE or 0xFFFF,
  // up to and including 0x10FFFF.
  if (d > 0x10FFFF)
    return false;
  return (d & 0xFFFF) >= 0xFFFE;
}
29
/// Returns: true if b has the bit pattern 10xx_xxxx, i.e. it is a
/// trail byte of a UTF-8 sequence.
bool isTrailByte(ubyte b)
{
  // Only the two most significant bits matter.
  return (b >> 6) == 0b10;
}
35
/// Returns: true if b has the bit pattern 11xx_xxxx, i.e. it is a
/// lead byte of a UTF-8 sequence.
bool isLeadByte(ubyte b)
{
  // Both top bits set is the same as being at least 0xC0 for a ubyte.
  return b >= 0xC0;
}
41
/// Tests whether a non-ASCII Unicode alpha character starts at ref_p.
/// Params:
///   ref_p = advanced past the character only if it decodes to a
///           Unicode alpha character; otherwise left untouched.
///   end   = one past the last readable byte.
/// Returns: true if ref_p was advanced. Bytes below 0x80 always yield false.
bool isUnicodeAlpha(ref char* ref_p, char* end)
in { assert(ref_p && ref_p < end); }
body
{
  if (*ref_p >= 0x80)
  {
    // Decode through a copy so that ref_p is only committed on success.
    auto cursor = ref_p;
    auto decoded = decode(cursor, end);
    if (isUniAlpha(decoded))
    {
      ref_p = cursor;
      return true;
    }
  }
  return false;
}
56
/// Decodes the UTF-8 character in str that starts at index.
/// Params:
///   str   = the UTF-8 text; must not be empty.
///   index = on success, set to one past the ASCII char or one past the
///           last trail byte of the valid UTF-8 sequence; unchanged when
///           ERROR_CHAR is returned.
/// Returns: the decoded character or ERROR_CHAR.
dchar decode(char[] str, ref size_t index)
in { assert(str.length && index < str.length); }
out { assert(index <= str.length); }
body
{
  char* start = str.ptr;
  char* cursor = start + index;
  dchar result = decode(cursor, start + str.length);
  if (result == ERROR_CHAR)
    return result; // Leave index where it was.
  index = cursor - start;
  return result;
}
73
/// Decodes one UTF-8 encoded character starting at ref_p.
/// Params:
///   ref_p = on success, set to one past the ASCII char or one past the
///           last trail byte of the valid UTF-8 sequence; left unchanged
///           when ERROR_CHAR is returned.
///   end   = one past the last readable byte.
/// Returns: the decoded character, or ERROR_CHAR if the sequence is cut
///          off, has a missing trail byte, is overlong, is longer than
///          4 bytes or does not yield a valid character.
dchar decode(ref char* ref_p, char* end)
in { assert(ref_p && ref_p < end); }
out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
body
{
  char* cur = ref_p;
  dchar cp = *cur;

  if (cp < 0x80)
  { // A single ASCII byte.
    ++ref_p;
    return cp;
  }

  // Every remaining form needs a second byte, and it must be a trail byte.
  ++cur;
  if (!(cur < end) || !isTrailByte(*cur))
    return ERROR_CHAR;

  // Reject overlong sequences.
  if ((cp & 0xFE) == 0xC0) // 1100000x
    return ERROR_CHAR;
  if (cp == 0xE0 || // 11100000 100xxxxx
      cp == 0xF0 || // 11110000 1000xxxx
      cp == 0xF8 || // 11111000 10000xxx
      cp == 0xFC)   // 11111100 100000xx
  {
    if ((*cur & cp) == 0x80)
      return ERROR_CHAR;
  }

  // Strip the length marker from the lead byte and note how many trail
  // bytes follow the second byte.
  uint pending;
  if ((cp & 0b1110_0000) == 0b1100_0000)
  { // 110xxxxx 10xxxxxx
    cp &= 0b0001_1111;
    pending = 0;
  }
  else if ((cp & 0b1111_0000) == 0b1110_0000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    cp &= 0b0000_1111;
    pending = 1;
  }
  else if ((cp & 0b1111_1000) == 0b1111_0000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    cp &= 0b0000_0111;
    pending = 2;
  }
  else
    // 5 and 6 byte UTF-8 sequences are not allowed yet.
    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    return ERROR_CHAR;

  // Append the six payload bits of the second byte ...
  cp = (cp << 6) | *cur & 0b0011_1111;
  // ... and of each further trail byte, verifying it first.
  for (; pending != 0; --pending)
  {
    if (!(++cur < end && isTrailByte(*cur)))
      return ERROR_CHAR;
    cp = (cp << 6) | *cur & 0b0011_1111;
  }

  assert(isTrailByte(*cur));

  if (isValidChar(cp))
  {
    ref_p = cur + 1;
    return cp;
  }
  return ERROR_CHAR;
}
150
/// Appends the UTF-8 encoding of c to str.
/// Params:
///   str = the array that receives the encoded bytes.
///   c   = the character to encode; must satisfy isValidChar().
void encode(ref char[] str, dchar c)
{
  assert(isValidChar(c), "check if character is valid before calling encode().");

  if (c < 0x80)
  { // ASCII needs no transformation.
    str ~= c;
    return;
  }

  char[6] buf = void;
  size_t len;
  // Pick the sequence length and write the lead byte.
  if (c < 0x800)
  {
    buf[0] = 0xC0 | (c >> 6);
    len = 2;
  }
  else if (c < 0x10000)
  {
    buf[0] = 0xE0 | (c >> 12);
    len = 3;
  }
  else if (c < 0x200000)
  {
    buf[0] = 0xF0 | (c >> 18);
    len = 4;
  }
  else
    // There are no 5 and 6 byte UTF-8 sequences yet.
    assert(0);

  // Fill in the trail bytes from last to first, six bits at a time.
  for (size_t i = len - 1; i > 0; --i)
  {
    buf[i] = 0x80 | (c & 0x3F);
    c >>= 6;
  }
  str ~= buf[0..len];
}
204
/// Appends the UTF-16 encoding of c to str.
/// Params:
///   str = the array that receives one or two code units.
///   c   = the character to encode; must satisfy isValidChar().
void encode(ref wchar[] str, dchar c)
in { assert(isValidChar(c)); }
body
{
  if (c >= 0x10000)
  { // Needs a surrogate pair.
    c -= 0x10000; // c'
    wchar[2] surrogates = void;
    // higher10bits(c') | 0b1101_10xx_xxxx_xxxx
    surrogates[0] = (c >> 10) | 0xD800;
    // lower10bits(c') | 0b1101_11yy_yyyy_yyyy
    surrogates[1] = (c & 0x3FF) | 0xDC00;
    str ~= surrogates;
    return;
  }
  // Fits into a single code unit.
  str ~= cast(wchar)c;
}
223
/// Decodes a character from a UTF-16 sequence.
/// Params:
///   str = the UTF-16 sequence; must not be empty.
///   index = where to start from. Advanced by one or two code units on
///           success and left unchanged on error.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(wchar[] str, ref size_t index)
{
  assert(str.length && index < str.length);
  // Hand the work to the pointer based overload, the same way the UTF-8
  // array overload does. It only moves the cursor on success.
  wchar* start = str.ptr;
  wchar* cursor = start + index;
  dchar result = decode(cursor, start + str.length);
  index = cursor - start;
  return result;
}
253
/// Decodes a character from a UTF-16 sequence.
/// Params:
///   p = start of the UTF-16 sequence. Advanced by one or two code units
///       on success and left unchanged on error.
///   end = one past the end of the sequence.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(ref wchar* p, wchar* end)
{
  assert(p && p < end);
  dchar unit = *p;
  bool isSurrogate = 0xD800 <= unit && unit <= 0xDFFF;
  if (!isSurrogate)
  { // An ordinary code unit stands for itself.
    p++;
    return unit;
  }
  // A pair has to begin with a value in 0xD800..0xDBFF and must be
  // followed by another code unit before end.
  if (unit > 0xDBFF || p+1 == end)
    return ERROR_CHAR;
  wchar trail = p[1];
  if (trail < 0xDC00 || trail > 0xDFFF)
    return ERROR_CHAR;
  // (unit - 0xD800) << 10 + 0x10000 ->
  // (unit - 0xD800 + 0x40) << 10 ->
  unit = (unit - 0xD7C0) << 10;
  unit |= (trail & 0x3FF);
  p += 2;
  return unit;
}
281
/// Decodes a character from a zero-terminated UTF-16 string.
/// Params:
///   p = start of the UTF-16 sequence. Advanced by one or two code units
///       on success and left unchanged on error.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(ref wchar* p)
{
  assert(p);
  dchar unit = *p;
  bool isSurrogate = 0xD800 <= unit && unit <= 0xDFFF;
  if (!isSurrogate)
  { // An ordinary code unit stands for itself.
    p++;
    return unit;
  }
  // A pair has to begin with a value in 0xD800..0xDBFF.
  if (unit > 0xDBFF)
    return ERROR_CHAR;
  // The following code unit is read without a bounds check, as in the
  // zero-terminated contract stated above.
  wchar trail = p[1];
  if (trail < 0xDC00 || trail > 0xDFFF)
    return ERROR_CHAR;
  unit = (unit - 0xD7C0) << 10;
  unit |= (trail & 0x3FF);
  p += 2;
  return unit;
}
308
/// Converts a UTF-8 string to a UTF-16 string.
/// Every sequence that fails to decode is replaced by REPLACEMENT_CHAR.
wchar[] toUTF16(char[] str)
{
  wchar[] result;
  for (size_t idx = 0; idx < str.length; )
  {
    dchar c = decode(str, idx);
    if (c == ERROR_CHAR)
    {
      // decode() did not move idx. Step over the offending byte and
      // any trail bytes directly following it.
      do
      {
        ++idx;
      } while (idx < str.length && isTrailByte(str[idx]));
      c = REPLACEMENT_CHAR;
    }
    encode(result, c);
  }
  return result;
}
327
/// Converts a UTF-8 string to a UTF-32 string.
/// Every sequence that fails to decode is replaced by REPLACEMENT_CHAR.
dchar[] toUTF32(char[] str)
{
  dchar[] result;
  for (size_t idx = 0; idx < str.length; )
  {
    dchar c = decode(str, idx);
    if (c == ERROR_CHAR)
    {
      // decode() did not move idx. Step over the offending byte and
      // any trail bytes directly following it.
      do
      {
        ++idx;
      } while (idx < str.length && isTrailByte(str[idx]));
      c = REPLACEMENT_CHAR;
    }
    result ~= c;
  }
  return result;
}
345 }