Mercurial > projects > dil
comparison trunk/src/dil/Unicode.d @ 789:c1d5cfd7aa44
Implemented string literal conversion.
Removed two MID messages.
Added MSG.InvalidUTF8SequenceInString.
Added toUTF16() and toUTF32().
Fixed escape sequences.
Added formatBytes() and findInvalidUTF8Sequence().
author | Aziz Köksal <aziz.koeksal@gmail.com> |
---|---|
date | Mon, 25 Feb 2008 02:56:22 +0100 |
parents | 5e3ef1b2011c |
children |
comparison
equal
deleted
inserted
replaced
788:139c9a6a39a8 | 789:c1d5cfd7aa44 |
---|---|
52 return false; | 52 return false; |
53 ref_p = p; | 53 ref_p = p; |
54 return true; | 54 return true; |
55 } | 55 } |
56 | 56 |
57 /// index is set one past the last trail byte of the valid UTF-8 sequence. | 57 /// Decodes a character from str at index. |
58 /// Params: | |
59 /// index = set to one past the ASCII char or one past the last trail byte | |
60 /// of the valid UTF-8 sequence. | |
58 dchar decode(char[] str, ref size_t index) | 61 dchar decode(char[] str, ref size_t index) |
59 in { assert(str.length && index < str.length); } | 62 in { assert(str.length && index < str.length); } |
60 out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } | 63 out { assert(index <= str.length); } |
61 body | 64 body |
62 { | 65 { |
63 char* p = str.ptr + index; | 66 char* p = str.ptr + index; |
64 char* end = str.ptr + str.length; | 67 char* end = str.ptr + str.length; |
65 dchar c = decode(p, end); | 68 dchar c = decode(p, end); |
66 if (c != ERROR_CHAR) | 69 if (c != ERROR_CHAR) |
67 index = p - str.ptr + 1; | 70 index = p - str.ptr; |
68 return c; | 71 return c; |
69 } | 72 } |
70 | 73 |
71 /// ref_p is set to the last trail byte of the valid UTF-8 sequence. | 74 /// Decodes a character starting at ref_p. |
75 /// Params: | |
76 /// ref_p = set to one past the ASCII char or one past the last trail byte | |
77 /// of the valid UTF-8 sequence. | |
72 dchar decode(ref char* ref_p, char* end) | 78 dchar decode(ref char* ref_p, char* end) |
73 in { assert(ref_p && ref_p < end); } | 79 in { assert(ref_p && ref_p < end); } |
74 out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } | 80 out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); } |
75 body | 81 body |
76 { | 82 { |
77 char* p = ref_p; | 83 char* p = ref_p; |
78 dchar c = *p; | 84 dchar c = *p; |
79 | 85 |
80 if (c < 0x80) | 86 if (c < 0x80) |
81 { | 87 return ref_p++, c; |
82 ref_p++; | |
83 return c; | |
84 } | |
85 | 88 |
86 p++; // Move to second byte. | 89 p++; // Move to second byte. |
87 if (!(p < end)) | 90 if (!(p < end)) |
88 return ERROR_CHAR; | 91 return ERROR_CHAR; |
89 | 92 |
139 | 142 |
140 assert(isTrailByte(*p)); | 143 assert(isTrailByte(*p)); |
141 | 144 |
142 if (!isValidChar(c)) | 145 if (!isValidChar(c)) |
143 return ERROR_CHAR; | 146 return ERROR_CHAR; |
144 ref_p = p; | 147 ref_p = p+1; |
145 return c; | 148 return c; |
146 } | 149 } |
147 | 150 |
148 /// Encodes a character and appends it to str. | 151 /// Encodes c and appends it to str. |
149 void encode(ref char[] str, dchar c) | 152 void encode(ref char[] str, dchar c) |
150 { | 153 { |
151 assert(isValidChar(c), "check if character is valid before calling encode()."); | 154 assert(isValidChar(c), "check if character is valid before calling encode()."); |
152 | 155 |
153 char[6] b = void; | 156 char[6] b = void; |
197 +/ | 200 +/ |
198 else | 201 else |
199 assert(0); | 202 assert(0); |
200 } | 203 } |
201 | 204 |
202 /// Encodes a character and appends it to str. | 205 /// Encodes c and appends it to str. |
203 void encode(ref wchar[] str, dchar c) | 206 void encode(ref wchar[] str, dchar c) |
204 in { assert(isValidChar(c)); } | 207 in { assert(isValidChar(c)); } |
205 body | 208 body |
206 { | 209 { |
207 if (c < 0x10000) | 210 if (c < 0x10000) |
216 pair[1] = (c & 0x3FF) | 0xDC00; | 219 pair[1] = (c & 0x3FF) | 0xDC00; |
217 str ~= pair; | 220 str ~= pair; |
218 } | 221 } |
219 } | 222 } |
220 | 223 |
221 /// Returns a decoded character from a UTF-16 sequence. | 224 /// Decodes a character from a UTF-16 sequence. |
222 /// Returns: ERROR_CHAR in case of an error in the sequence. | |
223 /// Params: | 225 /// Params: |
224 /// str = the UTF-16 sequence. | 226 /// str = the UTF-16 sequence. |
225 /// index = where to start from. | 227 /// index = where to start from. |
228 /// Returns: ERROR_CHAR in case of an error in the sequence. | |
226 dchar decode(wchar[] str, ref size_t index) | 229 dchar decode(wchar[] str, ref size_t index) |
227 { | 230 { |
228 assert(str.length && index < str.length); | 231 assert(str.length && index < str.length); |
229 dchar c = str[index]; | 232 dchar c = str[index]; |
230 if (0xD800 > c || c > 0xDFFF) | 233 if (0xD800 > c || c > 0xDFFF) |
246 } | 249 } |
247 } | 250 } |
248 return ERROR_CHAR; | 251 return ERROR_CHAR; |
249 } | 252 } |
250 | 253 |
251 /// Returns a decoded character from a UTF-16 sequence. | 254 /// Decodes a character from a UTF-16 sequence. |
252 /// Returns: ERROR_CHAR in case of an error in the sequence. | |
253 /// Params: | 255 /// Params: |
254 /// p = start of the UTF-16 sequence. | 256 /// p = start of the UTF-16 sequence. |
255 /// end = one past the end of the sequence. | 257 /// end = one past the end of the sequence. |
258 /// Returns: ERROR_CHAR in case of an error in the sequence. | |
256 dchar decode(ref wchar* p, wchar* end) | 259 dchar decode(ref wchar* p, wchar* end) |
257 { | 260 { |
258 assert(p && p < end); | 261 assert(p && p < end); |
259 dchar c = *p; | 262 dchar c = *p; |
260 if (0xD800 > c || c > 0xDFFF) | 263 if (0xD800 > c || c > 0xDFFF) |
274 } | 277 } |
275 } | 278 } |
276 return ERROR_CHAR; | 279 return ERROR_CHAR; |
277 } | 280 } |
278 | 281 |
279 /// Decode a character from a zero-terminated string. | 282 /// Decodes a character from a zero-terminated UTF-16 string. |
283 /// Params: | |
284 /// p = start of the UTF-16 sequence. | |
285 /// Returns: ERROR_CHAR in case of an error in the sequence. | |
280 dchar decode(ref wchar* p) | 286 dchar decode(ref wchar* p) |
281 { | 287 { |
282 assert(p); | 288 assert(p); |
283 dchar c = *p; | 289 dchar c = *p; |
284 if (0xD800 > c || c > 0xDFFF) | 290 if (0xD800 > c || c > 0xDFFF) |
297 return c; | 303 return c; |
298 } | 304 } |
299 } | 305 } |
300 return ERROR_CHAR; | 306 return ERROR_CHAR; |
301 } | 307 } |
308 | |
309 /// Converts a UTF-8 string to a UTF-16 string. | |
310 wchar[] toUTF16(char[] str) | |
311 { | |
312 wchar[] result; | |
313 size_t idx; | |
314 while (idx < str.length) | |
315 { | |
316 auto c = decode(str, idx); | |
317 if (c == ERROR_CHAR) | |
318 { // Skip trail bytes. | |
319 while (++idx < str.length && isTrailByte(str[idx])) | |
320 {} | |
321 c = REPLACEMENT_CHAR; | |
322 } | |
323 encode(result, c); | |
324 } | |
325 return result; | |
326 } | |
327 | |
328 /// Converts a UTF-8 string to a UTF-32 string. | |
329 dchar[] toUTF32(char[] str) | |
330 { | |
331 dchar[] result; | |
332 size_t idx; | |
333 while (idx < str.length) | |
334 { | |
335 auto c = decode(str, idx); | |
336 if (c == ERROR_CHAR) | |
337 { // Skip trail bytes. | |
338 while (++idx < str.length && isTrailByte(str[idx])) | |
339 {} | |
340 c = REPLACEMENT_CHAR; | |
341 } | |
342 result ~= c; | |
343 } | |
344 return result; | |
345 } |