comparison trunk/src/dil/Unicode.d @ 789:c1d5cfd7aa44

Implemented string literal conversion. Removed two MID messages. Added MSG.InvalidUTF8SequenceInString. Added toUTF16() and toUTF32(). Fixed escape sequences. Added formatBytes() and findInvalidUTF8Sequence().
author Aziz Köksal <aziz.koeksal@gmail.com>
date Mon, 25 Feb 2008 02:56:22 +0100
parents 5e3ef1b2011c
children
comparison
equal deleted inserted replaced
788:139c9a6a39a8 789:c1d5cfd7aa44
52 return false; 52 return false;
53 ref_p = p; 53 ref_p = p;
54 return true; 54 return true;
55 } 55 }
56 56
57 /// index is set one past the last trail byte of the valid UTF-8 sequence. 57 /// Decodes a character from str at index.
58 /// Params:
59 /// index = set to one past the ASCII char or one past the last trail byte
60 /// of the valid UTF-8 sequence.
58 dchar decode(char[] str, ref size_t index) 61 dchar decode(char[] str, ref size_t index)
59 in { assert(str.length && index < str.length); } 62 in { assert(str.length && index < str.length); }
60 out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } 63 out { assert(index <= str.length); }
61 body 64 body
62 { 65 {
63 char* p = str.ptr + index; 66 char* p = str.ptr + index;
64 char* end = str.ptr + str.length; 67 char* end = str.ptr + str.length;
65 dchar c = decode(p, end); 68 dchar c = decode(p, end);
66 if (c != ERROR_CHAR) 69 if (c != ERROR_CHAR)
67 index = p - str.ptr + 1; 70 index = p - str.ptr;
68 return c; 71 return c;
69 } 72 }
70 73
71 /// ref_p is set to the last trail byte of the valid UTF-8 sequence. 74 /// Decodes a character starting at ref_p.
75 /// Params:
76 /// ref_p = set to one past the ASCII char or one past the last trail byte
77 /// of the valid UTF-8 sequence.
72 dchar decode(ref char* ref_p, char* end) 78 dchar decode(ref char* ref_p, char* end)
73 in { assert(ref_p && ref_p < end); } 79 in { assert(ref_p && ref_p < end); }
74 out(c) { assert(isValidChar(c) || c == ERROR_CHAR); } 80 out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
75 body 81 body
76 { 82 {
77 char* p = ref_p; 83 char* p = ref_p;
78 dchar c = *p; 84 dchar c = *p;
79 85
80 if (c < 0x80) 86 if (c < 0x80)
81 { 87 return ref_p++, c;
82 ref_p++;
83 return c;
84 }
85 88
86 p++; // Move to second byte. 89 p++; // Move to second byte.
87 if (!(p < end)) 90 if (!(p < end))
88 return ERROR_CHAR; 91 return ERROR_CHAR;
89 92
139 142
140 assert(isTrailByte(*p)); 143 assert(isTrailByte(*p));
141 144
142 if (!isValidChar(c)) 145 if (!isValidChar(c))
143 return ERROR_CHAR; 146 return ERROR_CHAR;
144 ref_p = p; 147 ref_p = p+1;
145 return c; 148 return c;
146 } 149 }
147 150
148 /// Encodes a character and appends it to str. 151 /// Encodes c and appends it to str.
149 void encode(ref char[] str, dchar c) 152 void encode(ref char[] str, dchar c)
150 { 153 {
151 assert(isValidChar(c), "check if character is valid before calling encode()."); 154 assert(isValidChar(c), "check if character is valid before calling encode().");
152 155
153 char[6] b = void; 156 char[6] b = void;
197 +/ 200 +/
198 else 201 else
199 assert(0); 202 assert(0);
200 } 203 }
201 204
202 /// Encodes a character and appends it to str. 205 /// Encodes c and appends it to str.
203 void encode(ref wchar[] str, dchar c) 206 void encode(ref wchar[] str, dchar c)
204 in { assert(isValidChar(c)); } 207 in { assert(isValidChar(c)); }
205 body 208 body
206 { 209 {
207 if (c < 0x10000) 210 if (c < 0x10000)
216 pair[1] = (c & 0x3FF) | 0xDC00; 219 pair[1] = (c & 0x3FF) | 0xDC00;
217 str ~= pair; 220 str ~= pair;
218 } 221 }
219 } 222 }
220 223
221 /// Returns a decoded character from a UTF-16 sequence. 224 /// Decodes a character from a UTF-16 sequence.
222 /// Returns: ERROR_CHAR in case of an error in the sequence.
223 /// Params: 225 /// Params:
224 /// str = the UTF-16 sequence. 226 /// str = the UTF-16 sequence.
225 /// index = where to start from. 227 /// index = where to start from.
228 /// Returns: ERROR_CHAR in case of an error in the sequence.
226 dchar decode(wchar[] str, ref size_t index) 229 dchar decode(wchar[] str, ref size_t index)
227 { 230 {
228 assert(str.length && index < str.length); 231 assert(str.length && index < str.length);
229 dchar c = str[index]; 232 dchar c = str[index];
230 if (0xD800 > c || c > 0xDFFF) 233 if (0xD800 > c || c > 0xDFFF)
246 } 249 }
247 } 250 }
248 return ERROR_CHAR; 251 return ERROR_CHAR;
249 } 252 }
250 253
251 /// Returns a decoded character from a UTF-16 sequence. 254 /// Decodes a character from a UTF-16 sequence.
252 /// Returns: ERROR_CHAR in case of an error in the sequence.
253 /// Params: 255 /// Params:
254 /// p = start of the UTF-16 sequence. 256 /// p = start of the UTF-16 sequence.
255 /// end = one past the end of the sequence. 257 /// end = one past the end of the sequence.
258 /// Returns: ERROR_CHAR in case of an error in the sequence.
256 dchar decode(ref wchar* p, wchar* end) 259 dchar decode(ref wchar* p, wchar* end)
257 { 260 {
258 assert(p && p < end); 261 assert(p && p < end);
259 dchar c = *p; 262 dchar c = *p;
260 if (0xD800 > c || c > 0xDFFF) 263 if (0xD800 > c || c > 0xDFFF)
274 } 277 }
275 } 278 }
276 return ERROR_CHAR; 279 return ERROR_CHAR;
277 } 280 }
278 281
279 /// Decode a character from a zero-terminated string. 282 /// Decodes a character from a zero-terminated UTF-16 string.
283 /// Params:
284 /// p = start of the UTF-16 sequence.
285 /// Returns: ERROR_CHAR in case of an error in the sequence.
280 dchar decode(ref wchar* p) 286 dchar decode(ref wchar* p)
281 { 287 {
282 assert(p); 288 assert(p);
283 dchar c = *p; 289 dchar c = *p;
284 if (0xD800 > c || c > 0xDFFF) 290 if (0xD800 > c || c > 0xDFFF)
297 return c; 303 return c;
298 } 304 }
299 } 305 }
300 return ERROR_CHAR; 306 return ERROR_CHAR;
301 } 307 }
308
309 /// Converts a UTF-8 string to a UTF-16 string.
310 wchar[] toUTF16(char[] str)
311 {
312 wchar[] result;
313 size_t idx;
314 while (idx < str.length)
315 {
316 auto c = decode(str, idx);
317 if (c == ERROR_CHAR)
318 { // Skip trail bytes.
319 while (++idx < str.length && isTrailByte(str[idx]))
320 {}
321 c = REPLACEMENT_CHAR;
322 }
323 encode(result, c);
324 }
325 return result;
326 }
327
328 /// Converts a UTF-8 string to a UTF-32 string.
329 dchar[] toUTF32(char[] str)
330 {
331 dchar[] result;
332 size_t idx;
333 while (idx < str.length)
334 {
335 auto c = decode(str, idx);
336 if (c == ERROR_CHAR)
337 { // Skip trail bytes.
338 while (++idx < str.length && isTrailByte(str[idx]))
339 {}
340 c = REPLACEMENT_CHAR;
341 }
342 result ~= c;
343 }
344 return result;
345 }