comparison trunk/src/dil/Converter.d @ 739:49fe21aa387c

Added sanitizeText() to dil.Converter. Cleaned predefined.ddoc up a bit. Removed makeString() from dil.doc.Macro. Added REPLACEMENT_STR to dil.Unicode.
author Aziz Köksal <aziz.koeksal@gmail.com>
date Sat, 09 Feb 2008 14:24:35 +0100
parents 9e811db780a6
children 90668b83ae5e
comparison
equal deleted inserted replaced
738:2afcc305831a 739:49fe21aa387c
235 assert(0); 235 assert(0);
236 } 236 }
237 return text; 237 return text;
238 } 238 }
239 } 239 }
240
/// Sanitizes text in place: replaces invalid UTF-8 sequences with U+FFFD
/// and converts newlines to '\n'.
///
/// "\r", "\r\n" and the three-byte sequences accepted by isUnicodeNewline()
/// are each written as a single '\n'.
/// An invalid sequence is replaced by the three code units of REPLACEMENT_STR
/// only if they fit into the bytes consumed so far; otherwise the sequence is
/// dropped, because the output is written over the input and never grows.
///
/// Params:
///   text = the string to sanitize; its contents are overwritten.
/// Returns: text, shortened to the sanitized length, or null if text is empty.
string sanitizeText(string text)
{
  if (!text.length)
    return null;

  char* p = text.ptr; // Reader.
  char* end = p + text.length;
  char* q = p; // Writer; advanced only when a code unit is actually written.

  for (; p < end; p++)
  {
    assert(q <= p);
    switch (*p)
    {
    case '\r':
      if (p+1 < end && p[1] == '\n')
        p++; // Treat "\r\n" as a single newline.
      // Fall through.
    case '\n':
      *q++ = '\n';
      break;
    default:
      if (isascii(*p))
      {
        *q++ = *p; // Copy ASCII characters unchanged.
        break;
      }
      if (p+2 < end && isUnicodeNewline(p))
      {
        p += 2; // Move to the last code unit of the newline sequence.
        goto case '\n';
      }
      auto p2 = p; // Beginning of the UTF-8 sequence.
      dchar c = decode(p, end);
      if (c == ERROR_CHAR)
      { // Skip ahead for as long as isTrailByte() holds.
        // NOTE(review): where decode() leaves p on error is not visible
        // here - confirm that no valid data can be skipped.
        while (++p < end && isTrailByte(*p))
        {}
        alias REPLACEMENT_STR R;
        if (q+2 < p) // Copy replacement char if there is enough space.
        {
          *q++ = R[0];
          *q++ = R[1];
          *q++ = R[2];
        }
        // Otherwise the sequence is dropped: q stays where it is, so no
        // stale or invalid byte is left behind in the output.
        p--; // The loop increment steps onto the first unskipped byte.
      }
      else
      { // Copy the valid UTF-8 sequence.
        while (p2 <= p) // p points to the last trail byte.
          *q++ = *p2++; // Copy code units.
      }
      break;
    }
  }
  assert(p == end);
  // Cut off the bytes that were consumed but not written.
  text.length = text.length - (p - q);
  return text;
}