Mercurial > projects > ldc
comparison tango/tango/text/Unicode.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!!
Initial commit after moving to Tango instead of Phobos.
Lots of bugfixes...
This build is not suitable for most things.
author | lindquist |
---|---|
date | Fri, 11 Jan 2008 17:57:40 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
131:5825d48b27d1 | 132:1700239cab2e |
---|---|
1 /******************************************************************************* | |
2 | |
3 copyright: Copyright (c) 2007 Peter Triller. All rights reserved | |
4 | |
5 license: BSD style: $(LICENSE) | |
6 | |
7 version: Initial release: Sept 2007 | |
8 | |
9 authors: Peter | |
10 | |
11 Provides case mapping Functions for Unicode Strings. As of now it is | |
12 only 99 % complete, because it does not take into account Conditional | |
13 case mappings. This means the Greek Letter Sigma will not be correctly | |
14 case mapped at the end of a Word, and the Locales Lithuanian, Turkish | |
15 and Azeri are not taken into account during Case Mappings. This means | |
16 all in all around 12 Characters will not be mapped correctly under | |
17 some circumstances. | |
18 | |
19 ICU4j also does not handle these cases at the moment. | |
20 | |
21 Unittests are written against output from ICU4j | |
22 | |
23 This Module tries to minimize Memory allocation and usage. You can | |
24 always pass the output buffer that should be used to the case mapping | |
25 function, which will be resized if necessary. | |
26 | |
27 *******************************************************************************/ | |
28 | |
29 module tango.text.Unicode; | |
30 | |
31 private import tango.text.UnicodeData; | |
32 private import tango.text.convert.Utf; | |
33 | |
34 | |
35 | |
/**
 * Converts an Utf8 String to Upper case.
 *
 * The whole result is first collected as Utf32 in a working buffer and is
 * then encoded to Utf8 with a single toString call.
 *
 * Params:
 *     input   = String to be case mapped
 *     output  = buffer that is handed to toString for the Utf8 result
 *     working = Utf32 scratch buffer; when null it is allocated with
 *               input.length elements, and it is grown whenever it is
 *               too small
 * Returns: the case mapped string
 */
deprecated char[] blockToUpper(char[] input, char[] output = null, dchar[] working = null) {

    // input.length counts Utf8 code units, which is an upper bound for the
    // number of code points; only special mappings can need more room.
    if (working is null)
        working.length = input.length;

    size_t produced = 0;
    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (ch in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (ch in specialCaseData);
            debug {
                assert(s !is null);
            }
            if((*s).upperCaseMapping !is null) {
                // One code point expands to several: make room, then copy
                // the whole mapping in one slice assignment.
                size_t len = (*s).upperCaseMapping.length;
                if(produced + len >= working.length)
                    working.length = working.length + working.length / 2 + len;
                working[produced..produced + len] = (*s).upperCaseMapping;
                produced += len;
                continue;
            }
        }
        // The code point is written into `working`, so `working` (and not
        // `output`) is the buffer whose capacity has to be checked here.
        if(produced + 1 >= working.length)
            working.length = working.length + working.length / 2 + 1;
        working[produced++] = d is null ? ch:(*d).simpleUpperCaseMapping;
    }
    return toString(working[0..produced],output);
}
82 | |
83 | |
84 | |
/**
 * Upper-cases an Utf8 encoded string.
 *
 * Params:
 *     input  = the text to be case mapped
 *     output = optional destination buffer; it is reallocated when it
 *              turns out to be too small
 * Returns: the slice of the destination buffer that holds the result
 */
char[] toUpper(char[] input, char[] output = null) {

    dchar[1] single;   // carries a one-to-one mapping as a one element slice
    uint used = 0;     // Utf8 code units written so far
    uint consumed;     // set by toString: number of code points it converted

    // Start from the most common case: the result is as long as the input.
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;
        UnicodeData **info = (ch in unicodeData);
        if(info !is null && ((*info).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).upperCaseMapping;
        }
        if(mapped is null) {
            // No special mapping: use the simple mapping, or the code point
            // itself when it has no entry in unicodeData.
            single[0] = info is null ? ch : (*info).simpleUpperCaseMapping;
            mapped = single[];
        }

        // Reserve four Utf8 code units per code point (the longest Utf8
        // encoding) so that toString does not relocate the buffer.
        size_t reserve = mapped.length * 4;
        if(used + reserve >= output.length)
            output.length = output.length + output.length / 2 + reserve;

        char[] encoded = toString(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
139 | |
140 | |
/**
 * Upper-cases an Utf16 encoded string.
 *
 * Params:
 *     input  = the text to be case mapped
 *     output = optional destination buffer; it is reallocated when it
 *              turns out to be too small
 * Returns: the slice of the destination buffer that holds the result
 */
wchar[] toUpper(wchar[] input, wchar[] output = null) {

    dchar[1] single;   // carries a one-to-one mapping as a one element slice
    uint used = 0;     // Utf16 code units written so far
    uint consumed;     // set by toString16: number of code points it converted

    // Start from the most common case: the result is as long as the input.
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;
        UnicodeData **info = (ch in unicodeData);
        if(info !is null && ((*info).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).upperCaseMapping;
        }

        // The two paths use different limits for "buffer is getting full"
        // and different amounts of extra room when growing.
        size_t threshold;
        size_t growth;
        if(mapped !is null) {
            threshold = mapped.length * 2;
            growth = mapped.length * 3;
        } else {
            single[0] = info is null ? ch : (*info).simpleUpperCaseMapping;
            mapped = single[];
            threshold = 4;
            growth = 3;
        }

        // Grow up front so that toString16 does not relocate the buffer.
        if(used + threshold >= output.length)
            output.length = output.length + output.length / 2 + growth;

        wchar[] encoded = toString16(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
194 | |
/**
 * Converts an Utf32 String to Upper case
 *
 * Params:
 *     input  = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
dchar[] toUpper(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    uint produced = 0;
    foreach(dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            if((*s).upperCaseMapping !is null) {
                // Better resize strategy ???
                if(produced + (*s).upperCaseMapping.length > output.length)
                    output.length = output.length + output.length / 2 + (*s).upperCaseMapping.length;
                foreach(ch; (*s).upperCaseMapping) {
                    output[produced++] = ch;
                }
                // Skip the simple mapping only when a special mapping was
                // actually written. A character that is flagged as special
                // but has no special upper case mapping must fall through,
                // as in the Utf8 and Utf16 overloads, instead of vanishing
                // from the output.
                continue;
            }
        }
        // "+ 1" guarantees that the buffer really grows, even when
        // output.length / 2 rounds down to zero.
        if(produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig:(*d).simpleUpperCaseMapping;
    }
    return output[0..produced];
}
235 | |
236 | |
/**
 * Lower-cases an Utf8 encoded string.
 *
 * Params:
 *     input  = the text to be case mapped
 *     output = optional destination buffer; it is reallocated when it
 *              turns out to be too small
 * Returns: the slice of the destination buffer that holds the result
 */
char[] toLower(char[] input, char[] output = null) {

    dchar[1] single;   // carries a one-to-one mapping as a one element slice
    uint used = 0;     // Utf8 code units written so far
    uint consumed;     // set by toString: number of code points it converted

    // Start from the most common case: the result is as long as the input.
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;
        UnicodeData **info = (ch in unicodeData);
        if(info !is null && ((*info).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).lowerCaseMapping;
        }
        if(mapped is null) {
            // No special mapping: use the simple mapping, or the code point
            // itself when it has no entry in unicodeData.
            single[0] = info is null ? ch : (*info).simpleLowerCaseMapping;
            mapped = single[];
        }

        // Reserve four Utf8 code units per code point (the longest Utf8
        // encoding) so that toString does not relocate the buffer.
        size_t reserve = mapped.length * 4;
        if(used + reserve >= output.length)
            output.length = output.length + output.length / 2 + reserve;

        char[] encoded = toString(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
291 | |
292 | |
/**
 * Lower-cases an Utf16 encoded string.
 *
 * Params:
 *     input  = the text to be case mapped
 *     output = optional destination buffer; it is reallocated when it
 *              turns out to be too small
 * Returns: the slice of the destination buffer that holds the result
 */
wchar[] toLower(wchar[] input, wchar[] output = null) {

    dchar[1] single;   // carries a one-to-one mapping as a one element slice
    uint used = 0;     // Utf16 code units written so far
    uint consumed;     // set by toString16: number of code points it converted

    // Start from the most common case: the result is as long as the input.
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;
        UnicodeData **info = (ch in unicodeData);
        if(info !is null && ((*info).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).lowerCaseMapping;
        }

        // The two paths use different limits for "buffer is getting full"
        // and different amounts of extra room when growing.
        size_t threshold;
        size_t growth;
        if(mapped !is null) {
            threshold = mapped.length * 2;
            growth = mapped.length * 3;
        } else {
            single[0] = info is null ? ch : (*info).simpleLowerCaseMapping;
            mapped = single[];
            threshold = 4;
            growth = 3;
        }

        // Grow up front so that toString16 does not relocate the buffer.
        if(used + threshold >= output.length)
            output.length = output.length + output.length / 2 + growth;

        wchar[] encoded = toString16(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
346 | |
347 | |
/**
 * Converts an Utf32 String to Lower case
 *
 * Params:
 *     input  = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
dchar[] toLower(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    uint produced = 0;
    foreach(dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            if((*s).lowerCaseMapping !is null) {
                // Better resize strategy ???
                if(produced + (*s).lowerCaseMapping.length > output.length)
                    output.length = output.length + output.length / 2 + (*s).lowerCaseMapping.length;
                foreach(ch; (*s).lowerCaseMapping) {
                    output[produced++] = ch;
                }
                // Skip the simple mapping only when a special mapping was
                // actually written. A character that is flagged as special
                // but has no special lower case mapping must fall through,
                // as in the Utf8 and Utf16 overloads, instead of vanishing
                // from the output.
                continue;
            }
        }
        // "+ 1" guarantees that the buffer really grows, even when
        // output.length / 2 rounds down to zero.
        if(produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig:(*d).simpleLowerCaseMapping;
    }
    return output[0..produced];
}
388 | |
/**
 * Case-folds an Utf8 encoded string.
 * Folded strings are used for case insensitive comparisons.
 *
 * Params:
 *     input  = the text to be case folded
 *     output = optional destination buffer; it is reallocated when it
 *              turns out to be too small
 * Returns: the slice of the destination buffer that holds the result
 */
char[] toFold(char[] input, char[] output = null) {

    dchar[1] single;   // carries an unmapped code point as a one element slice
    uint used = 0;     // Utf8 code units written so far
    uint consumed;     // set by toString: number of code points it converted

    // Start from the most common case: the result is as long as the input.
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        dchar[] mapped;
        FoldingCaseData **fold = (ch in foldingCaseData);
        if(fold !is null) {
            mapped = (*fold).mapping;
        } else {
            // Code points without a folding entry are copied unchanged.
            single[0] = ch;
            mapped = single[];
        }

        // Reserve four Utf8 code units per code point (the longest Utf8
        // encoding) so that toString does not relocate the buffer.
        size_t reserve = mapped.length * 4;
        if(used + reserve >= output.length)
            output.length = output.length + output.length / 2 + reserve;

        char[] encoded = toString(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
437 | |
/**
 * Case-folds an Utf16 encoded string.
 * Folded strings are used for case insensitive comparisons.
 *
 * Params:
 *     input  = the text to be case folded
 *     output = optional destination buffer; it is reallocated when it
 *              turns out to be too small
 * Returns: the slice of the destination buffer that holds the result
 */
wchar[] toFold(wchar[] input, wchar[] output = null) {

    dchar[1] single;   // carries an unmapped code point as a one element slice
    uint used = 0;     // Utf16 code units written so far
    uint consumed;     // set by toString16: number of code points it converted

    // Start from the most common case: the result is as long as the input.
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // The two paths use different limits for "buffer is getting full"
        // and different amounts of extra room when growing.
        dchar[] mapped;
        size_t threshold;
        size_t growth;
        FoldingCaseData **fold = (ch in foldingCaseData);
        if(fold !is null) {
            mapped = (*fold).mapping;
            threshold = mapped.length * 2;
            growth = mapped.length * 3;
        } else {
            // Code points without a folding entry are copied unchanged.
            single[0] = ch;
            mapped = single[];
            threshold = 4;
            growth = 3;
        }

        // Grow up front so that toString16 does not relocate the buffer.
        if(used + threshold >= output.length)
            output.length = output.length + output.length / 2 + growth;

        wchar[] encoded = toString16(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
485 | |
/**
 * Case-folds an Utf32 encoded string.
 * Folded strings are used for case insensitive comparisons.
 *
 * Params:
 *     input  = the text to be case folded
 *     output = optional destination buffer; it is reallocated when it
 *              turns out to be too small
 * Returns: the slice of the destination buffer that holds the result
 */
dchar[] toFold(dchar[] input, dchar[] output = null) {

    // Start from the most common case: the result is as long as the input.
    if (output.length < input.length)
        output.length = input.length;

    uint used = 0;   // code points written so far
    foreach(dchar source; input) {
        FoldingCaseData **fold = (source in foldingCaseData);
        if(fold is null) {
            // No folding entry: copy the code point unchanged.
            if(used >= output.length)
                output.length = output.length + output.length / 2;
            output[used++] = source;
        } else {
            // Better resize strategy ???
            if(used + (*fold).mapping.length > output.length)
                output.length = output.length + output.length / 2 + (*fold).mapping.length;
            foreach(folded; (*fold).mapping) {
                output[used++] = folded;
            }
        }
    }
    return output[0..used];
}
520 | |
521 | |
/**
 * Tells whether a character is a decimal digit, i.e. whether its
 * general category is Nd. Other kinds of digits are not accepted.
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for decimal digits, false otherwise and for characters
 *          without an entry in unicodeData
 */
bool isDigit(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Nd) != 0;
}
533 | |
534 | |
/**
 * Tells whether a character is a letter, i.e. whether its general
 * category is one of Lu, Ll, Lt, Lm or Lo.
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for letters, false otherwise and for characters without
 *          an entry in unicodeData
 */
bool isLetter(int ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;

    auto letters = UnicodeData.GeneralCategory.Lu
                 | UnicodeData.GeneralCategory.Ll
                 | UnicodeData.GeneralCategory.Lt
                 | UnicodeData.GeneralCategory.Lm
                 | UnicodeData.GeneralCategory.Lo;
    return ((*entry).generalCategory & letters) != 0;
}
550 | |
/**
 * Tells whether a character is a letter or a decimal digit, i.e. whether
 * its general category is one of Lu, Ll, Lt, Lm, Lo or Nd.
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for letters and decimal digits, false otherwise and for
 *          characters without an entry in unicodeData
 */
bool isLetterOrDigit(int ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;

    auto lettersAndDigits = UnicodeData.GeneralCategory.Lu
                          | UnicodeData.GeneralCategory.Ll
                          | UnicodeData.GeneralCategory.Lt
                          | UnicodeData.GeneralCategory.Lm
                          | UnicodeData.GeneralCategory.Lo
                          | UnicodeData.GeneralCategory.Nd;
    return ((*entry).generalCategory & lettersAndDigits) != 0;
}
568 | |
/**
 * Tells whether a character is a lower case letter (general category Ll).
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for lower case letters, false otherwise and for
 *          characters without an entry in unicodeData
 */
bool isLower(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Ll) != 0;
}
578 | |
/**
 * Tells whether a character is a title case letter (general category Lt).
 * For combined letters only the first part is upper case and the second
 * part is lower case. Some of these special characters can be found in
 * the Croatian and Greek languages.
 * See_Also: http://en.wikipedia.org/wiki/Capitalization
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for title case letters, false otherwise and for
 *          characters without an entry in unicodeData
 */
bool isTitle(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Lt) != 0;
}
591 | |
/**
 * Tells whether a character is an upper case letter (general category Lu).
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for upper case letters, false otherwise and for
 *          characters without an entry in unicodeData
 */
bool isUpper(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Lu) != 0;
}
601 | |
/**
 * Tells whether a character is a Whitespace character.
 *
 * Whitespace characters are the characters in the General Categories
 * Zs, Zl and Zp, except for the no-break spaces U+00A0, U+202F and
 * U+FEFF, plus the ASCII control characters that are used as spaces:
 * U+0009 to U+000D (TAB LF VT FF CR) and U+001C to U+001F (FS GS RS US).
 *
 * WARNING: look at isSpace, maybe that function does
 *          more what you expect.
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for whitespace characters as described above
 */
bool isWhitespace(dchar ch) {
    // ASCII control characters that act as spaces.
    bool tabToCr = ch >= 0x0009 && ch <= 0x000D;
    bool fsToUs  = ch >= 0x001C && ch <= 0x001F;
    if (tabToCr || fsToUs)
        return true;

    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;

    auto separators = UnicodeData.GeneralCategory.Zs
                    | UnicodeData.GeneralCategory.Zl
                    | UnicodeData.GeneralCategory.Zp;
    if (((*entry).generalCategory & separators) == 0)
        return false;

    // Separators that must not break a line do not count as whitespace.
    return !(ch == 0x00A0      // NBSP
          || ch == 0x202F      // NARROW NBSP
          || ch == 0xFEFF);    // ZERO WIDTH NBSP
}
628 | |
/**
 * Tells whether a character is a Space character as specified in the
 * Unicode Standard, i.e. whether its general category is Zs, Zl or Zp.
 *
 * WARNING: look at isWhitespace, maybe that function does
 *          more what you expect.
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for separator characters, false otherwise and for
 *          characters without an entry in unicodeData
 */
bool isSpace(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;

    auto separators = UnicodeData.GeneralCategory.Zs
                    | UnicodeData.GeneralCategory.Zl
                    | UnicodeData.GeneralCategory.Zp;
    return ((*entry).generalCategory & separators) != 0;
}
646 | |
647 | |
/**
 * Determines if a character is a printable character as
 * specified in the Unicode Standard.
 *
 * A character is printable when it has an entry in unicodeData and its
 * general category is none of the "Other" categories Cn (unassigned),
 * Cc (control), Cf (format), Co (private use) and Cs (surrogate).
 *
 * Params:
 *     ch = the character to be inspected
 * Returns: true for printable characters, false for characters in one
 *          of the categories listed above and for characters without
 *          an entry in unicodeData
 */
bool isPrintable(dchar ch) {
    UnicodeData **d = (ch in unicodeData);
    // The listed categories are the NON-printable ones, so the test has
    // to be negated: a match means "not printable".
    return (d !is null) && !((*d).generalCategory &
            ( UnicodeData.GeneralCategory.Cn
            | UnicodeData.GeneralCategory.Cc
            | UnicodeData.GeneralCategory.Cf
            | UnicodeData.GeneralCategory.Co
            | UnicodeData.GeneralCategory.Cs));
}
668 | |
// Empty entry point so that the module can be built on its own with
// -debug=UnicodeTest. Because of the trailing colon the debug condition
// covers the declarations that follow it in this scope, not just main().
debug (UnicodeTest):
void main() {}
671 | |
672 debug (UnitTest) { | |
673 | |
unittest {

    // Input with plain one-to-one mappings and the expected result,
    // once per encoding.
    char[]  src8   = "\u00E4\u00F6\u00FC";
    wchar[] src16  = "\u00E4\u00F6\u00FC";
    dchar[] src32  = "\u00E4\u00F6\u00FC";
    char[]  want8  = "\u00C4\u00D6\u00DC";
    wchar[] want16 = "\u00C4\u00D6\u00DC";
    dchar[] want32 = "\u00C4\u00D6\u00DC";

    // 1) No Buffer passed, no resize, no SpecialCase
    char[] got8 = toUpper(src8);
    assert(got8 == want8);
    wchar[] got16 = toUpper(src16);
    assert(got16 == want16);
    dchar[] got32 = toUpper(src32);
    assert(got32 == want32);

    // 2) Buffer passed, no resize, no SpecialCase:
    //    the result has to live in the buffer that was passed in.
    char[60]  roomy8;
    wchar[30] roomy16;
    dchar[30] roomy32;

    got8 = toUpper(src8, roomy8);
    assert(got8.ptr == roomy8.ptr);
    assert(got8 == want8);

    got16 = toUpper(src16, roomy16);
    assert(got16.ptr == roomy16.ptr);
    assert(got16 == want16);

    got32 = toUpper(src32, roomy32);
    assert(got32.ptr == roomy32.ptr);
    assert(got32 == want32);

    // 3) Buffer passed, resize necessary, no SpecialCase:
    //    the result must have moved out of the buffer that was passed in.
    char[5]  tight8;
    wchar[2] tight16;
    dchar[2] tight32;

    got8 = toUpper(src8, tight8);
    assert(got8.ptr != tight8.ptr);
    assert(got8 == want8);

    got16 = toUpper(src16, tight16);
    assert(got16.ptr != tight16.ptr);
    assert(got16 == want16);

    got32 = toUpper(src32, tight32);
    assert(got32.ptr != tight32.ptr);
    assert(got32 == want32);

    // 4) Buffer passed, resize necessary, extensive SpecialCase:
    //    each input character expands to several characters.
    char[]  special8      = "\uFB03\uFB04\uFB05";
    wchar[] special16     = "\uFB03\uFB04\uFB05";
    dchar[] special32     = "\uFB03\uFB04\uFB05";
    char[]  wantSpecial8  = "\u0046\u0046\u0049\u0046\u0046\u004C\u0053\u0054";
    wchar[] wantSpecial16 = "\u0046\u0046\u0049\u0046\u0046\u004C\u0053\u0054";
    dchar[] wantSpecial32 = "\u0046\u0046\u0049\u0046\u0046\u004C\u0053\u0054";

    got8 = toUpper(special8, tight8);
    assert(got8.ptr != tight8.ptr);
    assert(got8 == wantSpecial8);

    got16 = toUpper(special16, tight16);
    assert(got16.ptr != tight16.ptr);
    assert(got16 == wantSpecial16);

    got32 = toUpper(special32, tight32);
    assert(got32.ptr != tight32.ptr);
    assert(got32 == wantSpecial32);

}
741 | |
742 | |
unittest {

    // Input with plain one-to-one mappings and the expected result,
    // once per encoding.
    char[]  src8   = "\u00C4\u00D6\u00DC";
    wchar[] src16  = "\u00C4\u00D6\u00DC";
    dchar[] src32  = "\u00C4\u00D6\u00DC";
    char[]  want8  = "\u00E4\u00F6\u00FC";
    wchar[] want16 = "\u00E4\u00F6\u00FC";
    dchar[] want32 = "\u00E4\u00F6\u00FC";

    // 1) No Buffer passed, no resize, no SpecialCase
    char[] got8 = toLower(src8);
    assert(got8 == want8);
    wchar[] got16 = toLower(src16);
    assert(got16 == want16);
    dchar[] got32 = toLower(src32);
    assert(got32 == want32);

    // 2) Buffer passed, no resize, no SpecialCase:
    //    the result has to live in the buffer that was passed in.
    char[60]  roomy8;
    wchar[30] roomy16;
    dchar[30] roomy32;

    got8 = toLower(src8, roomy8);
    assert(got8.ptr == roomy8.ptr);
    assert(got8 == want8);

    got16 = toLower(src16, roomy16);
    assert(got16.ptr == roomy16.ptr);
    assert(got16 == want16);

    got32 = toLower(src32, roomy32);
    assert(got32.ptr == roomy32.ptr);
    assert(got32 == want32);

    // 3) Buffer passed, resize necessary, no SpecialCase:
    //    the result must have moved out of the buffer that was passed in.
    char[5]  tight8;
    wchar[2] tight16;
    dchar[2] tight32;

    got8 = toLower(src8, tight8);
    assert(got8.ptr != tight8.ptr);
    assert(got8 == want8);

    got16 = toLower(src16, tight16);
    assert(got16.ptr != tight16.ptr);
    assert(got16 == want16);

    got32 = toLower(src32, tight32);
    assert(got32.ptr != tight32.ptr);
    assert(got32 == want32);

    // 4) Buffer passed, resize necessary, extensive SpecialCase:
    //    each input character expands to two characters.
    char[]  special8      = "\u0130\u0130\u0130";
    wchar[] special16     = "\u0130\u0130\u0130";
    dchar[] special32     = "\u0130\u0130\u0130";
    char[]  wantSpecial8  = "\u0069\u0307\u0069\u0307\u0069\u0307";
    wchar[] wantSpecial16 = "\u0069\u0307\u0069\u0307\u0069\u0307";
    dchar[] wantSpecial32 = "\u0069\u0307\u0069\u0307\u0069\u0307";

    got8 = toLower(special8, tight8);
    assert(got8.ptr != tight8.ptr);
    assert(got8 == wantSpecial8);

    got16 = toLower(special16, tight16);
    assert(got16.ptr != tight16.ptr);
    assert(got16 == wantSpecial16);

    got32 = toLower(special32, tight32);
    assert(got32.ptr != tight32.ptr);
    assert(got32 == wantSpecial32);
}
808 | |
unittest {
    // Two spellings of the same text that differ only in case and in how
    // U+0390 is written must fold to the same string in every encoding.
    char[] testString1utf8 = "?!Mädchen \u0390\u0390,;";
    char[] testString2utf8 = "?!MÄDCHEN \u03B9\u0308\u0301\u03B9\u0308\u0301,;";
    assert(toFold(testString1utf8) == toFold(testString2utf8));

    wchar[] testString1utf16 = "?!Mädchen \u0390\u0390,;";
    wchar[] testString2utf16 = "?!MÄDCHEN \u03B9\u0308\u0301\u03B9\u0308\u0301,;";
    assert(toFold(testString1utf16) == toFold(testString2utf16));

    // These have to be dchar[]: declared as wchar[] they would only run
    // the Utf16 overload a second time and never test the Utf32 one.
    dchar[] testString1utf32 = "?!Mädchen \u0390\u0390,;";
    dchar[] testString2utf32 = "?!MÄDCHEN \u03B9\u0308\u0301\u03B9\u0308\u0301,;";
    assert(toFold(testString1utf32) == toFold(testString2utf32));
}
820 | |
821 } |