132
|
/*******************************************************************************

        copyright:      Copyright (c) 2007 Peter Triller. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: Sept 2007

        authors:        Peter

        Provides case mapping functions for Unicode strings. As of now it is
        only 99 % complete, because it does not take into account conditional
        case mappings. This means the Greek letter Sigma will not be correctly
        case mapped at the end of a word, and the locales Lithuanian, Turkish
        and Azeri are not taken into account during case mappings. This means
        all in all around 12 characters will not be mapped correctly under
        some circumstances.

        ICU4j also does not handle these cases at the moment.

        Unit tests are written against output from ICU4j.

        This module tries to minimize memory allocation and usage. You can
        always pass the output buffer that should be used to the case mapping
        function, which will be resized if necessary.

*******************************************************************************/
|
|
28
|
|
29 module tango.text.Unicode;
|
|
30
|
|
31 private import tango.text.UnicodeData;
|
|
32 private import tango.text.convert.Utf;
|
|
33
|
|
34
|
|
35
|
|
/**
 * Converts an Utf8 String to Upper case.
 *
 * The input is first mapped code point by code point into the UTF-32
 * scratch buffer `working` and then encoded to UTF-8 in one step.
 *
 * Params:
 * input = String to be case mapped
 * output = this output buffer will be used unless too small
 * working = optional UTF-32 scratch buffer; allocated when null and
 *           grown on demand
 * Returns: the case mapped string
 */
deprecated char[] blockToUpper(char[] input, char[] output = null, dchar[] working = null) {

    // Worst case preallocation: one code point per input code unit.
    if (working is null)
        working.length = input.length;

    uint produced = 0;
    foreach (dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (ch in unicodeData);
        if (d !is null && ((*d).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (ch in specialCaseData);
            debug {
                assert(s !is null);
            }
            auto mapping = (*s).upperCaseMapping;
            if (mapping !is null) {
                // Make room for the whole multi code point mapping.
                if (produced + mapping.length >= working.length)
                    working.length = working.length + working.length / 2 + mapping.length;
                working[produced .. produced + mapping.length] = mapping;
                produced += mapping.length;
                continue;
            }
        }
        // The capacity check must look at the buffer that is written to
        // (working); the original tested output.length here, which made
        // working grow on every character when no output buffer was given
        // and could overrun working when output was the larger buffer.
        if (produced + 1 >= working.length)
            working.length = working.length + working.length / 2 + 1;
        working[produced++] = d is null ? ch : (*d).simpleUpperCaseMapping;
    }
    return toString(working[0 .. produced], output);
}
|
|
82
|
|
83
|
|
84
|
|
/**
 * Maps an Utf8 String to Upper case.
 *
 * Params:
 * input = String to be case mapped
 * output = optional result buffer; it is grown (and may thereby be
 *          reallocated) when it is too small
 * Returns: the slice of the buffer that holds the case mapped string
 */
char[] toUpper(char[] input, char[] output = null) {

    dchar[1] single;
    uint written = 0;
    uint consumed;

    // Most strings keep their length, so start with room for the input size.
    if (input.length > output.length)
        output.length = input.length;

    foreach (dchar c; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (c in unicodeData);
        if (entry !is null && ((*entry).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (c in specialCaseData);
            debug assert(special !is null);
            auto mapped = (*special).upperCaseMapping;
            if (mapped !is null) {
                // Reserve four code units per mapped code point up front so
                // that toString never has to relocate the buffer.
                auto worst = mapped.length * 4;
                if (written + worst >= output.length)
                    output.length = output.length + output.length / 2 + worst;
                char[] encodedSpecial = toString(mapped, output[written .. $], &consumed);
                debug {
                    assert(consumed == mapped.length);
                    assert(encodedSpecial.ptr == output[written .. $].ptr);
                }
                written += encodedSpecial.length;
                continue;
            }
        }
        // Simple one-to-one mapping; again keep toString from relocating.
        if (written + 4 >= output.length)
            output.length = output.length + output.length / 2 + 4;
        single[0] = entry is null ? c : (*entry).simpleUpperCaseMapping;
        char[] encoded = toString(single, output[written .. $], &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == output[written .. $].ptr);
        }
        written += encoded.length;
    }
    return output[0 .. written];
}
|
|
139
|
|
140
|
|
/**
 * Maps an Utf16 String to Upper case.
 *
 * Params:
 * input = String to be case mapped
 * output = optional result buffer; it is grown (and may thereby be
 *          reallocated) when it is too small
 * Returns: the slice of the buffer that holds the case mapped string
 */
wchar[] toUpper(wchar[] input, wchar[] output = null) {

    dchar[1] single;
    uint written = 0;
    uint consumed;

    // Most strings keep their length, so start with room for the input size.
    if (input.length > output.length)
        output.length = input.length;

    foreach (dchar c; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (c in unicodeData);
        if (entry !is null && ((*entry).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (c in specialCaseData);
            debug assert(special !is null);
            auto mapped = (*special).upperCaseMapping;
            if (mapped !is null) {
                // Grow ahead of the conversion so that toString16 never has
                // to relocate the buffer.
                if (written + mapped.length * 2 >= output.length)
                    output.length = output.length + output.length / 2 + mapped.length * 3;
                wchar[] encodedSpecial = toString16(mapped, output[written .. $], &consumed);
                debug {
                    assert(consumed == mapped.length);
                    assert(encodedSpecial.ptr == output[written .. $].ptr);
                }
                written += encodedSpecial.length;
                continue;
            }
        }
        // Simple one-to-one mapping; again keep toString16 from relocating.
        if (written + 4 >= output.length)
            output.length = output.length + output.length / 2 + 3;
        single[0] = entry is null ? c : (*entry).simpleUpperCaseMapping;
        wchar[] encoded = toString16(single, output[written .. $], &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == output[written .. $].ptr);
        }
        written += encoded.length;
    }
    return output[0 .. written];
}
|
|
194
|
|
/**
 * Converts an Utf32 String to Upper case.
 *
 * Params:
 * input = String to be case mapped
 * output = this output buffer will be used unless too small, in which
 *          case it is grown (and may be reallocated)
 * Returns: the case mapped string
 */
dchar[] toUpper(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    uint produced = 0;
    foreach (dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if (d !is null && ((*d).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            auto mapping = (*s).upperCaseMapping;
            if (mapping !is null) {
                // Better resize strategy ???
                if (produced + mapping.length > output.length)
                    output.length = output.length + output.length / 2 + mapping.length;
                foreach (ch; mapping)
                    output[produced++] = ch;
                // Skip the simple mapping only when a special mapping was
                // actually written. Previously the continue sat outside this
                // branch, so a special-cased character without an upper case
                // mapping vanished from the result (the char[] and wchar[]
                // overloads fall through to the simple mapping instead).
                continue;
            }
        }
        // "+ 1" guarantees that the buffer really grows even when its
        // current length is below two.
        if (produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig : (*d).simpleUpperCaseMapping;
    }
    return output[0 .. produced];
}
|
|
235
|
|
236
|
|
/**
 * Maps an Utf8 String to Lower case.
 *
 * Params:
 * input = String to be case mapped
 * output = optional result buffer; it is grown (and may thereby be
 *          reallocated) when it is too small
 * Returns: the slice of the buffer that holds the case mapped string
 */
char[] toLower(char[] input, char[] output = null) {

    dchar[1] single;
    uint written = 0;
    uint consumed;

    // Most strings keep their length, so start with room for the input size.
    if (input.length > output.length)
        output.length = input.length;

    foreach (dchar c; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (c in unicodeData);
        if (entry !is null && ((*entry).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (c in specialCaseData);
            debug assert(special !is null);
            auto mapped = (*special).lowerCaseMapping;
            if (mapped !is null) {
                // Reserve four code units per mapped code point up front so
                // that toString never has to relocate the buffer.
                auto worst = mapped.length * 4;
                if (written + worst >= output.length)
                    output.length = output.length + output.length / 2 + worst;
                char[] encodedSpecial = toString(mapped, output[written .. $], &consumed);
                debug {
                    assert(consumed == mapped.length);
                    assert(encodedSpecial.ptr == output[written .. $].ptr);
                }
                written += encodedSpecial.length;
                continue;
            }
        }
        // Simple one-to-one mapping; again keep toString from relocating.
        if (written + 4 >= output.length)
            output.length = output.length + output.length / 2 + 4;
        single[0] = entry is null ? c : (*entry).simpleLowerCaseMapping;
        char[] encoded = toString(single, output[written .. $], &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == output[written .. $].ptr);
        }
        written += encoded.length;
    }
    return output[0 .. written];
}
|
|
291
|
|
292
|
|
/**
 * Maps an Utf16 String to Lower case.
 *
 * Params:
 * input = String to be case mapped
 * output = optional result buffer; it is grown (and may thereby be
 *          reallocated) when it is too small
 * Returns: the slice of the buffer that holds the case mapped string
 */
wchar[] toLower(wchar[] input, wchar[] output = null) {

    dchar[1] single;
    uint written = 0;
    uint consumed;

    // Most strings keep their length, so start with room for the input size.
    if (input.length > output.length)
        output.length = input.length;

    foreach (dchar c; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (c in unicodeData);
        if (entry !is null && ((*entry).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (c in specialCaseData);
            debug assert(special !is null);
            auto mapped = (*special).lowerCaseMapping;
            if (mapped !is null) {
                // Grow ahead of the conversion so that toString16 never has
                // to relocate the buffer.
                if (written + mapped.length * 2 >= output.length)
                    output.length = output.length + output.length / 2 + mapped.length * 3;
                wchar[] encodedSpecial = toString16(mapped, output[written .. $], &consumed);
                debug {
                    assert(consumed == mapped.length);
                    assert(encodedSpecial.ptr == output[written .. $].ptr);
                }
                written += encodedSpecial.length;
                continue;
            }
        }
        // Simple one-to-one mapping; again keep toString16 from relocating.
        if (written + 4 >= output.length)
            output.length = output.length + output.length / 2 + 3;
        single[0] = entry is null ? c : (*entry).simpleLowerCaseMapping;
        wchar[] encoded = toString16(single, output[written .. $], &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == output[written .. $].ptr);
        }
        written += encoded.length;
    }
    return output[0 .. written];
}
|
|
346
|
|
347
|
|
/**
 * Converts an Utf32 String to Lower case.
 *
 * Params:
 * input = String to be case mapped
 * output = this output buffer will be used unless too small, in which
 *          case it is grown (and may be reallocated)
 * Returns: the case mapped string
 */
dchar[] toLower(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    uint produced = 0;
    foreach (dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if (d !is null && ((*d).generalCategory & UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            auto mapping = (*s).lowerCaseMapping;
            if (mapping !is null) {
                // Better resize strategy ???
                if (produced + mapping.length > output.length)
                    output.length = output.length + output.length / 2 + mapping.length;
                foreach (ch; mapping)
                    output[produced++] = ch;
                // Skip the simple mapping only when a special mapping was
                // actually written. Previously the continue sat outside this
                // branch, so a special-cased character without a lower case
                // mapping vanished from the result (the char[] and wchar[]
                // overloads fall through to the simple mapping instead).
                continue;
            }
        }
        // "+ 1" guarantees that the buffer really grows even when its
        // current length is below two.
        if (produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig : (*d).simpleLowerCaseMapping;
    }
    return output[0 .. produced];
}
|
|
388
|
|
/**
 * Maps an Utf8 String to Folding case.
 * Folding case is used for case insensitive comparisons.
 *
 * Params:
 * input = String to be case mapped
 * output = optional result buffer; it is grown (and may thereby be
 *          reallocated) when it is too small
 * Returns: the slice of the buffer that holds the case folded string
 */
char[] toFold(char[] input, char[] output = null) {

    dchar[1] single;
    uint written = 0;
    uint consumed;

    // Most strings keep their length, so start with room for the input size.
    if (input.length > output.length)
        output.length = input.length;

    foreach (dchar c; input) {
        FoldingCaseData **fold = (c in foldingCaseData);
        if (fold is null) {
            // No folding entry: the character is copied unchanged. Reserve
            // room first so that toString does not relocate the buffer.
            if (written + 4 >= output.length)
                output.length = output.length + output.length / 2 + 4;
            single[0] = c;
            char[] encoded = toString(single, output[written .. $], &consumed);
            debug {
                assert(consumed == 1);
                assert(encoded.ptr == output[written .. $].ptr);
            }
            written += encoded.length;
        } else {
            // Reserve four code units per mapped code point up front so
            // that toString never has to relocate the buffer.
            auto mapped = (*fold).mapping;
            auto worst = mapped.length * 4;
            if (written + worst >= output.length)
                output.length = output.length + output.length / 2 + worst;
            char[] encoded = toString(mapped, output[written .. $], &consumed);
            debug {
                assert(consumed == mapped.length);
                assert(encoded.ptr == output[written .. $].ptr);
            }
            written += encoded.length;
        }
    }
    return output[0 .. written];
}
|
|
437
|
|
/**
 * Maps an Utf16 String to Folding case.
 * Folding case is used for case insensitive comparisons.
 *
 * Params:
 * input = String to be case mapped
 * output = optional result buffer; it is grown (and may thereby be
 *          reallocated) when it is too small
 * Returns: the slice of the buffer that holds the case folded string
 */
wchar[] toFold(wchar[] input, wchar[] output = null) {

    dchar[1] single;
    uint written = 0;
    uint consumed;

    // Most strings keep their length, so start with room for the input size.
    if (input.length > output.length)
        output.length = input.length;

    foreach (dchar c; input) {
        FoldingCaseData **fold = (c in foldingCaseData);
        if (fold is null) {
            // No folding entry: the character is copied unchanged. Reserve
            // room first so that toString16 does not relocate the buffer.
            if (written + 4 >= output.length)
                output.length = output.length + output.length / 2 + 3;
            single[0] = c;
            wchar[] encoded = toString16(single, output[written .. $], &consumed);
            debug {
                assert(consumed == 1);
                assert(encoded.ptr == output[written .. $].ptr);
            }
            written += encoded.length;
        } else {
            // Grow ahead of the conversion so that toString16 never has to
            // relocate the buffer.
            auto mapped = (*fold).mapping;
            if (written + mapped.length * 2 >= output.length)
                output.length = output.length + output.length / 2 + mapped.length * 3;
            wchar[] encoded = toString16(mapped, output[written .. $], &consumed);
            debug {
                assert(consumed == mapped.length);
                assert(encoded.ptr == output[written .. $].ptr);
            }
            written += encoded.length;
        }
    }
    return output[0 .. written];
}
|
|
485
|
|
/**
 * Maps an Utf32 String to Folding case.
 * Folding case is used for case insensitive comparisons.
 *
 * Params:
 * input = String to be case mapped
 * output = optional result buffer; it is grown (and may thereby be
 *          reallocated) when it is too small
 * Returns: the slice of the buffer that holds the case folded string
 */
dchar[] toFold(dchar[] input, dchar[] output = null) {

    // Most strings keep their length, so start with room for the input size.
    if (output.length < input.length)
        output.length = input.length;

    uint written = 0;
    foreach (dchar c; input) {
        FoldingCaseData **fold = (c in foldingCaseData);
        if (fold is null) {
            // No folding entry: copy the character unchanged.
            if (written >= output.length)
                output.length = output.length + output.length / 2;
            output[written++] = c;
        } else {
            // Copy the (possibly multi code point) folding into the buffer.
            auto mapped = (*fold).mapping;
            if (written + mapped.length > output.length)
                output.length = output.length + output.length / 2 + mapped.length;
            foreach (folded; mapped)
                output[written++] = folded;
        }
    }
    return output[0 .. written];
}
|
|
520
|
|
521
|
|
/**
 * Tells whether a character is a decimal digit (general category Nd).
 * Characters without an entry in the Unicode table are not digits.
 *
 * Params:
 * ch = the character to be inspected
 */
bool isDigit(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Nd) != 0;
}
|
|
533
|
|
534
|
|
/**
 * Tells whether a character is a letter, i.e. belongs to one of the
 * general categories Lu, Ll, Lt, Lm or Lo.
 *
 * Params:
 * ch = the character to be inspected
 */
bool isLetter(int ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    auto letters = UnicodeData.GeneralCategory.Lu
                 | UnicodeData.GeneralCategory.Ll
                 | UnicodeData.GeneralCategory.Lt
                 | UnicodeData.GeneralCategory.Lm
                 | UnicodeData.GeneralCategory.Lo;
    return ((*entry).generalCategory & letters) != 0;
}
|
|
550
|
|
/**
 * Tells whether a character is a letter (general categories Lu, Ll, Lt,
 * Lm, Lo) or a decimal digit (general category Nd).
 *
 * Params:
 * ch = the character to be inspected
 */
bool isLetterOrDigit(int ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    auto lettersAndDigits = UnicodeData.GeneralCategory.Lu
                          | UnicodeData.GeneralCategory.Ll
                          | UnicodeData.GeneralCategory.Lt
                          | UnicodeData.GeneralCategory.Lm
                          | UnicodeData.GeneralCategory.Lo
                          | UnicodeData.GeneralCategory.Nd;
    return ((*entry).generalCategory & lettersAndDigits) != 0;
}
|
|
568
|
|
/**
 * Tells whether a character is a lower case letter (general category Ll).
 *
 * Params:
 * ch = the character to be inspected
 */
bool isLower(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Ll) != 0;
}
|
|
578
|
|
/**
 * Tells whether a character is a title case letter (general category Lt).
 * In case of combined letters, only the first is upper and the second is lower.
 * Some of these special characters can be found in the Croatian and Greek language.
 * See_Also: http://en.wikipedia.org/wiki/Capitalization
 *
 * Params:
 * ch = the character to be inspected
 */
bool isTitle(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Lt) != 0;
}
|
|
591
|
|
/**
 * Tells whether a character is an upper case letter (general category Lu).
 *
 * Params:
 * ch = the character to be inspected
 */
bool isUpper(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    return ((*entry).generalCategory & UnicodeData.GeneralCategory.Lu) != 0;
}
|
|
601
|
|
/**
 * Tells whether a character is a whitespace character.
 * Whitespace characters are the characters in the general categories
 * Zs, Zl and Zp, except U+00A0, U+202F and U+FEFF, plus the control
 * characters U+0009..U+000D and U+001C..U+001F (TAB, LF, VT, FF, CR,
 * FS, GS, RS, US).
 *
 * WARNING: look at isSpace, maybe that function does
 * more what you expect.
 *
 * Params:
 * ch = the character to be inspected
 */
bool isWhitespace(dchar ch) {
    // Control characters that are used as spaces.
    if (ch >= 0x0009 && ch <= 0x000D)
        return true;
    if (ch >= 0x001C && ch <= 0x001F)
        return true;

    // Explicitly excluded characters.
    // NOTE(review): U+2007 FIGURE SPACE is also a no-break space in Unicode
    // but is not excluded here, and U+FEFF is presumably not in a Z*
    // category to begin with - confirm whether both are intended.
    if (ch == 0x00A0        // NBSP
        || ch == 0x202F     // NARROW NBSP
        || ch == 0xFEFF)    // ZERO WIDTH NBSP
        return false;

    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    auto separators = UnicodeData.GeneralCategory.Zs
                    | UnicodeData.GeneralCategory.Zl
                    | UnicodeData.GeneralCategory.Zp;
    return ((*entry).generalCategory & separators) != 0;
}
|
|
628
|
|
/**
 * Tells whether a character is a space character as specified in the
 * Unicode Standard, i.e. belongs to the general categories Zs, Zl or Zp.
 *
 * WARNING: look at isWhitespace, maybe that function does
 * more what you expect.
 *
 * Params:
 * ch = the character to be inspected
 */
bool isSpace(dchar ch) {
    UnicodeData **entry = (ch in unicodeData);
    if (entry is null)
        return false;
    auto separators = UnicodeData.GeneralCategory.Zs
                    | UnicodeData.GeneralCategory.Zl
                    | UnicodeData.GeneralCategory.Zp;
    return ((*entry).generalCategory & separators) != 0;
}
|
|
646
|
|
647
|
|
/**
 * Determines if a character is a printable character: it must have an
 * entry in the Unicode table and must not belong to one of the "Other"
 * general categories Cn, Cc, Cf, Co or Cs.
 *
 * Params:
 * ch = the character to be inspected
 */
bool isPrintable(dchar ch) {
    UnicodeData **d = (ch in unicodeData);
    // The category test is negated: Cn/Cc/Cf/Co/Cs are exactly the
    // categories that are NOT printable. The original returned true for
    // them, i.e. reported control characters as printable and letters as
    // not printable.
    return (d !is null) && !((*d).generalCategory &
        ( UnicodeData.GeneralCategory.Cn
        | UnicodeData.GeneralCategory.Cc
        | UnicodeData.GeneralCategory.Cf
        | UnicodeData.GeneralCategory.Co
        | UnicodeData.GeneralCategory.Cs));
}
|
|
668
|
|
// Everything below this label is only compiled with -debug=UnicodeTest.
debug ( UnicodeTest ):
void main() {}

debug (UnitTest) {

    // toUpper for all three encodings
    unittest {

        // 1) No Buffer passed, no resize, no SpecialCase

        char[] testString1utf8 = "\u00E4\u00F6\u00FC";
        wchar[] testString1utf16 = "\u00E4\u00F6\u00FC";
        dchar[] testString1utf32 = "\u00E4\u00F6\u00FC";
        char[] refString1utf8 = "\u00C4\u00D6\u00DC";
        wchar[] refString1utf16 = "\u00C4\u00D6\u00DC";
        dchar[] refString1utf32 = "\u00C4\u00D6\u00DC";
        char[] resultString1utf8 = toUpper(testString1utf8);
        assert(resultString1utf8 == refString1utf8);
        wchar[] resultString1utf16 = toUpper(testString1utf16);
        assert(resultString1utf16 == refString1utf16);
        dchar[] resultString1utf32 = toUpper(testString1utf32);
        assert(resultString1utf32 == refString1utf32);

        // 2) Buffer passed, no resize, no SpecialCase
        char[60] buffer1utf8;
        wchar[30] buffer1utf16;
        dchar[30] buffer1utf32;
        resultString1utf8 = toUpper(testString1utf8,buffer1utf8);
        assert(resultString1utf8.ptr == buffer1utf8.ptr);
        assert(resultString1utf8 == refString1utf8);
        resultString1utf16 = toUpper(testString1utf16,buffer1utf16);
        assert(resultString1utf16.ptr == buffer1utf16.ptr);
        assert(resultString1utf16 == refString1utf16);
        resultString1utf32 = toUpper(testString1utf32,buffer1utf32);
        assert(resultString1utf32.ptr == buffer1utf32.ptr);
        assert(resultString1utf32 == refString1utf32);

        // 3) Buffer passed, resize necessary, no SpecialCase

        char[5] buffer2utf8;
        wchar[2] buffer2utf16;
        dchar[2] buffer2utf32;
        resultString1utf8 = toUpper(testString1utf8,buffer2utf8);
        assert(resultString1utf8.ptr != buffer2utf8.ptr);
        assert(resultString1utf8 == refString1utf8);
        resultString1utf16 = toUpper(testString1utf16,buffer2utf16);
        assert(resultString1utf16.ptr != buffer2utf16.ptr);
        assert(resultString1utf16 == refString1utf16);
        resultString1utf32 = toUpper(testString1utf32,buffer2utf32);
        assert(resultString1utf32.ptr != buffer2utf32.ptr);
        assert(resultString1utf32 == refString1utf32);

        // 4) Buffer passed, resize necessary, extensive SpecialCase

        char[] testString2utf8 = "\uFB03\uFB04\uFB05";
        wchar[] testString2utf16 = "\uFB03\uFB04\uFB05";
        dchar[] testString2utf32 = "\uFB03\uFB04\uFB05";
        char[] refString2utf8 = "\u0046\u0046\u0049\u0046\u0046\u004C\u0053\u0054";
        wchar[] refString2utf16 = "\u0046\u0046\u0049\u0046\u0046\u004C\u0053\u0054";
        dchar[] refString2utf32 = "\u0046\u0046\u0049\u0046\u0046\u004C\u0053\u0054";
        resultString1utf8 = toUpper(testString2utf8,buffer2utf8);
        assert(resultString1utf8.ptr != buffer2utf8.ptr);
        assert(resultString1utf8 == refString2utf8);
        resultString1utf16 = toUpper(testString2utf16,buffer2utf16);
        assert(resultString1utf16.ptr != buffer2utf16.ptr);
        assert(resultString1utf16 == refString2utf16);
        resultString1utf32 = toUpper(testString2utf32,buffer2utf32);
        assert(resultString1utf32.ptr != buffer2utf32.ptr);
        assert(resultString1utf32 == refString2utf32);

    }


    // toLower for all three encodings
    unittest {

        // 1) No Buffer passed, no resize, no SpecialCase

        char[] testString1utf8 = "\u00C4\u00D6\u00DC";
        wchar[] testString1utf16 = "\u00C4\u00D6\u00DC";
        dchar[] testString1utf32 = "\u00C4\u00D6\u00DC";
        char[] refString1utf8 = "\u00E4\u00F6\u00FC";
        wchar[] refString1utf16 = "\u00E4\u00F6\u00FC";
        dchar[] refString1utf32 = "\u00E4\u00F6\u00FC";
        char[] resultString1utf8 = toLower(testString1utf8);
        assert(resultString1utf8 == refString1utf8);
        wchar[] resultString1utf16 = toLower(testString1utf16);
        assert(resultString1utf16 == refString1utf16);
        dchar[] resultString1utf32 = toLower(testString1utf32);
        assert(resultString1utf32 == refString1utf32);

        // 2) Buffer passed, no resize, no SpecialCase
        char[60] buffer1utf8;
        wchar[30] buffer1utf16;
        dchar[30] buffer1utf32;
        resultString1utf8 = toLower(testString1utf8,buffer1utf8);
        assert(resultString1utf8.ptr == buffer1utf8.ptr);
        assert(resultString1utf8 == refString1utf8);
        resultString1utf16 = toLower(testString1utf16,buffer1utf16);
        assert(resultString1utf16.ptr == buffer1utf16.ptr);
        assert(resultString1utf16 == refString1utf16);
        resultString1utf32 = toLower(testString1utf32,buffer1utf32);
        assert(resultString1utf32.ptr == buffer1utf32.ptr);
        assert(resultString1utf32 == refString1utf32);

        // 3) Buffer passed, resize necessary, no SpecialCase

        char[5] buffer2utf8;
        wchar[2] buffer2utf16;
        dchar[2] buffer2utf32;
        resultString1utf8 = toLower(testString1utf8,buffer2utf8);
        assert(resultString1utf8.ptr != buffer2utf8.ptr);
        assert(resultString1utf8 == refString1utf8);
        resultString1utf16 = toLower(testString1utf16,buffer2utf16);
        assert(resultString1utf16.ptr != buffer2utf16.ptr);
        assert(resultString1utf16 == refString1utf16);
        resultString1utf32 = toLower(testString1utf32,buffer2utf32);
        assert(resultString1utf32.ptr != buffer2utf32.ptr);
        assert(resultString1utf32 == refString1utf32);

        // 4) Buffer passed, resize necessary, extensive SpecialCase

        char[] testString2utf8 = "\u0130\u0130\u0130";
        wchar[] testString2utf16 = "\u0130\u0130\u0130";
        dchar[] testString2utf32 = "\u0130\u0130\u0130";
        char[] refString2utf8 = "\u0069\u0307\u0069\u0307\u0069\u0307";
        wchar[] refString2utf16 = "\u0069\u0307\u0069\u0307\u0069\u0307";
        dchar[] refString2utf32 = "\u0069\u0307\u0069\u0307\u0069\u0307";
        resultString1utf8 = toLower(testString2utf8,buffer2utf8);
        assert(resultString1utf8.ptr != buffer2utf8.ptr);
        assert(resultString1utf8 == refString2utf8);
        resultString1utf16 = toLower(testString2utf16,buffer2utf16);
        assert(resultString1utf16.ptr != buffer2utf16.ptr);
        assert(resultString1utf16 == refString2utf16);
        resultString1utf32 = toLower(testString2utf32,buffer2utf32);
        assert(resultString1utf32.ptr != buffer2utf32.ptr);
        assert(resultString1utf32 == refString2utf32);
    }

    // toFold: strings differing only in case must fold to the same result
    unittest {
        char[] testString1utf8 = "?!Mädchen \u0390\u0390,;";
        char[] testString2utf8 = "?!MÄDCHEN \u03B9\u0308\u0301\u03B9\u0308\u0301,;";
        assert(toFold(testString1utf8) == toFold(testString2utf8));
        wchar[] testString1utf16 = "?!Mädchen \u0390\u0390,;";
        wchar[] testString2utf16 = "?!MÄDCHEN \u03B9\u0308\u0301\u03B9\u0308\u0301,;";
        assert(toFold(testString1utf16) == toFold(testString2utf16));
        // These were declared as wchar[], so the dchar[] overload of toFold
        // was never exercised; use real UTF-32 strings.
        dchar[] testString1utf32 = "?!Mädchen \u0390\u0390,;";
        dchar[] testString2utf32 = "?!MÄDCHEN \u03B9\u0308\u0301\u03B9\u0308\u0301,;";
        assert(toFold(testString1utf32) == toFold(testString2utf32));
    }

}
|