comparison lphobos/std/string.d @ 108:288fe1029e1f trunk

[svn r112] Fixed 'case 1,2,3:' style case statements. Fixed a bunch of bugs with return/break/continue in loops. Fixed support for the DMDFE hidden implicit return value variable. This can be needed for some foreach statements where the loop body is converted to a nested delegate, but also possibly returns from the function. Added std.math to phobos. Added AA runtime support code, done ground work for implementing AAs. Several other bugfixes.
author lindquist
date Tue, 20 Nov 2007 05:29:20 +0100
parents
children 373489eeaf90
comparison
equal deleted inserted replaced
107:3efbcc81ba45 108:288fe1029e1f
1
2 // Written in the D programming language.
3
4 /**
5 * String handling functions.
6 *
7 * To copy or not to copy?
8 * When a function takes a string as a parameter, and returns a string,
9 * is that string the same as the input string, modified in place, or
10 * is it a modified copy of the input string? The D array convention is
11 * "copy-on-write". This means that if no modifications are done, the
12 * original string (or slices of it) can be returned. If any modifications
13 * are done, the returned string is a copy.
14 *
15 * Macros:
16 * WIKI = Phobos/StdString
17 * Copyright:
18 * Public Domain
19 */
20
21 /* Author:
22 * Walter Bright, Digital Mars, www.digitalmars.com
23 */
24
25 // The code is not optimized for speed, that will have to wait
26 // until the design is solidified.
27
28 module std.string;
29
30 //debug=string; // uncomment to turn on debugging printf's
31
32 //private import std.stdio;
33 private import std.c.stdio;
34 private import std.c.stdlib;
35 private import std.c.string;
36 private import std.utf;
37 private import std.uni;
38 private import std.array;
39 private import std.format;
40 private import std.ctype;
41 private import std.stdarg;
42
43 extern (C)
44 {
45
46 size_t wcslen(wchar *);
47 int wcscmp(wchar *, wchar *);
48 }
49
50 /* ************* Exceptions *************** */
51
/**
 * Exception class used to report errors raised by string functions.
 */
class StringException : Exception
{
    /// Constructor; msg is passed through to the Exception base class.
    this(char[] msg)
    {
        super(msg);
    }
}
60
61 /* ************* Constants *************** */
62
const char[16] hexdigits  = "0123456789ABCDEF";             /// 0..9A..F
const char[10] digits     = "0123456789";                   /// 0..9
const char[8]  octdigits  = "01234567";                     /// 0..7
const char[26] lowercase  = "abcdefghijklmnopqrstuvwxyz";   /// a..z
const char[26] uppercase  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";   /// A..Z
const char[52] letters    = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                            "abcdefghijklmnopqrstuvwxyz";   /// A..Za..z
const char[6]  whitespace = " \t\v\r\n\f";                  /// ASCII whitespace

const dchar LS = '\u2028';      /// UTF line separator
const dchar PS = '\u2029';      /// UTF paragraph separator

/// Newline sequence for this system: CR-LF on Windows, LF on every other
/// target. The non-Windows case is a plain else so that newline is always
/// defined, not only when version (linux) is set.
version (Windows)
    const char[2] newline = "\r\n";
else
    const char[1] newline = "\n";
80
/**********************************
 * Tests whether c is a whitespace character.
 *
 * For c up to 0x7F the answer is whether c occurs in whitespace[];
 * above that only the line separator LS and the paragraph separator PS
 * count as whitespace.
 */

bool iswhite(dchar c)
{
    if (c > 0x7F)
        return c == PS || c == LS;
    return find(whitespace, c) != -1;
}
91
/*********************************
 * Convert string to integer.
 *
 * s is first turned into a 0 terminated string with toStringz(), then
 * handed to std.c.stdlib.atoi(); that function's result is returned
 * as a long.
 */

long atoi(char[] s)
{
    char* z = toStringz(s);
    long value = std.c.stdlib.atoi(z);
    return value;
}
100
/*************************************
 * Convert string to real.
 *
 * s is turned into a 0 terminated string with toStringz() and parsed
 * by strtold().
 */

real atof(char[] s)
{
    char* stop;     // filled in by strtold(); not examined afterwards

    return strtold(toStringz(s), &stop);
}
111
/**********************************
 * Compare two strings. cmp is case sensitive, icmp is case insensitive.
 *
 * The bytes the two strings have in common are compared with memcmp();
 * when those are equal the difference of the lengths decides.
 * Returns:
 *	<table border=1 cellpadding=4 cellspacing=0>
 *	$(TR $(TD < 0)	$(TD s1 < s2))
 *	$(TR $(TD = 0)	$(TD s1 == s2))
 *	$(TR $(TD > 0)	$(TD s1 > s2))
 *	</table>
 */

int cmp(char[] s1, char[] s2)
{
    // Number of bytes present in both strings
    size_t common = (s2.length < s1.length) ? s2.length : s1.length;

    int order = memcmp(s1.ptr, s2.ptr, common);
    if (order != 0)
        return order;

    // Same leading bytes: the lengths decide
    return cast(int)s1.length - cast(int)s2.length;
}
135
/*********************************
 * Case insensitive comparison of two strings.
 *
 * On Win32 the bytes the two strings have in common are compared with
 * memicmp(). On every other target they are compared byte by byte with
 * the ASCII letters A..Z folded to a..z; bytes outside that range are
 * compared unchanged. If the common bytes compare equal, the difference
 * of the lengths decides.
 * Params:
 *	s1 = left hand string
 *	s2 = right hand string
 * Returns:
 *	< 0 if s1 < s2, 0 if they are equal, > 0 if s1 > s2
 */

int icmp(char[] s1, char[] s2)
{
    auto len = s1.length;
    int result;

    if (s2.length < len)
        len = s2.length;
    version (Win32)
    {
        result = memicmp(s1.ptr, s2.ptr, len);
    }
    else
    {
        // Portable fallback. This is an else branch rather than
        // version (linux) so that targets which are neither Win32 nor
        // linux still compare the bytes, not just the lengths.
        for (size_t i = 0; i < len; i++)
        {
            if (s1[i] != s2[i])
            {
                char c1 = s1[i];
                char c2 = s2[i];

                if (c1 >= 'A' && c1 <= 'Z')
                    c1 += cast(int)'a' - cast(int)'A';
                if (c2 >= 'A' && c2 <= 'Z')
                    c2 += cast(int)'a' - cast(int)'A';
                result = cast(int)c1 - cast(int)c2;
                if (result)
                    break;
            }
        }
    }
    if (result == 0)
        result = cast(int)s1.length - cast(int)s2.length;
    return result;
}

unittest
{
    int result;

    debug(string) printf("string.cmp.unittest\n");
    result = icmp("abc", "abc");
    assert(result == 0);
    result = icmp(null, null);
    assert(result == 0);
    result = icmp("", "");
    assert(result == 0);
    result = icmp("abc", "abcd");
    assert(result < 0);
    result = icmp("abcd", "abc");
    assert(result > 0);
    result = icmp("abc", "abd");
    assert(result < 0);
    result = icmp("bbc", "abc");
    assert(result > 0);

    // Differences in ASCII case must not matter
    result = icmp("aBc", "AbC");
    assert(result == 0);
    result = icmp("ABC", "abd");
    assert(result < 0);
}
195
/* ********************************
 * Converts a D array of chars to a C-style 0 terminated string.
 * This is a thin forwarder kept for old callers.
 * Deprecated: replaced with toStringz().
 */

deprecated char* toCharz(char[] s)
{
    char* z = toStringz(s);
    return z;
}
205
/*********************************
 * Convert array of chars s[] to a C-style 0 terminated string.
 *
 * An empty s yields the string literal "". Anything else is copied into
 * a newly allocated array that is one char longer than s and ends in 0.
 * The out contract checks that strlen() of the result equals s.length.
 */

char* toStringz(char[] s)
    out (result)
    {
        if (result)
        {
            assert(strlen(result) == s.length);
            assert(memcmp(result, s.ptr, s.length) == 0);
        }
    }
    body
    {
        auto n = s.length;

        if (n == 0)
            return "";

        /* A tempting shortcut is to peek at the char just past the end
         * of s[] and hand s back unchanged when it is already 0 (the
         * compiler puts a 0 after static strings, and the storage
         * allocator puts one after newly allocated char[]'s).
         * Unfortunately, this isn't reliable. It could be made to work
         * if string literals were put in read-only memory and we tested
         * whether s[] points into that. So a copy is always made.
         */
        char[] buf = new char[n + 1];
        buf[0 .. n] = s;
        buf[n] = 0;
        return buf.ptr;
    }

unittest
{
    debug(string) printf("string.toStringz.unittest\n");

    char* p = toStringz("foo");
    assert(strlen(p) == 3);

    char[] foo = "abbzxyzzy";
    p = toStringz(foo[3..5]);
    assert(strlen(p) == 2);

    char[] test = "";
    p = toStringz(test);
    assert(*p == 0);
}
264
/******************************************
 * find, ifind _find first occurrence of c in string s.
 * rfind, irfind _find last occurrence of c in string s.
 *
 * find, rfind are case sensitive; ifind, irfind are case insensitive.
 * Returns:
 *	Index in s where c is found, -1 if not found.
 */

int find(char[] s, dchar c)
{
    if (c > 0x7F)
    {
        // c is a universal character: walk s one decoded character at
        // a time; the foreach index is the position in s
        foreach (int i, dchar c2; s)
        {
            if (c == c2)
                return i;
        }
        return -1;
    }

    // Plain old ASCII: a byte search with memchr() does the job
    auto hit = cast(char*)memchr(s.ptr, c, s.length);
    if (!hit)
        return -1;
    return hit - cast(char *)s;
}

unittest
{
    debug(string) printf("string.find.unittest\n");

    assert(find(null, cast(dchar)'a') == -1);
    assert(find("def", cast(dchar)'a') == -1);
    assert(find("abba", cast(dchar)'a') == 0);
    assert(find("def", cast(dchar)'f') == 2);
}
309
310
/******************************************
 * Case insensitive search for the first occurrence of c in s.
 *
 * If c is at most 0x7F, every char of s is lower-cased with
 * std.ctype.tolower() and compared against the lower-cased c.
 * Otherwise s is decoded one character at a time and both sides are
 * lower-cased with std.uni.toUniLower().
 * Params:
 *	s = string to search
 *	c = character to look for
 * Returns:
 *	Index in s where c is found, -1 if not found.
 */

int ifind(char[] s, dchar c)
{
    if (c <= 0x7F)
    {   // Plain old ASCII
        char c1 = cast(char) std.ctype.tolower(c);

        foreach (int i, char c2; s)
        {
            c2 = cast(char)std.ctype.tolower(c2);
            if (c1 == c2)
                return i;
        }
    }
    else
    {   // c is a universal character
        dchar c1 = std.uni.toUniLower(c);

        foreach (int i, dchar c2; s)
        {
            c2 = std.uni.toUniLower(c2);
            if (c1 == c2)
                return i;
        }
    }
    return -1;
}

unittest
{
    debug(string) printf("string.ifind.unittest\n");

    int i;

    i = ifind(null, cast(dchar)'a');
    assert(i == -1);
    i = ifind("def", cast(dchar)'a');
    assert(i == -1);
    i = ifind("Abba", cast(dchar)'a');
    assert(i == 0);
    i = ifind("def", cast(dchar)'F');
    assert(i == 2);

    char[] sPlts = "Mars: the fourth Rock (Planet) from the Sun.";

    i = ifind("def", cast(char)'f');
    assert(i == 2);

    i = ifind(sPlts, cast(char)'P');
    assert(i == 23);
    i = ifind(sPlts, cast(char)'R');
    assert(i == 2);
}
369
370
/******************************************
 * Case sensitive search for the last occurrence of c in s.
 *
 * If c is at most 0x7F, s is scanned backwards one char at a time.
 * Otherwise c is encoded with std.utf.toUTF8() and the search is handed
 * to the rfind() overload that takes a string.
 * Returns:
 *	Index in s where c is found, -1 if not found.
 */

int rfind(char[] s, dchar c)
{
    if (c > 0x7F)
    {
        // c is a universal character
        char[4] buf;
        char[] encoded = std.utf.toUTF8(buf, c);
        return rfind(s, encoded);
    }

    // Plain old ASCII
    for (size_t pos = s.length; pos-- != 0; )
    {
        if (s[pos] == c)
            return cast(int) pos;
    }
    return -1;
}

unittest
{
    debug(string) printf("string.rfind.unittest\n");

    assert(rfind(null, cast(dchar)'a') == -1);
    assert(rfind("def", cast(dchar)'a') == -1);
    assert(rfind("abba", cast(dchar)'a') == 3);
    assert(rfind("def", cast(dchar)'f') == 2);
}
411
/******************************************
 * Case insensitive search for the last occurrence of c in s.
 *
 * If c is at most 0x7F, s is scanned backwards with each char
 * lower-cased by std.ctype.tolower(). Otherwise s is scanned backwards
 * for bytes that start a multi-byte sequence; each one is decoded with
 * std.utf.decode() and compared after std.uni.toUniLower().
 * Returns:
 *	Index in s where c is found, -1 if not found.
 */

int irfind(char[] s, dchar c)
{
    if (c <= 0x7F)
    {
        // Plain old ASCII
        char asciiTarget = cast(char) std.ctype.tolower(c);

        for (size_t pos = s.length; pos-- != 0; )
        {
            char here = cast(char) std.ctype.tolower(s[pos]);
            if (asciiTarget == here)
                return cast(int) pos;
        }
        return -1;
    }

    // c is a universal character
    dchar uniTarget = std.uni.toUniLower(c);

    for (size_t pos = s.length; pos-- != 0; )
    {
        char lead = s[pos];

        // ASCII cannot match since c is not ASCII, and non-starting
        // UTF-8 chars are not where a character begins
        if (lead <= 0x7F || (lead & 0xC0) == 0x80)
            continue;

        size_t next = pos;
        dchar decoded = std.utf.decode(s, next);
        if (uniTarget == std.uni.toUniLower(decoded))
            return cast(int) pos;
    }
    return -1;
}

unittest
{
    debug(string) printf("string.irfind.unittest\n");

    assert(irfind(null, cast(dchar)'a') == -1);
    assert(irfind("def", cast(dchar)'a') == -1);
    assert(irfind("AbbA", cast(dchar)'a') == 3);
    assert(irfind("def", cast(dchar)'F') == 2);

    char[] sPlts = "Mars: the fourth Rock (Planet) from the Sun.";

    assert(irfind("def", cast(char)'f') == 2);

    assert(irfind(sPlts, cast(char)'M') == 34);
    assert(irfind(sPlts, cast(char)'S') == 40);
}
479
480
/******************************************
 * find, ifind _find first occurrence of sub[] in string s[].
 * rfind, irfind _find last occurrence of sub[] in string s[].
 *
 * find, rfind are case sensitive; ifind, irfind are case insensitive.
 *
 * An empty sub[] is found at index 0. A one char sub[] is located with
 * a single memchr(); longer ones are located by finding the first char
 * with memchr() and comparing the rest with memcmp().
 * Params:
 *	s = string to search
 *	sub = string to look for
 * Returns:
 *	Index in s where sub is found, -1 if not found.
 */

int find(char[] s, char[] sub)
    out (result)
    {
        if (result != -1)
        {
            assert(0 <= result && result < s.length - sub.length + 1);
            // s.ptr is used rather than &s[result]: when both s and sub
            // are empty the result is 0 and s[0] would be out of bounds.
            assert(memcmp(s.ptr + result, sub.ptr, sub.length) == 0);
        }
    }
    body
    {
        auto sublength = sub.length;

        if (sublength == 0)
            return 0;

        if (s.length >= sublength)
        {
            auto c = sub[0];
            if (sublength == 1)
            {
                auto p = cast(char*)memchr(s.ptr, c, s.length);
                if (p)
                    return p - &s[0];
            }
            else
            {
                // One past the last index at which sub[] can still fit
                size_t imax = s.length - sublength + 1;

                // Remainder of sub[]
                char *q = &sub[1];
                sublength--;

                for (size_t i = 0; i < imax; i++)
                {
                    char *p = cast(char*)memchr(&s[i], c, imax - i);
                    if (!p)
                        break;
                    // Jump to the candidate; the loop's i++ moves past
                    // it if the remainder does not match
                    i = p - &s[0];
                    if (memcmp(p + 1, q, sublength) == 0)
                        return i;
                }
            }
        }
        return -1;
    }


unittest
{
    debug(string) printf("string.find.unittest\n");

    int i;

    i = find(null, "a");
    assert(i == -1);
    i = find("def", "a");
    assert(i == -1);
    i = find("abba", "a");
    assert(i == 0);
    i = find("def", "f");
    assert(i == 2);
    i = find("dfefffg", "fff");
    assert(i == 3);
    i = find("dfeffgfff", "fff");
    assert(i == 6);

    // An empty sub[] is found at 0, even in an empty s[]
    i = find("abc", "");
    assert(i == 0);
    i = find("", "");
    assert(i == 0);
}
560
/******************************************
 * Case insensitive search for the first occurrence of sub[] in s[].
 *
 * An empty sub[] is found at index 0. A one char sub[] is handed to the
 * single character ifind(). When sub[] starts with a char up to 0x7F,
 * candidates are located with the single character ifind() and the
 * rest is checked with icmp(); otherwise every position is checked with
 * icmp().
 * Returns:
 *	Index in s where sub is found, -1 if not found.
 */

int ifind(char[] s, char[] sub)
    out (result)
    {
        if (result != -1)
        {
            assert(0 <= result && result < s.length - sub.length + 1);
            assert(icmp(s[result .. result + sub.length], sub) == 0);
        }
    }
    body
    {
        auto sublength = sub.length;

        if (sublength == 0)
            return 0;

        if (s.length < sublength)
            return -1;

        auto c = sub[0];

        if (sublength == 1)
            return ifind(s, c);

        if (c <= 0x7F)
        {
            // One past the last index at which sub[] can still fit
            size_t imax = s.length - sublength + 1;

            // Remainder of sub[]
            char[] subn = sub[1 .. sublength];

            for (int i = 0; i < imax; i++)
            {
                // Next place the first char occurs, relative to i
                auto j = ifind(s[i .. imax], c);
                if (j == -1)
                    return -1;
                i += j;
                if (icmp(s[i + 1 .. i + sublength], subn) == 0)
                    return i;
            }
            return -1;
        }

        // sub[] does not start with ASCII: try every position
        size_t last = s.length - sublength;

        for (int i = 0; i <= last; i++)
        {
            if (icmp(s[i .. i + sublength], sub) == 0)
                return i;
        }
        return -1;
    }


unittest
{
    debug(string) printf("string.ifind.unittest\n");

    assert(ifind(null, "a") == -1);
    assert(ifind("def", "a") == -1);
    assert(ifind("abba", "a") == 0);
    assert(ifind("def", "f") == 2);
    assert(ifind("dfefffg", "fff") == 3);
    assert(ifind("dfeffgfff", "fff") == 6);

    char[] sPlts = "Mars: the fourth Rock (Planet) from the Sun.";
    char[] sMars = "Who\'s \'My Favorite Maritian?\'";

    assert(ifind(sMars, "MY fAVe") == -1);
    assert(ifind(sMars, "mY fAVOriTe") == 7);
    assert(ifind(sPlts, "mArS:") == 0);
    assert(ifind(sPlts, "rOcK") == 17);
    assert(ifind(sPlts, "Un.") == 41);
    assert(ifind(sPlts, sPlts) == 0);

    assert(ifind("\u0100", "\u0100") == 0);

    // Thanks to Carlos Santander B. and zwang
    assert(ifind("sus mejores cortesanos. Se embarcaron en el puerto de Dubai y",
        "page-break-before") == -1);
}
669
/******************************************
 * Case sensitive search for the last occurrence of sub[] in s[].
 *
 * An empty sub[] is found at s.length. A one char sub[] is handed to
 * the single character rfind(). Otherwise s[] is scanned backwards from
 * the last position where sub[] still fits.
 * Params:
 *	s = string to search
 *	sub = string to look for
 * Returns:
 *	Index in s where sub is found, -1 if not found.
 */

int rfind(char[] s, char[] sub)
    out (result)
    {
        if (result != -1)
        {
            assert(0 <= result && result < s.length - sub.length + 1);
            // s.ptr is used rather than &s[0]: when both s and sub are
            // empty the result is 0 and s[0] would be out of bounds.
            assert(memcmp(s.ptr + result, sub.ptr, sub.length) == 0);
        }
    }
    body
    {
        if (sub.length == 0)
            return s.length;

        // State the "cannot fit" case outright instead of leaving it to
        // s.length - sub.length wrapping around to a negative int.
        if (sub.length > s.length)
            return -1;

        char c = sub[0];
        if (sub.length == 1)
            return rfind(s, c);

        // i runs from s.length - sub.length down to 0
        for (size_t i = s.length - sub.length + 1; i-- != 0; )
        {
            if (s[i] == c &&
                memcmp(&s[i + 1], &sub[1], sub.length - 1) == 0)
                return i;
        }
        return -1;
    }

unittest
{
    int i;

    debug(string) printf("string.rfind.unittest\n");
    i = rfind("abcdefcdef", "c");
    assert(i == 6);
    i = rfind("abcdefcdef", "cd");
    assert(i == 6);
    i = rfind("abcdefcdef", "x");
    assert(i == -1);
    i = rfind("abcdefcdef", "xy");
    assert(i == -1);
    i = rfind("abcdefcdef", "");
    assert(i == 10);

    // sub[] longer than s[], and both empty
    i = rfind("ab", "abc");
    assert(i == -1);
    i = rfind("", "");
    assert(i == 0);
}
722
723
/******************************************
 * Case insensitive search for the last occurrence of sub[] in s[].
 *
 * An empty sub[] is found at s.length. A one char sub[] is handed to
 * the single character irfind(). Otherwise s[] is scanned backwards
 * from the last position where sub[] still fits, using icmp() for the
 * comparison.
 * Params:
 *	s = string to search
 *	sub = string to look for
 * Returns:
 *	Index in s where sub is found, -1 if not found.
 */

int irfind(char[] s, char[] sub)
    out (result)
    {
        if (result != -1)
        {
            assert(0 <= result && result < s.length - sub.length + 1);
            assert(icmp(s[result .. result + sub.length], sub) == 0);
        }
    }
    body
    {
        if (sub.length == 0)
            return s.length;

        // State the "cannot fit" case outright instead of leaving it to
        // s.length - sub.length wrapping around to a negative int.
        if (sub.length > s.length)
            return -1;

        dchar c = sub[0];
        if (sub.length == 1)
            return irfind(s, c);

        if (c <= 0x7F)
        {
            // Cheap test on the first char before calling icmp()
            c = std.ctype.tolower(c);
            for (size_t i = s.length - sub.length + 1; i-- != 0; )
            {
                if (std.ctype.tolower(s[i]) == c &&
                    icmp(s[i + 1 .. i + sub.length], sub[1 .. sub.length]) == 0)
                    return i;
            }
        }
        else
        {
            for (size_t i = s.length - sub.length + 1; i-- != 0; )
            {
                if (icmp(s[i .. i + sub.length], sub) == 0)
                    return i;
            }
        }
        return -1;
    }

unittest
{
    int i;

    debug(string) printf("string.irfind.unittest\n");
    i = irfind("abcdefCdef", "c");
    assert(i == 6);
    i = irfind("abcdefCdef", "cD");
    assert(i == 6);
    i = irfind("abcdefcdef", "x");
    assert(i == -1);
    i = irfind("abcdefcdef", "xy");
    assert(i == -1);
    i = irfind("abcdefcdef", "");
    assert(i == 10);

    char[] sPlts = "Mars: the fourth Rock (Planet) from the Sun.";
    char[] sMars = "Who\'s \'My Favorite Maritian?\'";

    i = irfind("abcdefcdef", "c");
    assert(i == 6);
    i = irfind("abcdefcdef", "cd");
    assert(i == 6);
    i = irfind( "abcdefcdef", "def" );
    assert(i == 7);

    i = irfind(sMars, "RiTE maR");
    assert(i == 14);
    i = irfind(sPlts, "FOuRTh");
    assert(i == 10);
    i = irfind(sMars, "whO\'s \'MY");
    assert(i == 0);
    i = irfind(sMars, sMars);
    assert(i == 0);

    // sub[] longer than s[]
    i = irfind("ab", "ABC");
    assert(i == -1);
}
807
808
/************************************
 * Convert string s[] to lower case.
 *
 * s is returned as is when nothing needs converting; otherwise a
 * converted copy is returned. Chars up to 0x7F are handled directly;
 * from the first char above 0x7F on, the rest of s is decoded and
 * converted with std.uni.toUniLower().
 */

string tolower(string s)
{
    // 0: no copy made yet
    // 1: r is a copy of all of s, patched in place
    // 2: r holds the converted front of s and grows by appending
    int state;
    char[] r;

    for (size_t i = 0; i < s.length; i++)
    {
        auto c = s[i];

        if (c >= 'A' && c <= 'Z')
        {
            if (state == 0)
            {
                r = s.dup;
                state = 1;
            }
            r[i] = cast(char) (c + (cast(char)'a' - 'A'));
        }
        else if (c > 0x7F)
        {
            // The rest of s is handled here, one decoded character at
            // a time, then the outer loop is left
            foreach (size_t j, dchar dc; s[i .. s.length])
            {
                if (std.uni.isUniUpper(dc))
                {
                    dc = std.uni.toUniLower(dc);
                    if (state == 0)
                    {
                        r = s[0 .. i + j].dup;
                        state = 2;
                    }
                }
                if (state != 0)
                {
                    if (state == 1)
                    {
                        // Keep only what precedes this character
                        r = r[0 .. i + j];
                        state = 2;
                    }
                    std.utf.encode(r, dc);
                }
            }
            break;
        }
    }
    return state ? r : s;
}

unittest
{
    debug(string) printf("string.tolower.unittest\n");

    char[] s1 = "FoL";
    char[] s2;

    s2 = tolower(s1);
    assert(s2 == "fol");
    assert(s2 != s1);

    s1 = "A\u0100B\u0101d";
    s2 = tolower(s1);
    assert(s2 == "a\u0101b\u0101d");
    assert(s2 !is s1);

    s1 = "A\u0460B\u0461d";
    s2 = tolower(s1);
    assert(s2 == "a\u0461b\u0461d");
    assert(s2 !is s1);

    s1 = "\u0130";
    s2 = tolower(s1);
    assert(s2 == "i");
    assert(s2 !is s1);
}
884
/************************************
 * Convert string s[] to upper case.
 *
 * s is returned as is when nothing needs converting; otherwise a
 * converted copy is returned. Chars up to 0x7F are handled directly;
 * from the first char above 0x7F on, the rest of s is decoded and
 * converted with std.uni.toUniUpper().
 */

string toupper(string s)
{
    // 0: no copy made yet
    // 1: r is a copy of all of s, patched in place
    // 2: r holds the converted front of s and grows by appending
    int state;
    char[] r;

    for (size_t i = 0; i < s.length; i++)
    {
        auto c = s[i];

        if (c >= 'a' && c <= 'z')
        {
            if (state == 0)
            {
                r = s.dup;
                state = 1;
            }
            r[i] = cast(char) (c - (cast(char)'a' - 'A'));
        }
        else if (c > 0x7F)
        {
            // The rest of s is handled here, one decoded character at
            // a time, then the outer loop is left
            foreach (size_t j, dchar dc; s[i .. s.length])
            {
                if (std.uni.isUniLower(dc))
                {
                    dc = std.uni.toUniUpper(dc);
                    if (state == 0)
                    {
                        r = s[0 .. i + j].dup;
                        state = 2;
                    }
                }
                if (state != 0)
                {
                    if (state == 1)
                    {
                        // Keep only what precedes this character
                        r = r[0 .. i + j];
                        state = 2;
                    }
                    std.utf.encode(r, dc);
                }
            }
            break;
        }
    }
    return state ? r : s;
}

unittest
{
    debug(string) printf("string.toupper.unittest\n");

    char[] s1 = "FoL";
    char[] s2;

    s2 = toupper(s1);
    assert(s2 == "FOL");
    assert(s2 !is s1);

    s1 = "a\u0100B\u0101d";
    s2 = toupper(s1);
    assert(s2 == "A\u0100B\u0100D");
    assert(s2 !is s1);

    s1 = "a\u0460B\u0461d";
    s2 = toupper(s1);
    assert(s2 == "A\u0460B\u0460D");
    assert(s2 !is s1);
}
955
956
/********************************************
 * Capitalize first character of string s[], convert rest of string s[]
 * to lower case.
 *
 * s is decoded one character at a time. The first character goes
 * through std.uni.toUniUpper(), all others through std.uni.toUniLower().
 * Params:
 *	s = string to capitalize
 * Returns:
 *	s itself if no character had to be changed, otherwise a new string.
 */

char[] capitalize(char[] s)
{
    bool changed = false;
    char[] r = s;

    foreach (size_t i, dchar c; s)
    {
        dchar c2;

        if (i == 0)
        {
            c2 = std.uni.toUniUpper(c);
            if (c != c2)
            {
                // Nothing precedes the first character, so start empty
                changed = true;
                r = null;
            }
        }
        else
        {
            c2 = std.uni.toUniLower(c);
            if (c != c2 && !changed)
            {
                // First change: keep a copy of everything before it
                changed = true;
                r = s[0 .. i].dup;
            }
        }
        // Once a copy exists, every character is appended to it
        if (changed)
            std.utf.encode(r, c2);
    }
    return r;
}


unittest
{
    debug(string) printf("string.capitalize.unittest\n");

    char[] s1 = "FoL";
    char[] s2;

    s2 = capitalize(s1);
    assert(s2 == "Fol");
    assert(s2 !is s1);

    s2 = capitalize(s1[0 .. 2]);
    assert(s2 == "Fo");
    assert(s2.ptr == s1.ptr);

    s1 = "fOl";
    s2 = capitalize(s1);
    assert(s2 == "Fol");
    assert(s2 !is s1);
}
1020
1021
/********************************************
 * Capitalize all words in string s[].
 * Remove leading and trailing whitespace.
 * Replace all sequences of whitespace with a single space.
 *
 * Words are runs of chars other than space, \t, \f, \r, \n and \v;
 * each one is passed through capitalize().
 */

char[] capwords(char[] s)
{
    char[] r;
    bool inword = false;
    size_t istart = 0;
    size_t i;

    for (i = 0; i < s.length; i++)
    {
        char ch = s[i];
        bool blank = ch == ' '  || ch == '\t' || ch == '\f' ||
                     ch == '\r' || ch == '\n' || ch == '\v';

        if (blank)
        {
            // Whitespace right after a word ends that word
            if (inword)
            {
                r ~= capitalize(s[istart .. i]);
                inword = false;
            }
        }
        else if (!inword)
        {
            // A new word starts; separate it from the previous one
            if (r.length)
                r ~= ' ';
            istart = i;
            inword = true;
        }
    }

    // A word may run up to the end of s
    if (inword)
        r ~= capitalize(s[istart .. i]);

    return r;
}


unittest
{
    debug(string) printf("string.capwords.unittest\n");

    char[] s1 = "\tfoo abc(aD)* \t (q PTT ";
    char[] s2;

    s2 = capwords(s1);
    assert(cmp(s2, "Foo Abc(ad)* (q Ptt") == 0);
}
1083
/********************************************
 * Return a string that consists of s[] repeated n times.
 *
 * For n == 0 the result is null, for n == 1 it is s itself; otherwise
 * a new array of n * s.length chars is filled.
 */

char[] repeat(char[] s, size_t n)
{
    if (n == 0)
        return null;
    if (n == 1)
        return s;

    auto len = s.length;
    char[] r = new char[n * len];

    if (len == 1)
    {
        // Single char: fill the whole array in one go
        r[] = s[0];
        return r;
    }

    // Copy s into each len-sized slot
    for (size_t at = 0; at < r.length; at += len)
        r[at .. at + len] = s[];
    return r;
}


unittest
{
    debug(string) printf("string.repeat.unittest\n");

    char[] s;

    s = repeat("1234", 0);
    assert(s is null);
    s = repeat("1234", 1);
    assert(s == "1234");
    s = repeat("1234", 2);
    assert(s == "12341234");
    s = repeat("1", 4);
    assert(s == "1111");
    s = repeat(null, 4);
    assert(s is null);
}
1126
1127
/********************************************
 * Concatenate all the strings in words[] together into one
 * string; use sep[] as the separator.
 *
 * The total length is computed first so the result is allocated once.
 * Params:
 *	words = strings to concatenate
 *	sep = separator placed between adjacent words
 * Returns:
 *	The joined string; an empty words[] yields an empty result.
 */

char[] join(char[][] words, char[] sep)
{
    char[] result;

    if (words.length)
    {
        // Total length: all words plus one separator between each pair
        size_t len = 0;
        for (size_t i = 0; i < words.length; i++)
            len += words[i].length;

        auto seplen = sep.length;
        len += (words.length - 1) * seplen;

        result = new char[len];

        size_t j = 0;       // write position in result
        for (size_t i = 0; i < words.length; i++)
        {
            if (i != 0)
            {
                result[j .. j + seplen] = sep;
                j += seplen;
            }

            // size_t like every other length here, so that a word's
            // length is never narrowed
            size_t wlen = words[i].length;

            result[j .. j + wlen] = words[i];
            j += wlen;
        }
        assert(j == len);
    }
    return result;
}

unittest
{
    debug(string) printf("string.join.unittest\n");

    char[] word1 = "peter";
    char[] word2 = "paul";
    char[] word3 = "jerry";
    char[][3] words;
    char[] r;

    words[0] = word1;
    words[1] = word2;
    words[2] = word3;
    r = join(words, ",");
    assert(r == "peter,paul,jerry");
}
1187
1188
/**************************************
 * Split s[] into an array of words,
 * using whitespace as the delimiter.
 *
 * Whitespace here is space, \t, \f, \r, \n and \v. The words are slices
 * of s; runs of whitespace never produce empty words.
 */

char[][] split(char[] s)
{
    char[][] words;
    bool inword = false;
    size_t istart = 0;
    size_t i;

    for (i = 0; i < s.length; i++)
    {
        char ch = s[i];
        bool blank = ch == ' '  || ch == '\t' || ch == '\f' ||
                     ch == '\r' || ch == '\n' || ch == '\v';

        if (blank)
        {
            // Whitespace right after a word ends that word
            if (inword)
            {
                words ~= s[istart .. i];
                inword = false;
            }
        }
        else if (!inword)
        {
            istart = i;
            inword = true;
        }
    }

    // A word may run up to the end of s
    if (inword)
        words ~= s[istart .. i];
    return words;
}

unittest
{
    debug(string) printf("string.split1\n");

    char[] s = " peter paul\tjerry ";
    char[][] words;

    words = split(s);
    assert(words.length == 3);
    assert(words[0] == "peter");
    assert(words[1] == "paul");
    assert(words[2] == "jerry");
}
1248
1249
/**************************************
 * Split s[] into an array of words,
 * using delim[] as the delimiter.
 *
 * delim[] must not be empty. An empty s[] gives no words. The words are
 * counted in a first pass so the array is sized once, then filled in a
 * second pass with slices of s. A delimiter at the very start or end of
 * s produces an empty word there.
 */

char[][] split(char[] s, char[] delim)
    in
    {
        assert(delim.length > 0);
    }
    body
    {
        char[][] words;

        if (s.length == 0)
            return words;

        if (delim.length == 1)
        {
            // Single char delimiter: search with memchr()
            char c = delim[0];
            char* base = &s[0];
            char* pend = base + s.length;

            // Pass 1: count the words
            size_t nwords = 0;
            char* p = base;
            while (true)
            {
                nwords++;
                p = cast(char*)memchr(p, c, pend - p);
                if (!p)
                    break;
                p++;
                if (p == pend)
                {
                    // Delimiter was the last char: one more, empty, word
                    nwords++;
                    break;
                }
            }
            words.length = nwords;

            // Pass 2: slice the words out of s
            int wordi = 0;
            size_t start = 0;
            while (true)
            {
                p = cast(char*)memchr(&s[start], c, s.length - start);
                if (!p)
                {
                    words[wordi] = s[start .. s.length];
                    break;
                }
                size_t stop = p - base;
                words[wordi] = s[start .. stop];
                wordi++;
                start = stop + 1;
                if (start == s.length)
                {
                    words[wordi] = "";
                    break;
                }
            }
            assert(wordi + 1 == nwords);
        }
        else
        {
            // Longer delimiter: search with find()
            size_t start = 0;

            // Pass 1: count the words
            size_t nwords = 0;
            while (true)
            {
                nwords++;
                int at = find(s[start .. s.length], delim);
                if (at == -1)
                    break;
                start += at + delim.length;
                if (start == s.length)
                {
                    // Delimiter ended s: one more, empty, word
                    nwords++;
                    break;
                }
                assert(start < s.length);
            }
            words.length = nwords;

            // Pass 2: slice the words out of s
            int wordi = 0;
            start = 0;
            while (true)
            {
                int at = find(s[start .. s.length], delim);
                if (at == -1)
                {
                    words[wordi] = s[start .. s.length];
                    break;
                }
                words[wordi] = s[start .. start + at];
                wordi++;
                start += at + delim.length;
                if (start == s.length)
                {
                    words[wordi] = "";
                    break;
                }
                assert(start < s.length);
            }
            assert(wordi + 1 == nwords);
        }
        return words;
    }

unittest
{
    debug(string) printf("string.split2\n");

    char[] s = ",peter,paul,jerry,";
    char[][] words;

    words = split(s, ",");
    assert(words.length == 5);
    assert(cmp(words[0], "") == 0);
    assert(cmp(words[1], "peter") == 0);
    assert(cmp(words[2], "paul") == 0);
    assert(cmp(words[3], "jerry") == 0);
    assert(cmp(words[4], "") == 0);

    s = s[0 .. s.length - 1];	// lop off trailing ','
    words = split(s, ",");
    assert(words.length == 4);
    assert(cmp(words[3], "jerry") == 0);

    s = s[1 .. s.length];	// lop off leading ','
    words = split(s, ",");
    assert(words.length == 3);
    assert(cmp(words[0], "peter") == 0);

    char[] s2 = ",,peter,,paul,,jerry,,";

    words = split(s2, ",,");
    assert(words.length == 5);
    assert(cmp(words[0], "") == 0);
    assert(cmp(words[1], "peter") == 0);
    assert(cmp(words[2], "paul") == 0);
    assert(cmp(words[3], "jerry") == 0);
    assert(cmp(words[4], "") == 0);

    s2 = s2[0 .. s2.length - 2];	// lop off trailing ',,'
    words = split(s2, ",,");
    assert(words.length == 4);
    assert(cmp(words[3], "jerry") == 0);

    s2 = s2[2 .. s2.length];	// lop off leading ',,'
    words = split(s2, ",,");
    assert(words.length == 3);
    assert(cmp(words[0], "peter") == 0);
}
1416
1417
/**************************************
 * Split s[] into an array of lines,
 * using CR, LF, or CR-LF as the delimiter.
 * The delimiter is not included in the line.
 *
 * The lines are counted in a first pass so the array is allocated once,
 * then filled in a second pass with slices of s. Text after the last
 * delimiter forms a final line; a delimiter at the very end of s does
 * not add an empty one.
 * Params:
 *	s = text to split
 * Returns:
 *	The lines of s, as slices of s.
 */

char[][] splitlines(char[] s)
{
    // size_t throughout, like s.length, so positions and counts are
    // never narrowed
    size_t i;
    size_t istart;
    size_t nlines;
    char[][] lines;

    // Pass 1: count the lines
    nlines = 0;
    for (i = 0; i < s.length; i++)
    {
        char c = s[i];

        if (c == '\r' || c == '\n')
        {
            nlines++;
            istart = i + 1;
            // CR-LF counts as one delimiter
            if (c == '\r' && i + 1 < s.length && s[i + 1] == '\n')
            {
                i++;
                istart++;
            }
        }
    }
    // Text left over after the last delimiter is a line too
    if (istart != i)
        nlines++;

    // Pass 2: slice the lines out of s
    lines = new char[][nlines];
    nlines = 0;
    istart = 0;
    for (i = 0; i < s.length; i++)
    {
        char c = s[i];

        if (c == '\r' || c == '\n')
        {
            lines[nlines] = s[istart .. i];
            nlines++;
            istart = i + 1;
            if (c == '\r' && i + 1 < s.length && s[i + 1] == '\n')
            {
                i++;
                istart++;
            }
        }
    }
    if (istart != i)
    {
        lines[nlines] = s[istart .. i];
        nlines++;
    }

    assert(nlines == lines.length);
    return lines;
}

unittest
{
    debug(string) printf("string.splitlines\n");

    char[] s = "\rpeter\n\rpaul\r\njerry\n";
    char[][] lines;

    lines = splitlines(s);
    assert(lines.length == 5);
    assert(lines[0].length == 0);
    assert(lines[1] == "peter");
    assert(lines[2].length == 0);
    assert(lines[3] == "paul");
    assert(lines[4] == "jerry");

    s = s[0 .. s.length - 1];	// lop off trailing \n
    lines = splitlines(s);
    assert(lines.length == 5);
    assert(lines[4] == "jerry");
}
1506
1507
/*****************************************
 * Strips leading or trailing whitespace, or both.
 *
 * stripl returns the slice of s that starts at the first char for which
 * std.ctype.isspace() is false; if there is no such char the slice is
 * empty.
 */

char[] stripl(char[] s)
{
    // size_t like s.length, so the index is never narrowed
    size_t i;

    for (i = 0; i < s.length; i++)
    {
        if (!std.ctype.isspace(s[i]))
            break;
    }
    return s[i .. s.length];
}
1523
/*****************************************
 * Strips trailing whitespace.
 *
 * stripr returns the slice of s that ends after the last char for which
 * std.ctype.isspace() is false; if there is no such char the slice is
 * empty.
 */

char[] stripr(char[] s)
{
    // size_t like s.length, so the starting value is never narrowed
    size_t i;

    for (i = s.length; i > 0; i--)
    {
        if (!std.ctype.isspace(s[i - 1]))
            break;
    }
    return s[0 .. i];
}
1535
/*****************************************
 * Strips both leading and trailing whitespace, by applying stripl()
 * and then stripr().
 */

char[] strip(char[] s)
{
    char[] noLead = stripl(s);
    return stripr(noLead);
}

unittest
{
    debug(string) printf("string.strip.unittest\n");

    char[] s = strip(" foo\t ");
    assert(cmp(s, "foo") == 0);
}
1551
1552 /*******************************************
1553 * Returns s[] sans trailing delimiter[], if any.
1554 * If delimiter[] is null, removes trailing CR, LF, or CRLF, if any.
1555 */
1556
char[] chomp(char[] s, char[] delimiter = null)
{
    if (delimiter !is null)
    {
        // Explicit delimiter: drop it only when s[] actually ends with it.
        if (s.length >= delimiter.length)
        {
            size_t cut = s.length - delimiter.length;

            if (s[cut .. s.length] == delimiter)
                return s[0 .. cut];
        }
        return s;
    }

    // No delimiter given: remove one trailing CR, LF, or CR-LF.
    size_t end = s.length;

    if (end > 0)
    {
        switch (s[end - 1])
        {
            case '\r':                  // ends in CR
                end--;
                break;

            case '\n':                  // ends in LF, possibly preceded by CR
                end--;
                if (end > 0 && s[end - 1] == '\r')
                    end--;
                break;

            default:
                break;
        }
    }
    return s[0 .. end];
}
1583
1584 unittest
1585 {
1586 debug(string) printf("string.chomp.unittest\n");
1587 char[] s;
1588
1589 s = chomp(null);
1590 assert(s is null);
1591 s = chomp("hello");
1592 assert(s == "hello");
1593 s = chomp("hello\n");
1594 assert(s == "hello");
1595 s = chomp("hello\r");
1596 assert(s == "hello");
1597 s = chomp("hello\r\n");
1598 assert(s == "hello");
1599 s = chomp("hello\n\r");
1600 assert(s == "hello\n");
1601 s = chomp("hello\n\n");
1602 assert(s == "hello\n");
1603 s = chomp("hello\r\r");
1604 assert(s == "hello\r");
1605 s = chomp("hello\nxxx\n");
1606 assert(s == "hello\nxxx");
1607
1608 s = chomp(null, null);
1609 assert(s is null);
1610 s = chomp("hello", "o");
1611 assert(s == "hell");
1612 s = chomp("hello", "p");
1613 assert(s == "hello");
1614 s = chomp("hello", null);
1615 assert(s == "hello");
1616 s = chomp("hello", "llo");
1617 assert(s == "he");
1618 }
1619
1620
1621 /***********************************************
1622 * Returns s[] sans trailing character, if there is one.
1623 * If last two characters are CR-LF, then both are removed.
1624 */
1625
char[] chop(char[] s)
{
    size_t n = s.length;

    if (n == 0)
        return s;

    // A trailing CR-LF pair is removed as a unit.
    if (n >= 2 && s[n - 1] == '\n' && s[n - 2] == '\r')
        return s[0 .. n - 2];

    // Walk back over UTF-8 continuation bytes (bit pattern 10xxxxxx) so the
    // cut falls at the start of the last sequence. Running off the front
    // means there was no lead byte at all.
    size_t last = n - 1;
    while ((s[last] & 0xC0) == 0x80)
    {
        if (last == 0)
            throw new std.utf.UtfException("invalid UTF sequence", 0);
        last--;
    }

    return s[0 .. last];
}
1646
1647
1648 unittest
1649 {
1650 debug(string) printf("string.chop.unittest\n");
1651 char[] s;
1652
1653 s = chop(null);
1654 assert(s is null);
1655 s = chop("hello");
1656 assert(s == "hell");
1657 s = chop("hello\r\n");
1658 assert(s == "hello");
1659 s = chop("hello\n\r");
1660 assert(s == "hello\n");
1661 }
1662
1663
1664 /*******************************************
1665 * Left justify, right justify, or center string s[]
1666 * in field width chars wide.
1667 */
1668
char[] ljustify(char[] s, int width)
{
    // Returns s[] itself when it already fills the field; otherwise a new
    // array of width chars with s[] at the left and spaces after it.
    //
    // A non-positive width never needs padding. Without this guard a
    // negative width is converted to a very large unsigned value when it is
    // compared with s.length, and then reaches new char[].
    if (width <= 0)
        return s;

    size_t w = width;
    if (s.length >= w)
        return s;

    char[] r = new char[w];
    r[0 .. s.length] = s;
    r[s.length .. w] = ' ';
    return r;
}
1678
1679 /// ditto
char[] rjustify(char[] s, int width)
{
    // Returns s[] itself when it already fills the field; otherwise a new
    // array of width chars with spaces first and s[] at the right.
    //
    // A non-positive width never needs padding. Without this guard a
    // negative width is converted to a very large unsigned value when it is
    // compared with s.length, and then reaches new char[].
    if (width <= 0)
        return s;

    size_t w = width;
    if (s.length >= w)
        return s;

    size_t pad = w - s.length;
    char[] r = new char[w];
    r[0 .. pad] = ' ';
    r[pad .. w] = s;
    return r;
}
1689
1690 /// ditto
char[] center(char[] s, int width)
{
    // Returns s[] itself when it already fills the field; otherwise a new
    // array of width chars with s[] in the middle. When the padding is odd
    // the extra space goes on the right (left = padding / 2).
    //
    // A non-positive width never needs padding. Without this guard a
    // negative width is converted to a very large unsigned value when it is
    // compared with s.length, and then reaches new char[].
    if (width <= 0)
        return s;

    size_t w = width;
    if (s.length >= w)
        return s;

    size_t left = (w - s.length) / 2;
    char[] r = new char[w];
    r[0 .. left] = ' ';
    r[left .. left + s.length] = s;
    r[left + s.length .. w] = ' ';
    return r;
}
1702
unittest
{
    debug(string) printf("string.justify.unittest\n");

    char[] s = "hello";
    char[] r;

    // "hello" is 5 chars, so a field of 8 needs exactly 3 fill characters.
    // The expected literals spell that padding out in full.
    r = ljustify(s, 8);
    assert(cmp(r, "hello   ") == 0);

    r = rjustify(s, 8);
    assert(cmp(r, "   hello") == 0);

    // center() puts (8 - 5) / 2 = 1 space on the left, the other 2 on the right.
    r = center(s, 8);
    assert(cmp(r, " hello  ") == 0);

    r = zfill(s, 8);
    assert(cmp(r, "000hello") == 0);
}
1727
1728
1729 /*****************************************
1730 * Same as rjustify(), but fill with '0's.
1731 */
1732
char[] zfill(char[] s, int width)
{
    // Returns s[] itself when it already fills the field; otherwise a new
    // array of width chars with '0' characters first and s[] at the right.
    //
    // A non-positive width never needs padding. Without this guard a
    // negative width is converted to a very large unsigned value when it is
    // compared with s.length, and then reaches new char[].
    if (width <= 0)
        return s;

    size_t w = width;
    if (s.length >= w)
        return s;

    size_t pad = w - s.length;
    char[] r = new char[w];
    r[0 .. pad] = '0';
    r[pad .. w] = s;
    return r;
}
1742
1743 /********************************************
1744 * Replace occurrences of from[] with to[] in s[].
1745 */
1746
char[] replace(char[] s, char[] from, char[] to)
{
    // Returns s[] with every non-overlapping occurrence of from[] replaced
    // by to[], scanning left to right.
    //
    // Copy-on-write: when from[] is empty or does not occur in s[] at all,
    // s[] itself is returned and nothing is allocated. A new array is built
    // only once the first occurrence has been found.
    if (from.length == 0)
        return s;

    char[] p;
    size_t istart = 0;

    while (istart < s.length)
    {
        int i = find(s[istart .. s.length], from);
        if (i == -1)
        {
            // Still at the very start: no replacement was ever made.
            if (istart == 0)
                return s;
            p ~= s[istart .. s.length];
            break;
        }
        p ~= s[istart .. istart + i];
        p ~= to;
        istart += i + from.length;
    }
    return p;
}
1771
1772 unittest
1773 {
1774 debug(string) printf("string.replace.unittest\n");
1775
1776 char[] s = "This is a foo foo list";
1777 char[] from = "foo";
1778 char[] to = "silly";
1779 char[] r;
1780 int i;
1781
1782 r = replace(s, from, to);
1783 i = cmp(r, "This is a silly silly list");
1784 assert(i == 0);
1785
1786 r = replace(s, "", to);
1787 i = cmp(r, "This is a foo foo list");
1788 assert(i == 0);
1789 }
1790
1791 /*****************************
1792 * Return a _string that is string[] with slice[] replaced by replacement[].
1793 */
1794
char[] replaceSlice(char[] string, char[] slice, char[] replacement)
in
{
    // Verify that slice[] really is a slice of string[].
    // The offset is a pointer difference, so it is held in ptrdiff_t rather
    // than int to keep its full width on every target.
    ptrdiff_t so = slice.ptr - string.ptr;
    assert(so >= 0);
    //printf("string.length = %d, so = %d, slice.length = %d\n", string.length, so, slice.length);
    assert(string.length >= cast(size_t)so + slice.length);
}
body
{
    // Offset of slice[] within string[]; the in-contract checks it is >= 0.
    size_t so = cast(size_t)(slice.ptr - string.ptr);

    char[] result;
    result.length = string.length - slice.length + replacement.length;

    // prefix | replacement | suffix
    result[0 .. so] = string[0 .. so];
    result[so .. so + replacement.length] = replacement;
    result[so + replacement.length .. result.length] = string[so + slice.length .. string.length];

    return result;
}
1817
1818 unittest
1819 {
1820 debug(string) printf("string.replaceSlice.unittest\n");
1821
1822 char[] string = "hello";
1823 char[] slice = string[2 .. 4];
1824
1825 char[] r = replaceSlice(string, slice, "bar");
1826 int i;
1827 i = cmp(r, "hebaro");
1828 assert(i == 0);
1829 }
1830
1831 /**********************************************
1832 * Insert sub[] into s[] at location index.
1833 */
1834
char[] insert(char[] s, size_t index, char[] sub)
in
{
    // index is unsigned, so only the upper bound needs checking.
    assert(index <= s.length);
}
body
{
    // Nothing to insert: hand back s[] unchanged.
    if (sub.length == 0)
        return s;

    // Inserting into an empty string yields sub[] itself.
    if (s.length == 0)
        return sub;

    // size_t, so the sum of two lengths is not narrowed.
    size_t newlength = s.length + sub.length;
    char[] result = new char[newlength];

    // head of s | sub | tail of s
    result[0 .. index] = s[0 .. index];
    result[index .. index + sub.length] = sub;
    result[index + sub.length .. newlength] = s[index .. s.length];
    return result;
}
1856
1857 unittest
1858 {
1859 debug(string) printf("string.insert.unittest\n");
1860
1861 char[] r;
1862 int i;
1863
1864 r = insert("abcd", 0, "e");
1865 i = cmp(r, "eabcd");
1866 assert(i == 0);
1867
1868 r = insert("abcd", 4, "e");
1869 i = cmp(r, "abcde");
1870 assert(i == 0);
1871
1872 r = insert("abcd", 2, "ef");
1873 i = cmp(r, "abefcd");
1874 assert(i == 0);
1875
1876 r = insert(null, 0, "e");
1877 i = cmp(r, "e");
1878 assert(i == 0);
1879
1880 r = insert("abcd", 0, null);
1881 i = cmp(r, "abcd");
1882 assert(i == 0);
1883 }
1884
1885 /***********************************************
1886 * Count up all instances of sub[] in s[].
1887 */
1888
size_t count(char[] s, char[] sub)
{
    // Counts non-overlapping occurrences of sub[] in s[], scanning left to
    // right and resuming just past each match.
    //
    // An empty sub[] is reported as zero occurrences. The scan advances by
    // j + sub.length per match, so if find() reports an empty pattern as
    // found at offset 0 the position would never move and the loop would
    // not terminate.
    if (sub.length == 0)
        return 0;

    size_t n = 0;
    size_t i = 0;

    while (i < s.length)
    {
        int j = find(s[i .. s.length], sub);
        if (j == -1)
            break;
        n++;
        i += j + sub.length;
    }
    return n;
}
1904
1905 unittest
1906 {
1907 debug(string) printf("string.count.unittest\n");
1908
1909 char[] s = "This is a fofofof list";
1910 char[] sub = "fof";
1911 int i;
1912
1913 i = count(s, sub);
1914 assert(i == 2);
1915 }
1916
1917
1918 /************************************************
1919 * Replace tabs with the appropriate number of spaces.
1920 * tabsize is the distance between tab stops.
1921 */
1922
char[] expandtabs(char[] string, int tabsize = 8)
{
    bool copied = false;        // true once result[] no longer aliases string[]
    char[] result = string;
    int column;                 // current output column (starts at 0)
    int nspaces;                // width of the tab being expanded

    foreach (size_t i, dchar c; string)
    {
        if (c == '\t')
        {
            // Distance to the next tab stop.
            nspaces = tabsize - (column % tabsize);
            if (copied)
            {
                int j = result.length;
                result.length = j + nspaces;
                result[j .. j + nspaces] = ' ';
            }
            else
            {
                // First tab: start a private copy holding everything before
                // the tab plus its expansion. The length is first set to the
                // input size adjusted for this tab and then cut back to what
                // has been produced so far - presumably so later appends can
                // reuse the allocation.
                copied = true;
                result = null;
                result.length = string.length + nspaces - 1;
                result.length = i + nspaces;
                result[0 .. i] = string[0 .. i];
                result[i .. i + nspaces] = ' ';
            }
            column += nspaces;
            continue;
        }

        // Line terminators restart the column count; anything else is one
        // column wide.
        if (c == '\r' || c == '\n' || c == PS || c == LS)
            column = 0;
        else
            column++;

        // Until the first tab is seen result[] is still string[] itself and
        // nothing needs copying.
        if (copied)
        {
            if (c <= 0x7F)
                result ~= cast(char)c;
            else
                std.utf.encode(result, c);
        }
    }

    return result;
}
1976
unittest
{
    debug(string) printf("string.expandtabs.unittest\n");

    char[] s = "This \tis\t a fofof\tof list";
    char[] r;

    // Expected values are written out with every expanded space, computed
    // from the inputs exactly as they appear here with tab stops every 8
    // columns:
    //   "This " ends at column 5  -> tab adds 3 spaces (to column 8)
    //   "is"    ends at column 10 -> tab adds 6 spaces (to column 16)
    //   " a fofof" ends at column 24 -> tab adds a full 8 spaces
    r = expandtabs(s, 8);
    assert(cmp(r, "This    is       a fofof        of list") == 0);

    r = expandtabs(null);
    assert(r == null);
    r = expandtabs("");
    assert(r.length == 0);
    r = expandtabs("a");
    assert(r == "a");
    r = expandtabs("\t");
    assert(r == "        ");            // one full tab stop: 8 spaces
    r = expandtabs( " ab\tasdf ");
    //writefln("r = '%s'", r);
    assert(r == " ab     asdf ");       // " ab" ends at column 3 -> 5 spaces
    // TODO: need UTF test case
}
2002
2003
2004 /*******************************************
2005 * Replace spaces in string with the optimal number of tabs.
2006 * Trailing spaces or tabs in a line are removed.
2007 * Params:
2008 * string = String to convert.
2009 * tabsize = Tab columns are tabsize spaces apart. tabsize defaults to 8.
2010 */
2011
char[] entab(char[] string, int tabsize = 8)
{
    bool changes = false;       // true once result[] is a private copy
    char[] result = string;

    int nspaces = 0;            // length of the run of spaces ending here
    int nwhite = 0;             // spaces/tabs currently at the end of result[]
    int column = 0;             // column number

    foreach (size_t i, dchar c; string)
    {

        // Switch result[] from aliasing string[] to a private copy of the
        // first i bytes.
        void change()
        {
            changes = true;
            result = null;
            result.length = string.length;
            result.length = i;
            result[0 .. i] = string[0 .. i];
        }

        switch (c)
        {
            case '\t':
                nwhite++;
                if (nspaces)
                {
                    // Spaces directly before a tab: replace them with as
                    // many tabs as the stops they crossed.
                    if (!changes)
                        change();

                    int j = result.length - nspaces;
                    int ntabs = (((column - nspaces) % tabsize) + nspaces) / tabsize;
                    result.length = j + ntabs;
                    result[j .. j + ntabs] = '\t';
                    nwhite += ntabs - nspaces;
                    nspaces = 0;
                }
                column = (column + tabsize) / tabsize * tabsize;
                break;

            case '\r':
            case '\n':
            case PS:
            case LS:
                // Truncate any trailing spaces or tabs
                if (nwhite)
                {
                    if (!changes)
                        change();
                    result = result[0 .. result.length - nwhite];
                }
                // A new line starts here. The whitespace counters must be
                // cleared, otherwise the next line break - or the final
                // truncation after the loop - would cut nwhite bytes again
                // and remove this line terminator itself. The column
                // restarts so tab stops on the next line are measured from
                // its own beginning.
                nwhite = 0;
                nspaces = 0;
                column = 0;
                break;

            default:
                if (nspaces >= 2 && (column % tabsize) == 0)
                {
                    // A run of two or more spaces ends exactly on a tab
                    // stop: convert it to tabs.
                    if (!changes)
                        change();

                    int j = result.length - nspaces;
                    int ntabs = (nspaces + tabsize - 1) / tabsize;
                    result.length = j + ntabs;
                    result[j .. j + ntabs] = '\t';
                    nwhite += ntabs - nspaces;
                    nspaces = 0;
                }
                if (c == ' ')
                {   nwhite++;
                    nspaces++;
                }
                else
                {   nwhite = 0;
                    nspaces = 0;
                }
                column++;
                break;
        }
        // Once result[] is a private copy every input character is appended
        // to it explicitly.
        if (changes)
        {
            if (c <= 0x7F)
                result ~= cast(char)c;
            else
                std.utf.encode(result, c);
        }
    }

    // Truncate any trailing spaces or tabs
    if (nwhite)
        result = result[0 .. result.length - nwhite];

    return result;
}
2104
2105 unittest
2106 {
2107 debug(string) printf("string.entab.unittest\n");
2108
2109 char[] r;
2110
2111 r = entab(null);
2112 assert(r == null);
2113 r = entab("");
2114 assert(r.length == 0);
2115 r = entab("a");
2116 assert(r == "a");
2117 r = entab(" ");
2118 assert(r == "");
2119 r = entab(" x");
2120 assert(r == "\tx");
2121 r = entab(" ab asdf ");
2122 assert(r == " ab\tasdf");
2123 r = entab(" ab asdf ");
2124 assert(r == " ab\t asdf");
2125 r = entab(" ab \t asdf ");
2126 assert(r == " ab\t asdf");
2127 r = entab("1234567 \ta");
2128 assert(r == "1234567\t\ta");
2129 r = entab("1234567 \ta");
2130 assert(r == "1234567\t\ta");
2131 r = entab("1234567 \ta");
2132 assert(r == "1234567\t\ta");
2133 r = entab("1234567 \ta");
2134 assert(r == "1234567\t\ta");
2135 r = entab("1234567 \ta");
2136 assert(r == "1234567\t\ta");
2137 r = entab("1234567 \ta");
2138 assert(r == "1234567\t\ta");
2139 r = entab("1234567 \ta");
2140 assert(r == "1234567\t\ta");
2141 r = entab("1234567 \ta");
2142 assert(r == "1234567\t\ta");
2143 r = entab("1234567 \ta");
2144 assert(r == "1234567\t\t\ta");
2145 // TODO: need UTF test case
2146 }
2147
2148
2149
2150 /************************************
2151 * Construct translation table for translate().
2152 * BUG: only works with ASCII
2153 */
2154
char[] maketrans(char[] from, char[] to)
in
{
    // Both lists pair up one-to-one and hold ASCII only.
    assert(from.length == to.length);
    assert(from.length <= 128);
    foreach (char c; from)
    {
        assert(c <= 0x7F);
    }
    foreach (char c; to)
    {
        assert(c <= 0x7F);
    }
}
body
{
    char[] table = new char[256];

    // Start from the identity mapping...
    foreach (size_t code, char unused; table)
        table[code] = cast(char)code;

    // ...then redirect each from[] character to its partner in to[].
    foreach (size_t k, char src; from)
        table[src] = to[k];

    return table;
}
2182
2183 /******************************************
2184 * Translate characters in s[] using table created by maketrans().
2185 * Delete chars in delchars[].
2186 * BUG: only works with ASCII
2187 */
2188
char[] translate(char[] s, char[] transtab, char[] delchars)
in
{
    assert(transtab.length == 256);
}
body
{
    // Lookup table: drop[c] is true for every char listed in delchars[].
    bool[256] drop;
    drop[] = false;
    foreach (char c; delchars)
        drop[c] = true;

    // First pass: size the result.
    int kept = 0;
    foreach (char c; s)
    {
        if (!drop[c])
            kept++;
    }

    // Second pass: translate the surviving chars into place.
    char[] r = new char[kept];
    int at = 0;
    foreach (char c; s)
    {
        if (drop[c])
            continue;
        r[at] = transtab[c];
        at++;
    }

    return r;
}
2227
2228 unittest
2229 {
2230 debug(string) printf("string.translate.unittest\n");
2231
2232 char[] from = "abcdef";
2233 char[] to = "ABCDEF";
2234 char[] s = "The quick dog fox";
2235 char[] t;
2236 char[] r;
2237 int i;
2238
2239 t = maketrans(from, to);
2240 r = translate(s, t, "kg");
2241 //printf("r = '%.*s'\n", r);
2242 i = cmp(r, "ThE quiC Do Fox");
2243 assert(i == 0);
2244 }
2245
2246 /***********************************************
2247 * Convert to char[].
2248 */
2249
char[] toString(bool b)
{
    if (b)
        return "true";
    return "false";
}
2254
2255 /// ditto
char[] toString(char c)
{
    // Two bytes are allocated so that a 0 follows the character in memory;
    // only the first byte is part of the returned slice.
    char[] cell = new char[2];
    cell[1] = 0;
    cell[0] = c;
    return cell[0 .. 1];
}
2263
2264 unittest
2265 {
2266 debug(string) printf("string.toString(char).unittest\n");
2267
2268 char[] s = "foo";
2269 char[] s2;
2270 foreach (char c; s)
2271 {
2272 s2 ~= std.string.toString(c);
2273 }
2274 //printf("%.*s", s2);
2275 assert(s2 == "foo");
2276 }
2277
/// ditto
char[] toString(ubyte ub)
{
    // Widen and let the uint overload do the formatting.
    uint wide = cast(uint) ub;
    return toString(wide);
}

/// ditto
char[] toString(ushort us)
{
    // Widen and let the uint overload do the formatting.
    uint wide = cast(uint) us;
    return toString(wide);
}
2280
2281 /// ditto
char[] toString(uint u)
{
    // Avoid storage allocation for simple stuff: a single digit is
    // returned as a one-char slice of the digits table.
    if (u < 10)
        return digits[u .. u + 1];

    // Fill the scratch buffer from the right, least significant digit
    // first, then copy the used tail into a fresh array.
    char[uint.sizeof * 3] buffer = void;
    size_t pos = buffer.length;

    do
    {
        uint c = (u % 10) + '0';
        u /= 10;
        pos--;
        buffer[pos] = cast(char)c;
    } while (u);

    char[] result = new char[buffer.length - pos];
    result[] = buffer[pos .. buffer.length];
    return result;
}
2305
2306 unittest
2307 {
2308 debug(string) printf("string.toString(uint).unittest\n");
2309
2310 char[] r;
2311 int i;
2312
2313 r = toString(0u);
2314 i = cmp(r, "0");
2315 assert(i == 0);
2316
2317 r = toString(9u);
2318 i = cmp(r, "9");
2319 assert(i == 0);
2320
2321 r = toString(123u);
2322 i = cmp(r, "123");
2323 assert(i == 0);
2324 }
2325
2326 /// ditto
char[] toString(ulong u)
{
    // Anything that fits in 32 bits is handled by the uint overload.
    if (u < 0x1_0000_0000)
        return toString(cast(uint)u);

    // Fill the scratch buffer from the right, least significant digit
    // first, then copy the used tail into a fresh array.
    char[ulong.sizeof * 3] buffer;
    size_t pos = buffer.length;

    while (u)
    {
        char c = cast(char)((u % 10) + '0');
        u /= 10;
        pos--;
        buffer[pos] = c;
    }

    char[] result = new char[buffer.length - pos];
    result[] = buffer[pos .. buffer.length];
    return result;
}
2346
2347 unittest
2348 {
2349 debug(string) printf("string.toString(ulong).unittest\n");
2350
2351 char[] r;
2352 int i;
2353
2354 r = toString(0uL);
2355 i = cmp(r, "0");
2356 assert(i == 0);
2357
2358 r = toString(9uL);
2359 i = cmp(r, "9");
2360 assert(i == 0);
2361
2362 r = toString(123uL);
2363 i = cmp(r, "123");
2364 assert(i == 0);
2365 }
2366
/// ditto
char[] toString(byte b)
{
    // Widen and let the int overload do the formatting.
    int wide = cast(int) b;
    return toString(wide);
}

/// ditto
char[] toString(short s)
{
    // Widen and let the int overload do the formatting.
    int wide = cast(int) s;
    return toString(wide);
}
2369
2370 /// ditto
char[] toString(int i)
{
    // Non-negative values are handled by the uint overload.
    if (i >= 0)
        return toString(cast(uint)i);

    // Negative: emit the digits of the magnitude right to left, then put
    // the sign in front of them.
    char[1 + int.sizeof * 3] buffer;
    size_t pos = buffer.length;
    uint u = -i;

    while (u)
    {
        char c = cast(char)((u % 10) + '0');
        u /= 10;
        pos--;
        buffer[pos] = c;
    }
    pos--;
    buffer[pos] = '-';

    char[] result = new char[buffer.length - pos];
    result[] = buffer[pos .. buffer.length];
    return result;
}
2392
2393 unittest
2394 {
2395 debug(string) printf("string.toString(int).unittest\n");
2396
2397 char[] r;
2398 int i;
2399
2400 r = toString(0);
2401 i = cmp(r, "0");
2402 assert(i == 0);
2403
2404 r = toString(9);
2405 i = cmp(r, "9");
2406 assert(i == 0);
2407
2408 r = toString(123);
2409 i = cmp(r, "123");
2410 assert(i == 0);
2411
2412 r = toString(-0);
2413 i = cmp(r, "0");
2414 assert(i == 0);
2415
2416 r = toString(-9);
2417 i = cmp(r, "-9");
2418 assert(i == 0);
2419
2420 r = toString(-123);
2421 i = cmp(r, "-123");
2422 assert(i == 0);
2423 }
2424
2425 /// ditto
char[] toString(long i)
{
    // Non-negative values go to the ulong overload; negative values that
    // fit in an int go to the int overload.
    if (i >= 0)
        return toString(cast(ulong)i);
    if (cast(int)i == i)
        return toString(cast(int)i);

    // Remaining negatives: emit the digits of the magnitude right to left,
    // then put the sign in front of them.
    char[1 + long.sizeof * 3] buffer;
    size_t pos = buffer.length;
    ulong u = cast(ulong)(-i);

    while (u)
    {
        char c = cast(char)((u % 10) + '0');
        u /= 10;
        pos--;
        buffer[pos] = c;
    }
    pos--;
    buffer[pos] = '-';

    char[] result = new char[buffer.length - pos];
    result[] = buffer[pos .. buffer.length];
    return result;
}
2449
2450 unittest
2451 {
2452 debug(string) printf("string.toString(long).unittest\n");
2453
2454 char[] r;
2455 int i;
2456
2457 r = toString(0L);
2458 i = cmp(r, "0");
2459 assert(i == 0);
2460
2461 r = toString(9L);
2462 i = cmp(r, "9");
2463 assert(i == 0);
2464
2465 r = toString(123L);
2466 i = cmp(r, "123");
2467 assert(i == 0);
2468
2469 r = toString(-0L);
2470 i = cmp(r, "0");
2471 assert(i == 0);
2472
2473 r = toString(-9L);
2474 i = cmp(r, "-9");
2475 assert(i == 0);
2476
2477 r = toString(-123L);
2478 i = cmp(r, "-123");
2479 assert(i == 0);
2480 }
2481
2482 /// ditto
char[] toString(float f)
{
    // Widen and let the double overload do the formatting.
    double wide = cast(double) f;
    return toString(wide);
}
2484
2485 /// ditto
char[] toString(double d)
{
    // Format with C's "%g" into a stack buffer, then return a heap copy of
    // just the characters sprintf reported writing.
    char[20] text;
    auto written = sprintf(text.ptr, "%g", d);
    char[] result = text[0 .. written].dup;
    return result;
}
2493
2494 /// ditto
char[] toString(real r)
{
    // Format with C's "%Lg" into a stack buffer, then return a heap copy
    // of just the characters sprintf reported writing.
    char[20] text;
    auto written = sprintf(text.ptr, "%Lg", r);
    char[] result = text[0 .. written].dup;
    return result;
}
2502
2503 /// ditto
char[] toString(ifloat f)
{
    // Widen and let the idouble overload do the formatting.
    idouble wide = cast(idouble) f;
    return toString(wide);
}
2505
2506 /// ditto
char[] toString(idouble d)
{
    // Same as the double overload with a literal 'i' appended by the
    // format string; the buffer is one char larger to hold it.
    char[21] text;
    auto written = sprintf(text.ptr, "%gi", d);
    char[] result = text[0 .. written].dup;
    return result;
}
2514
2515 /// ditto
char[] toString(ireal r)
{
    // Same as the real overload with a literal 'i' appended by the format
    // string; the buffer is one char larger to hold it.
    char[21] text;
    auto written = sprintf(text.ptr, "%Lgi", r);
    char[] result = text[0 .. written].dup;
    return result;
}
2523
2524 /// ditto
char[] toString(cfloat f)
{
    // Widen and let the cdouble overload do the formatting.
    cdouble wide = cast(cdouble) f;
    return toString(wide);
}
2526
2527 /// ditto
char[] toString(cdouble d)
{
    // Real part, '+', imaginary part, 'i' - formatted in one sprintf call.
    // The buffer is sized as two 20-char fields plus the two separators.
    char[20 + 1 + 20 + 1] text;
    auto written = sprintf(text.ptr, "%g+%gi", d.re, d.im);
    char[] result = text[0 .. written].dup;
    return result;
}
2535
2536 /// ditto
char[] toString(creal r)
{
    // Real part, '+', imaginary part, 'i' - formatted in one sprintf call.
    // The buffer is sized as two 20-char fields plus the two separators.
    char[20 + 1 + 20 + 1] text;
    auto written = sprintf(text.ptr, "%Lg+%Lgi", r.re, r.im);
    char[] result = text[0 .. written].dup;
    return result;
}
2544
2545
2546 /******************************************
2547 * Convert value to string in _radix radix.
2548 *
2549 * radix must be a value from 2 to 36.
2550 * value is treated as a signed value only if radix is 10.
2551 * The characters A through Z are used to represent values 10 through 35.
2552 */
char[] toString(long value, uint radix)
in
{
    assert(radix >= 2 && radix <= 36);
}
body
{
    // Every radix other than 10 formats the raw bit pattern as unsigned;
    // only radix 10 goes through the signed overload.
    if (radix != 10)
        return toString(cast(ulong)value, radix);
    return toString(value);
}
2564
2565 /// ditto
char[] toString(ulong value, uint radix)
in
{
    assert(radix >= 2 && radix <= 36);
}
body
{
    // A single digit is returned as a one-char slice of the hexdigits
    // table without allocating.
    if (value < radix && value < hexdigits.length)
    {
        size_t d = cast(size_t)value;
        return hexdigits[d .. d + 1];
    }

    // Room for the longest case (radix 2: one char per bit). Digits are
    // produced least significant first, so the buffer fills from the right.
    char[value.sizeof * 8] buffer;
    size_t pos = buffer.length;

    do
    {
        uint digit = cast(uint)(value % radix);
        value /= radix;
        pos--;
        if (digit < 10)
            buffer[pos] = cast(char)('0' + digit);
        else
            buffer[pos] = cast(char)('A' + (digit - 10));
    } while (value);

    return buffer[pos .. buffer.length].dup;
}
2589
2590 unittest
2591 {
2592 debug(string) printf("string.toString(ulong, uint).unittest\n");
2593
2594 char[] r;
2595 int i;
2596
2597 r = toString(-10L, 10u);
2598 assert(r == "-10");
2599
2600 r = toString(15L, 2u);
2601 //writefln("r = '%s'", r);
2602 assert(r == "1111");
2603
2604 r = toString(1L, 2u);
2605 //writefln("r = '%s'", r);
2606 assert(r == "1");
2607
2608 r = toString(0x1234AFL, 16u);
2609 //writefln("r = '%s'", r);
2610 assert(r == "1234AF");
2611 }
2612
2613 /*************************************************
2614 * Convert C-style 0 terminated string s to char[] string.
2615 */
2616
char[] toString(char *s)
{
    // A null pointer maps to a null array. Otherwise the result is a slice
    // over the C string's own bytes, up to but not including the 0.
    if (!s)
        return cast(char[])null;
    return s[0 .. strlen(s)];
}
2621
2622 unittest
2623 {
2624 debug(string) printf("string.toString(char*).unittest\n");
2625
2626 char[] r;
2627 int i;
2628
2629 r = toString(null);
2630 i = cmp(r, "");
2631 assert(i == 0);
2632
2633 r = toString("foo\0");
2634 i = cmp(r, "foo");
2635 assert(i == 0);
2636 }
2637
2638
2639 /*****************************************************
2640 * Format arguments into a string.
2641 */
2642
2643
char[] format(...)
{
    char[] output;

    // Sink handed to doFormat: appends each character as UTF-8.
    void sink(dchar c)
    {
        std.utf.encode(output, c);
    }

    std.format.doFormat(&sink, _arguments, _argptr);
    return output;
}
2656
2657
2658 /*****************************************************
2659 * Format arguments into string <i>s</i> which must be large
2660 * enough to hold the result. Throws ArrayBoundsError if it is not.
2661 * Returns: s
2662 */
char[] sformat(char[] s, ...)
{
    size_t used = 0;            // bytes of s[] written so far

    // Sink handed to doFormat: stores each character into s[] as UTF-8,
    // refusing to write past the end of the caller's buffer.
    void sink(dchar c)
    {
        if (c > 0x7F)
        {
            // Multi-byte: encode into a scratch buffer first so the whole
            // sequence can be bounds-checked before anything is stored.
            char[4] scratch;
            char[] bytes = std.utf.toUTF8(scratch, c);

            if (used + bytes.length > s.length)
                throw new ArrayBoundsError("std.string.sformat", 0);
            s[used .. used + bytes.length] = bytes[];
            used += bytes.length;
            return;
        }

        if (used >= s.length)
            throw new ArrayBoundsError("std.string.sformat", 0);
        s[used] = cast(char)c;
        ++used;
    }

    std.format.doFormat(&sink, _arguments, _argptr);
    return s[0 .. used];
}
2690
2691
2692 unittest
2693 {
2694 debug(string) printf("std.string.format.unittest\n");
2695
2696 char[] r;
2697 int i;
2698 /+
2699 r = format(null);
2700 i = cmp(r, "");
2701 assert(i == 0);
2702 +/
2703 r = format("foo");
2704 i = cmp(r, "foo");
2705 assert(i == 0);
2706
2707 r = format("foo%%");
2708 i = cmp(r, "foo%");
2709 assert(i == 0);
2710
2711 r = format("foo%s", 'C');
2712 i = cmp(r, "fooC");
2713 assert(i == 0);
2714
2715 r = format("%s foo", "bar");
2716 i = cmp(r, "bar foo");
2717 assert(i == 0);
2718
2719 r = format("%s foo %s", "bar", "abc");
2720 i = cmp(r, "bar foo abc");
2721 assert(i == 0);
2722
2723 r = format("foo %d", -123);
2724 i = cmp(r, "foo -123");
2725 assert(i == 0);
2726
2727 r = format("foo %d", 123);
2728 i = cmp(r, "foo 123");
2729 assert(i == 0);
2730 }
2731
2732
2733 /***********************************************
2734 * See if character c is in the pattern.
2735 * Patterns:
2736 *
2737 * A <i>pattern</i> is an array of characters much like a <i>character
2738 * class</i> in regular expressions. A sequence of characters
2739 * can be given, such as "abcde". The '-' can represent a range
2740 * of characters, as "a-e" represents the same pattern as "abcde".
2741 * "a-fA-F0-9" represents all the hex characters.
2742 * If the first character of a pattern is '^', then the pattern
2743 * is negated, i.e. "^0-9" means any character except a digit.
2744 * The functions inPattern, <b>countchars</b>, <b>removechars</b>,
2745 * and <b>squeeze</b>
2746 * use patterns.
2747 *
2748 * Note: In the future, the pattern syntax may be improved
2749 * to be more like regular expression character classes.
2750 */
2751
bool inPattern(dchar c, char[] pattern)
{
    bool negate = false;        // pattern began with '^'
    bool inRange = false;       // previous character was a range '-'
    dchar lo;                   // most recent literal; low end of a range

    foreach (size_t i, dchar p; pattern)
    {
        if (i == 0 && p == '^')
        {
            negate = true;
            // A pattern that is only "^" is treated as the literal '^'.
            if (pattern.length == 1)
                return (c == p);    // or should this be an error?
        }
        else if (inRange)
        {
            // p closes the range lo-p. The extra c == p test makes the end
            // character match even when the range is written backwards.
            inRange = false;
            if ((lo <= c && c <= p) || c == p)
                return !negate;
        }
        else if (p == '-' && i > (negate ? 1 : 0) && i + 1 < pattern.length)
        {
            // '-' is a range operator only when it has a character on both
            // sides; lo must keep the character to its left.
            inRange = true;
            continue;
        }
        else if (c == p)
        {
            return !negate;
        }
        lo = p;
    }
    return negate;
}
2782
2783
2784 unittest
2785 {
2786 debug(string) printf("std.string.inPattern.unittest\n");
2787
2788 int i;
2789
2790 i = inPattern('x', "x");
2791 assert(i == 1);
2792 i = inPattern('x', "y");
2793 assert(i == 0);
2794 i = inPattern('x', cast(char[])null);
2795 assert(i == 0);
2796 i = inPattern('x', "^y");
2797 assert(i == 1);
2798 i = inPattern('x', "yxxy");
2799 assert(i == 1);
2800 i = inPattern('x', "^yxxy");
2801 assert(i == 0);
2802 i = inPattern('x', "^abcd");
2803 assert(i == 1);
2804 i = inPattern('^', "^^");
2805 assert(i == 0);
2806 i = inPattern('^', "^");
2807 assert(i == 1);
2808 i = inPattern('^', "a^");
2809 assert(i == 1);
2810 i = inPattern('x', "a-z");
2811 assert(i == 1);
2812 i = inPattern('x', "A-Z");
2813 assert(i == 0);
2814 i = inPattern('x', "^a-z");
2815 assert(i == 0);
2816 i = inPattern('x', "^A-Z");
2817 assert(i == 1);
2818 i = inPattern('-', "a-");
2819 assert(i == 1);
2820 i = inPattern('-', "^A-");
2821 assert(i == 0);
2822 i = inPattern('a', "z-a");
2823 assert(i == 1);
2824 i = inPattern('z', "z-a");
2825 assert(i == 1);
2826 i = inPattern('x', "z-a");
2827 assert(i == 0);
2828 }
2829
2830
2831 /***********************************************
2832 * See if character c is in the intersection of the patterns.
2833 */
2834
int inPattern(dchar c, char[][] patterns)
{
    // c must match every pattern; the first miss settles it.
    foreach (char[] pattern; patterns)
    {
        if (!inPattern(c, pattern))
            return 0;
    }

    // All patterns matched - but an empty list matches nothing.
    return patterns.length != 0 ? 1 : 0;
}
2848
2849
2850 /********************************************
2851 * Count characters in s that match pattern.
2852 */
2853
size_t countchars(char[] s, char[] pattern)
{
    size_t total = 0;

    // Iterate by decoded character (dchar), not by byte.
    foreach (dchar c; s)
    {
        if (inPattern(c, pattern))
            total++;
    }
    return total;
}
2864
2865
2866 unittest
2867 {
2868 debug(string) printf("std.string.count.unittest\n");
2869
2870 size_t c;
2871
2872 c = countchars("abc", "a-c");
2873 assert(c == 3);
2874 c = countchars("hello world", "or");
2875 assert(c == 3);
2876 }
2877
2878
2879 /********************************************
2880 * Return string that is s with all characters removed that match pattern.
2881 */
2882
char[] removechars(char[] s, char[] pattern)
{
    char[] r = s;
    bool dropped = false;       // has any character been removed yet?
    size_t keep;                // length of the prefix before the first removal

    foreach (size_t i, dchar c; s)
    {
        if (inPattern(c, pattern))
        {
            // Remember where the first removed character starts.
            if (!dropped)
            {
                dropped = true;
                keep = i;
            }
            continue;
        }

        // Kept character: nothing to do while r[] is still all of s[].
        if (!dropped)
            continue;

        // Copy the untouched prefix on first need, then append.
        if (r is s)
            r = s[0 .. keep].dup;
        std.utf.encode(r, c);
    }

    // Removals happened but no kept character followed them.
    if (dropped && r is s)
        r = s[0 .. keep].dup;
    return r;
}
2909
2910
2911 unittest
2912 {
2913 debug(string) printf("std.string.remove.unittest\n");
2914
2915 char[] r;
2916
2917 r = removechars("abc", "a-c");
2918 assert(r is null);
2919 r = removechars("hello world", "or");
2920 assert(r == "hell wld");
2921 r = removechars("hello world", "d");
2922 assert(r == "hello worl");
2923 }
2924
2925
2926 /***************************************************
2927 * Return string where sequences of a character in s[] from pattern[]
2928 * are replaced with a single instance of that character.
2929 * If pattern is null, it defaults to all characters.
2930 */
2931
char[] squeeze(char[] s, char[] pattern = null)
{
    char[] r = s;
    dchar lastc;                // last character that started/continued a run
    size_t lasti;               // end of the prefix known to need no change
    int run;                    // nonzero while lastc is eligible for squeezing
    bool changed;               // true once a duplicate has been dropped

    foreach (size_t i, dchar c; s)
    {
        // Repeat of a squeezable character: drop it.
        if (run && lastc == c)
        {
            changed = true;
            continue;
        }

        bool squeezable = (pattern is null || inPattern(c, pattern));
        run = squeezable ? 1 : 0;

        if (changed)
        {
            // Copy the untouched prefix on first need, then append.
            if (r is s)
                r = s[0 .. lasti].dup;
            std.utf.encode(r, c);
        }
        else if (squeezable)
        {
            // Nothing dropped so far: the clean prefix now ends just past
            // this character.
            lasti = i + std.utf.stride(s, i);
        }

        if (squeezable)
            lastc = c;
    }

    // Duplicates were dropped only at the very end: a slice of s[] is
    // enough, no copy needed.
    if (changed)
    {
        if (r is s)
            r = s[0 .. lasti];
    }
    return r;
}
2975
2976
2977 unittest
2978 {
2979 debug(string) printf("std.string.squeeze.unittest\n");
2980 char[] s,r;
2981
2982 r = squeeze("hello");
2983 //writefln("r = '%s'", r);
2984 assert(r == "helo");
2985 s = "abcd";
2986 r = squeeze(s);
2987 assert(r is s);
2988 s = "xyzz";
2989 r = squeeze(s);
2990 assert(r.ptr == s.ptr); // should just be a slice
2991 r = squeeze("hello goodbyee", "oe");
2992 assert(r == "hello godbye");
2993 }
2994
2995
/**********************************************
 * Return the 'successor' of s[].
 *
 * If the rightmost character is a-zA-Z0-9, it is incremented within
 * its case or digits. When that wraps around ('9' to '0', 'z' to 'a',
 * 'Z' to 'A') the character to its immediate left is processed the same
 * way; if the wrap reaches the front, one character is prepended.
 *
 * Returns:
 *      s itself if it is empty or does not end in an alphanumeric
 *      character, otherwise a new string.
 */

char[] succ(char[] s)
{
    bool incrementable = s.length != 0 && isalnum(s[s.length - 1]);

    if (incrementable)
    {
        char[] result = s.dup;
        size_t pos = result.length - 1;

        for (;;)
        {
            char current = s[pos];
            char wrapped;       // what this position rolls over to
            char overflow;      // what gets prepended if the carry leaves the string

            if (current == '9')
            {
                wrapped = '0';
                overflow = '1';
            }
            else if (current == 'z' || current == 'Z')
            {
                wrapped = cast(char)(current - ('Z' - 'A'));
                overflow = wrapped;
            }
            else
            {
                // No wrap-around here: bump the character if it is
                // alphanumeric and stop.
                if (std.ctype.isalnum(current))
                    result[pos]++;
                return result;
            }

            result[pos] = wrapped;

            if (pos == 0)
            {
                char[] grown = new char[result.length + 1];
                grown[0] = overflow;
                grown[1 .. grown.length] = result[];
                return grown;
            }
            pos--;
        }
    }
    return s;
}
3045
unittest
{
    debug(string) printf("std.string.succ.unittest\n");

    // Inputs that do not end in an alphanumeric character come back unchanged.
    assert(succ(null) is null);
    assert(succ("!@#$%") == "!@#$%");

    // Plain increment.
    assert(succ("1") == "2");

    // Carries, including one that lengthens the string.
    assert(succ("9") == "10");
    assert(succ("999") == "1000");
    assert(succ("zz99") == "aaa00");
}
3065
3066
/***********************************************
 * Replaces characters in str[] that are in from[]
 * with corresponding characters in to[] and returns the resulting
 * string.
 * Params:
 *      str = string to translate
 *      from = characters to search for
 *      to = replacement characters, matched to from[] by position
 *      modifiers = a string of modifier characters
 * Modifiers:
                <table border=1 cellspacing=0 cellpadding=5>
                <tr> <th>Modifier <th>Description
                <tr> <td><b>c</b> <td>Complement the list of characters in from[]
                <tr> <td><b>d</b> <td>Removes matching characters with no corresponding replacement in to[]
                <tr> <td><b>s</b> <td>Removes adjacent duplicates in the replaced characters
                </table>

        If modifier <b>d</b> is present, then the number of characters
        in to[] may be only 0 or 1.

        If modifier <b>d</b> is not present and to[] is null or empty,
        then to[] is taken _to be the same as from[].

        If modifier <b>d</b> is not present and to[] is shorter
        than from[], then to[] is extended by replicating the
        last character in to[].

        Both from[] and to[] may contain ranges using the <b>-</b>
        character, for example <b>a-d</b> is synonymous with <b>abcd</b>.
        Neither accept a leading <b>^</b> as meaning the complement of
        the string (use the <b>c</b> modifier for that).
 */

char[] tr(char[] str, char[] from, char[] to, char[] modifiers = null)
{
    int mod_c;
    int mod_d;
    int mod_s;

    foreach (char c; modifiers)
    {
        switch (c)
        {
            case 'c':   mod_c = 1; break;   // complement
            case 'd':   mod_d = 1; break;   // delete unreplaced chars
            case 's':   mod_s = 1; break;   // squeeze duplicated replaced chars
            default:    assert(0);
        }
    }

    // Without 'd', an absent replacement list means "map onto from[]".
    // Test the length rather than 'is null' so that "" is handled the same
    // way; otherwise the lookup below finds no replacement and hands
    // dchar.init to std.utf.encode.
    if (to.length == 0 && !mod_d)
        to = from;

    // Reserve room for the common case, then start with an empty result.
    char[] result = new char[str.length];
    result.length = 0;
    int m;          // non-zero when the previous output char was a replacement
    dchar lastc;    // previous output char, for the 's' modifier

    foreach (dchar c; str)
    {   dchar lastf;
        dchar lastt;
        dchar newc;
        int n = 0;  // position of c within the expanded from[]

        // Locate c in from[], expanding ranges on the fly.
        for (size_t i = 0; i < from.length; )
        {
            dchar f = std.utf.decode(from, i);
            if (f == '-' && lastf != dchar.init && i < from.length)
            {
                // lastf-nextf is a range
                dchar nextf = std.utf.decode(from, i);
                if (lastf <= c && c <= nextf)
                {
                    n += c - lastf - 1;
                    if (mod_c)
                        goto Lnotfound;
                    goto Lfound;
                }
                n += nextf - lastf;
                lastf = lastf.init;
                continue;
            }

            if (c == f)
            {   if (mod_c)
                    goto Lnotfound;
                goto Lfound;
            }
            lastf = f;
            n++;
        }
        if (!mod_c)
            goto Lnotfound;
        n = 0;          // consider it 'found' at position 0

      Lfound:

        // Find the nth character in to[]
        dchar nextt;
        for (size_t i = 0; i < to.length; )
        {   dchar t = std.utf.decode(to, i);
            if (t == '-' && lastt != dchar.init && i < to.length)
            {
                // lastt-nextt is a range
                nextt = std.utf.decode(to, i);
                n -= nextt - lastt;
                if (n < 0)
                {
                    newc = nextt + n + 1;
                    goto Lnewc;
                }
                lastt = dchar.init;
                continue;
            }
            if (n == 0)
            {   newc = t;
                goto Lnewc;
            }
            lastt = t;
            nextt = t;
            n--;
        }
        // to[] ran out before position n was reached
        if (mod_d)
            continue;       // 'd': drop the character
        newc = nextt;       // otherwise reuse the last character of to[]

      Lnewc:
        if (mod_s && m && newc == lastc)
            continue;       // 's': skip a repeated replacement
        std.utf.encode(result, newc);
        m = 1;
        lastc = newc;
        continue;

      Lnotfound:
        // c is not subject to translation: copy it through
        std.utf.encode(result, c);
        lastc = c;
        m = 0;
    }
    return result;
}
3207
unittest
{
    debug(string) printf("std.string.tr.unittest\n");

    // Plain one-to-one replacement.
    assert(tr("abcdef", "cd", "CD") == "abCDef");

    // Ranges in from[] and to[], written in several equivalent ways.
    assert(tr("abcdef", "b-d", "B-D") == "aBCDef");
    assert(tr("abcdefgh", "b-dh", "B-Dx") == "aBCDefgx");
    assert(tr("abcdefgh", "b-dh", "B-CDx") == "aBCDefgx");
    assert(tr("abcdefgh", "b-dh", "B-BCDx") == "aBCDefgx");

    // 'c': complement of from[].
    assert(tr("abcdef", "ef", "*", "c") == "****ef");

    // 'd': delete matches that have no replacement.
    assert(tr("abcdef", "ef", "", "d") == "abcd");

    // 's': squeeze repeated replacements.
    assert(tr("hello goodbye", "lo", null, "s") == "helo godbye");
    assert(tr("hello goodbye", "lo", "x", "s") == "hex gxdbye");

    // Modifiers combined.
    assert(tr("14-Jul-87", "a-zA-Z", " ", "cs") == " Jul ");

    // Duplicate entries in from[]: the first one wins.
    assert(tr("Abc", "AAA", "XYZ") == "Xbc");
}
3248
3249
/* ************************************************
 * Version       : v0.3
 * Author        : David L. 'SpottedTiger' Davis
 * Date Created  : 31.May.05 Compiled and Tested with dmd v0.125
 * Date Modified : 01.Jun.05 Modified the function to handle the
 *               :           imaginary and complex float-point
 *               :           datatypes.
 *               :
 * Licence       : Public Domain / Contributed to Digital Mars
 */

/**
 * Checks whether s[] is formatted like a numeric literal.
 *
 * The comparison is case-insensitive. Accepted formats:
 *
 * Integer Whole Number:
 * (for byte, ubyte, short, ushort, int, uint, long, and ulong)
 * ['+'|'-']digit(s)[U|L|UL]
 *
 * examples: 123, 123UL, 123L, +123U, -123L
 *
 * Floating-Point Number:
 * (for float, double, real, ifloat, idouble, and ireal)
 * ['+'|'-']digit(s)[.][digit(s)][[e-|e+]digit(s)][i|f|L|Li|fi]]
 *      or [nan|nani|inf|-inf]
 *
 * examples: +123., -123.01, 123.3e-10f, 123.3e-10fi, 123.3e-10L
 *
 * (for cfloat, cdouble, and creal)
 * ['+'|'-']digit(s)[.][digit(s)][[e-|e+]digit(s)][+]
 *         [digit(s)[.][digit(s)][[e-|e+]digit(s)][i|f|L|Li|fi]]
 *      or [nan|nani|nan+nani|inf|-inf]
 *
 * examples: nan, -123e-1+456.9e-10Li, +123e+10+456i, 123+456
 *
 * Apart from the nan/inf spellings, a string must contain at least one
 * digit, so a lone sign, decimal point or suffix is not numeric.
 *
 * Params:
 *      s = string to examine. No spaces are allowed anywhere in it,
 *          whether leading, trailing or embedded; strip them first.
 *      bAllowSep = false by default. When true, the separator characters
 *          "," and "_" are accepted anywhere except as the last
 *          character. They should be stripped from the string before
 *          using any of the conversion functions like toInt(),
 *          toFloat(), etc., else an error will occur.
 * Returns:
 *      true if s[] matches one of the formats above.
 */

final bool isNumeric(in char[] s, in bool bAllowSep = false)
{
    size_t  iLen = s.length;
    bool    bDecimalPoint = false;
    bool    bExponent = false;
    bool    bComplex = false;
    bool    bDigit = false;         // at least one digit seen so far
    char[]  sx = std.string.tolower(s);
    size_t  j  = 0;
    char    c;

    // Empty string, return false
    if (iLen == 0)
        return false;

    // Check for NaN (Not a Number)
    if (sx == "nan" || sx == "nani" || sx == "nan+nani")
        return true;

    // Check for Infinity
    if (sx == "inf" || sx == "-inf")
        return true;

    // A sign is allowed only in the 1st character
    if (sx[0] == '-' || sx[0] == '+')
        j++;

    for (size_t i = j; i < iLen; i++)
    {
        c = sx[i];

        // Digits are good, continue checking with the next character
        if (c >= '0' && c <= '9')
        {
            bDigit = true;
            continue;
        }

        // A '+' inside the string starts the second number of a complex
        // value. It is only valid once a digit has been seen, which
        // rejects inputs such as "++1".
        else if (c == '+')
        {
            if (!bDigit)
                return false;

            bDecimalPoint = false;
            bExponent = false;
            bComplex = true;
            continue;
        }

        // Allow only one exponent per number
        else if (c == 'e')
        {
            // A 2nd exponent found, return not a number
            if (bExponent)
                return false;

            // Ending in "E", return not a number
            if (i + 1 >= iLen)
                return false;

            // The exponent must be followed by an explicit sign
            if (sx[i + 1] != '-' && sx[i + 1] != '+')
                return false;

            bExponent = true;
            i++;        // skip the sign
        }

        // Allow only one decimal point per number to be used
        else if (c == '.')
        {
            // A 2nd decimal point found, return not a number
            if (bDecimalPoint)
                return false;

            bDecimalPoint = true;
            continue;
        }

        // Separators must be tested before the suffix checks below,
        // otherwise one in the second-to-last position ("1_0") would be
        // mistaken for the start of a two-character suffix.
        else if (bAllowSep && (c == '_' || c == ',') && i + 1 < iLen)
            continue;

        // Two-character suffixes: "ul", "fi", "li"
        else if (i + 2 == iLen)
        {
            char[] suffix = sx[i .. iLen];

            // A suffix needs a number in front of it
            if (!bDigit)
                return false;

            // "ul" is an integer suffix only
            if (suffix == "ul")
                return !bDecimalPoint && !bExponent && !bComplex;

            // "fi" and "li" are accepted on any number
            return suffix == "fi" || suffix == "li";
        }

        // One-character suffixes: 'u', 'l', 'f', 'i'
        else if (i + 1 == iLen)
        {
            // A suffix needs a number in front of it
            if (!bDigit)
                return false;

            // Integer Whole Number
            if ((c == 'u' || c == 'l') &&
                (!bDecimalPoint && !bExponent && !bComplex))
                return true;

            // A complex value must end in the required 'i' character
            if (bComplex)
                return c == 'i';

            // 'l', 'f' and 'i' are accepted on any other number
            return c == 'l' || c == 'f' || c == 'i';
        }

        // Anything else does not belong in a number
        else
            return false;
    }

    return bDigit;
}
3438
/// Accepts arguments of any type; the check itself is done by the
/// (TypeInfo[], va_list) overload.
bool isNumeric(...)
{
    bool numeric = isNumeric(_arguments, _argptr);
    return numeric;
}
3444
/// Examines the first argument only; any further arguments are ignored.
/// Strings and characters are checked by their text, a byte or ubyte is
/// checked as the character with that code, other built-in numeric types
/// always count as numeric, and every other type does not.
bool isNumeric(TypeInfo[] _arguments, va_list _argptr)
{
    if (_arguments.length == 0)
        return false;

    TypeInfo first = _arguments[0];

    // Strings: check their contents, converting to UTF-8 where needed.
    if (first == typeid(char[]))
        return isNumeric(va_arg!(char[])(_argptr));
    if (first == typeid(wchar[]))
        return isNumeric(std.utf.toUTF8(va_arg!(wchar[])(_argptr)));
    if (first == typeid(dchar[]))
        return isNumeric(std.utf.toUTF8(va_arg!(dchar[])(_argptr)));

    // Built-in numeric types wider than a byte are numeric by definition.
    if (first == typeid(real)    || first == typeid(double)  ||
        first == typeid(float)   || first == typeid(ulong)   ||
        first == typeid(long)    || first == typeid(uint)    ||
        first == typeid(int)     || first == typeid(ushort)  ||
        first == typeid(short)   || first == typeid(ireal)   ||
        first == typeid(idouble) || first == typeid(ifloat)  ||
        first == typeid(creal)   || first == typeid(cdouble) ||
        first == typeid(cfloat))
        return true;

    // Bytes and single characters are turned into a one-character string.
    if (first == typeid(ubyte))
    {
        char[] one = new char[1];
        one[0] = va_arg!(ubyte)(_argptr);
        return isNumeric(one);
    }
    if (first == typeid(byte))
    {
        char[] one = new char[1];
        one[0] = va_arg!(byte)(_argptr);
        return isNumeric(one);
    }
    if (first == typeid(char))
    {
        char[] one = new char[1];
        one[0] = va_arg!(char)(_argptr);
        return isNumeric(one);
    }
    if (first == typeid(wchar))
    {
        wchar[] wide = new wchar[1];
        wide[0] = va_arg!(wchar)(_argptr);
        return isNumeric(std.utf.toUTF8(wide));
    }
    if (first == typeid(dchar))
    {
        dchar[] full = new dchar[1];
        full[0] = va_arg!(dchar)(_argptr);
        return isNumeric(std.utf.toUTF8(full));
    }

    // cent and ucent are not handled; any other type is not numeric.
    return false;
}
3529
unittest
{
    debug (string) printf("isNumeric(in char[], bool = false).unittest\n");

    // --- isNumeric(in char[]) ---

    // Basic integer and floating-point forms
    assert( isNumeric("1"));
    assert( isNumeric("1.0"));
    assert( isNumeric("1e-1"));
    assert(!isNumeric("12345xxxx890"));
    assert( isNumeric("567L"));
    assert( isNumeric("23UL"));
    assert(!isNumeric("-123..56f"));
    assert(!isNumeric("12.3.5.6"));
    assert(!isNumeric(" 12.356"));
    assert(!isNumeric("123 5.6"));
    assert( isNumeric("1233E-1+1.0e-1i"));

    // Complex values and their suffixes
    assert( isNumeric("123.00E-5+1234.45E-12Li"));
    assert(!isNumeric("123.00e-5+1234.45E-12iL"));
    assert(!isNumeric("123.00e-5+1234.45e-12uL"));
    assert(!isNumeric("123.00E-5+1234.45e-12lu"));

    // Suffix and sign handling
    assert( isNumeric("123fi"));
    assert( isNumeric("123li"));
    assert(!isNumeric("--123L"));
    assert(!isNumeric("+123.5UL"));
    assert( isNumeric("123f"));
    assert(!isNumeric("123.u"));

    // Text produced by toString
    assert(isNumeric(std.string.toString(real.nan)));
    assert(isNumeric(std.string.toString(-real.infinity)));
    assert(isNumeric(std.string.toString(123e+2+1234.78Li)));

    // Slices of a larger string
    char[] money = "$250.99-";
    assert( isNumeric(money[1 .. money.length - 2]));
    assert(!isNumeric(money));
    assert(!isNumeric(money[0 .. money.length - 1]));

    // --- isNumeric(...) ---
    assert( isNumeric(1, 123UL));
    assert( isNumeric('2'));
    assert(!isNumeric('x'));
    assert(!isNumeric(cast(byte)0x57));     // 'W'
    assert( isNumeric(cast(byte)0x37));     // '7'
    assert( isNumeric(cast(wchar[])"145.67"));
    assert(!isNumeric(cast(dchar[])"145.67U"));
    assert( isNumeric(123_000.23fi));
    assert( isNumeric(123.00E-5+1234.45E-12Li));
    assert( isNumeric(real.nan));
    assert( isNumeric(-real.infinity));
}
3582
3583
/*****************************
 * Soundex algorithm.
 *
 * The Soundex algorithm converts a word into 4 characters
 * based on how the word sounds phonetically. The idea is that
 * two spellings that sound alike will have the same Soundex
 * value, which means that Soundex can be used for fuzzy matching
 * of names.
 *
 * Params:
 *      string = String to convert to Soundex representation.
 *      buffer = Optional array of at least 4 chars to put the resulting
 *              Soundex characters into; only its first 4 elements are
 *              used. If null, the return value buffer will be allocated
 *              on the heap.
 * Returns:
 *      The four character array with the Soundex result in it.
 *      Returns null if there is no Soundex representation for the string.
 *
 * See_Also:
 *      $(LINK2 http://en.wikipedia.org/wiki/Soundex, Wikipedia),
 *      $(LINK2 http://www.archives.gov/publications/general-info-leaflets/55.html, The Soundex Indexing System)
 *
 * Bugs:
 *      Only works well with English names.
 *      There are other arguably better Soundex algorithms,
 *      but this one is the standard one.
 */

char[] soundex(char[] string, char[] buffer = null)
in
{
    assert(!buffer || buffer.length >= 4);
}
out (result)
{
    if (result)
    {
        assert(result.length == 4);
        assert(result[0] >= 'A' && result[0] <= 'Z');
        foreach (char c; result[1 .. 4])
            assert(c >= '0' && c <= '6');
    }
}
body
{
    // Soundex digit for each letter; '0' marks letters that are not coded.
    static char[26] dex =
    // ABCDEFGHIJKLMNOPQRSTUVWXYZ
      "01230120022455012623010202";

    // The in contract admits buffers longer than 4 chars but the result
    // must be exactly 4 long, so work on a 4-char window of the buffer.
    if (buffer.length > 4)
        buffer = buffer[0 .. 4];

    int b = 0;      // number of result characters written so far
    char lastc;     // code of the previous letter, used to drop repeats
    foreach (char c; string)
    {
        if (c >= 'a' && c <= 'z')
            c -= 'a' - 'A';         // fold to upper case
        else if (c >= 'A' && c <= 'Z')
        {
            ;
        }
        else
        {   // not a letter: skip it and forget the previous code
            lastc = lastc.init;
            continue;
        }
        if (b == 0)
        {
            // The first letter is kept as is.
            if (!buffer)
                buffer = new char[4];
            buffer[0] = c;
            b++;
            lastc = dex[c - 'A'];
        }
        else
        {
            // H and W are skipped without resetting the previous code.
            if (c == 'H' || c == 'W')
                continue;
            // A vowel separates equal codes, so they are both emitted.
            if (c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U')
                lastc = lastc.init;
            c = dex[c - 'A'];
            if (c != '0' && c != lastc)
            {
                buffer[b] = c;
                b++;
                lastc = c;
            }
        }
        if (b == 4)
            goto Lret;
    }
    if (b == 0)
        buffer = null;              // no letters at all
    else
        buffer[b .. 4] = '0';       // pad with zeros
Lret:
    return buffer;
}
3679
unittest
{
    // Inputs without any letter have no Soundex representation.
    assert(soundex(null) == null);
    assert(soundex("") == null);
    assert(soundex("0123^&^^**&^") == null);

    // names[k] is expected to encode as codes[k].
    static char[][] names =
    [
        "Euler", " Ellery ", "Gauss", "Ghosh", "Hilbert", "Heilbronn",
        "Knuth", "Lloyd", "Ladd", "Lissajous", "Robert", "Rupert",
        "Rubin", "Washington", "Lee", "Gutierrez", "Pfister", "Jackson",
        "Tymczak", "Ashcraft", "Woo", "Pilgrim", "Flingjingwaller",
        "PEARSE", "PIERCE", "Price", "CATHY", "KATHY", "Jones",
        "johnsons", "Hardin", "Martinez"
    ];
    static char[][] codes =
    [
        "E460", "E460", "G200", "G200", "H416", "H416",
        "K530", "L300", "L300", "L222", "R163", "R163",
        "R150", "W252", "L000", "G362", "P236", "J250",
        "T522", "A261", "W000", "P426", "F452",
        "P620", "P620", "P620", "C300", "K300", "J520",
        "J525", "H635", "M635"
    ];

    assert(names.length == codes.length);
    foreach (size_t k, char[] name; names)
        assert(soundex(name) == codes[k]);

    // Same algorithm when the caller supplies the buffer.
    char[4] buffer;
    assert(soundex("Kant", buffer) == "K530");
    assert(soundex("Lukasiewicz", buffer) == "L222");
}
3722
3723
/***************************************************
 * Build an associative array that maps every unambiguous
 * abbreviation (leading part) of the strings in values to the
 * string it abbreviates. Each string also maps to itself.
 *
 * This is useful in cases where the user is expected to type
 * in one of a known set of strings, and the program will helpfully
 * autocomplete the string once sufficient characters have been
 * entered that uniquely identify it.
 * Example:
 * ---
 * import std.stdio;
 * import std.string;
 *
 * void main()
 * {
 *    static char[][] list = [ "food", "foxy" ];
 *
 *    auto abbrevs = std.string.abbrev(list);
 *
 *    foreach (key, value; abbrevs)
 *    {
 *       writefln("%s => %s", key, value);
 *    }
 * }
 * ---
 * produces the output:
 * <pre>
 * fox =&gt; foxy
 * food =&gt; food
 * foxy =&gt; foxy
 * foo =&gt; food
 * </pre>
 */

char[][char[]] abbrev(char[][] values)
{
    char[][char[]] table;

    // Sort a copy so the caller's array is left alone (COW principles).
    // After sorting, a prefix can only clash with the neighbouring
    // distinct words.
    char[][] sorted = values.dup.sort;
    size_t count = sorted.length;

    bool havePrev = false;  // false only for the first distinct word
    char[] prev;            // previous distinct word
    char[] next;            // following distinct word, if any

    size_t i = 0;
    while (i < count)
    {
        char[] word = sorted[i];

        // Step over duplicates of word to find the next distinct one.
        size_t following = i + 1;
        while (following < count)
        {
            next = sorted[following];
            if (word != next)
                break;
            following++;
        }
        bool haveNext = following != count;

        // Try every proper prefix that ends on a character boundary.
        for (size_t cut = 0; cut < word.length; cut += std.utf.stride(word, cut))
        {
            char[] prefix = word[0 .. cut];

            bool clashesNext = haveNext && cut <= next.length &&
                               prefix == next[0 .. cut];
            bool clashesPrev = havePrev && cut <= prev.length &&
                               prefix == prev[0 .. cut];

            if (!clashesNext && !clashesPrev)
                table[prefix] = word;
        }

        // The full word always stands for itself.
        table[word] = word;

        havePrev = true;
        prev = word;
        i = following;
    }

    return table;
}
3796
unittest
{
    debug(string) printf("string.abbrev.unittest\n");

    // A duplicated word plus a word that is a prefix of it.
    char[][] words;
    words ~= "hello";
    words ~= "hello";
    words ~= "he";

    char[][char[]] table = abbrev(words);

    char[][] sortedKeys = table.keys.dup;
    sortedKeys.sort;

    // "h" is ambiguous and "he" is a word of its own, so only these remain.
    assert(sortedKeys.length == 4);
    assert(sortedKeys[0] == "he");
    assert(sortedKeys[1] == "hel");
    assert(sortedKeys[2] == "hell");
    assert(sortedKeys[3] == "hello");

    assert(table[sortedKeys[0]] == "he");
    assert(table[sortedKeys[1]] == "hello");
    assert(table[sortedKeys[2]] == "hello");
    assert(table[sortedKeys[3]] == "hello");
}
3823
3824
/******************************************
 * Compute the column reached after string has been output, assuming
 * it starts in the leftmost column, which is numbered 0.
 *
 * A tab advances to the next multiple of tabsize, a line terminator
 * (\r, \n, PS or LS) returns to column 0, and any other character
 * advances by one.
 */

size_t column(char[] string, int tabsize = 8)
{
    size_t col = 0;

    foreach (dchar ch; string)
    {
        if (ch == '\t')
            col = (col + tabsize) / tabsize * tabsize;
        else if (ch == '\r' || ch == '\n' || ch == PS || ch == LS)
            col = 0;
        else
            col++;
    }
    return col;
}
3856
unittest
{
    debug(string) printf("string.column.unittest\n");

    // Nothing written: still in column 0.
    assert(column(null) == 0);
    assert(column("") == 0);

    // A tab moves to the next multiple of the default tab size of 8.
    assert(column("\t") == 8);
    assert(column("abc\t") == 8);
    assert(column("12345678\t") == 16);
}
3867
/******************************************
 * Wrap text into a paragraph.
 *
 * The input text string s is formed into a paragraph
 * by breaking it up into a sequence of lines, delineated
 * by \n, such that the number of columns is not exceeded
 * on each line. Runs of whitespace in s are replaced by a
 * single space or a line break. A word that is itself longer
 * than the available width is placed on a line of its own.
 * The last line is terminated with a \n.
 * Params:
 *      s = text string to be wrapped
 *      columns = maximum number of _columns in the paragraph
 *      firstindent = string used to _indent first line of the paragraph
 *      indent = string to use to _indent following lines of the paragraph
 *      tabsize = column spacing of tabs
 * Returns:
 *      The resulting paragraph.
 */

char[] wrap(char[] s, int columns = 80, char[] firstindent = null,
        char[] indent = null, int tabsize = 8)
{
    char[] result;
    int col;
    bool inword;
    bool first = true;
    size_t wordstart;

    // Appends the word s[wordstart .. wordend], preceded by a space or a
    // line break as required. Used for every word, including the last
    // one, so that they all follow the same rule.
    void putWord(size_t wordend)
    {
        size_t len = wordend - wordstart;

        if (first)
        {
            // The first word goes directly after firstindent.
            first = false;
        }
        else if (col + 1 + len > columns)
        {
            result ~= '\n';
            result ~= indent;
            col = column(indent, tabsize);
        }
        else
        {
            result ~= ' ';
            col += 1;
        }
        result ~= s[wordstart .. wordend];
        col += len;
    }

    // Reserve room up front, then start out with just the first indent.
    result.length = firstindent.length + s.length;
    result.length = firstindent.length;
    result[] = firstindent[];
    col = column(result, tabsize);

    foreach (size_t i, dchar c; s)
    {
        if (iswhite(c))
        {
            if (inword)
            {
                putWord(i);
                inword = false;
            }
        }
        else if (!inword)
        {
            wordstart = i;
            inword = true;
        }
    }

    // A word that runs up to the end of s has not been written yet.
    if (inword)
        putWord(s.length);
    result ~= '\n';

    return result;
}
3951
unittest
{
    debug(string) printf("string.wrap.unittest\n");

    // Empty input still produces the terminating newline.
    assert(wrap(null) == "\n");

    // Everything fits in the default 80 columns.
    assert(wrap(" a b df ") == "a b df\n");
    assert(wrap("x") == "x\n");
    assert(wrap("u u") == "u u\n");

    // Narrow paragraphs force line breaks; an over-long word gets a
    // line of its own.
    assert(wrap(" a b df ", 3) == "a b\ndf\n");
    assert(wrap(" a bc df ", 3) == "a\nbc\ndf\n");
    assert(wrap(" abcd df ", 3) == "abcd\ndf\n");
}
3966
3967
/***************************
 * Does string s[] start with an email address?
 *
 * The address must begin with a letter, continue with letters, digits,
 * '-', '_' or '.' up to an '@', and be followed by a domain made of
 * letters, digits, '-', '_' and '.' whose part after the last '.' is
 * 2 or 3 characters long.
 * Returns:
 *      null    it does not (this includes an empty or null s[])
 *      char[]  it does, and this is the slice of s[] that is that email address
 * References:
 *      RFC2822
 */
char[] isEmail(char[] s)
{
    // Guard the s[0] access below: an empty string has no first character.
    if (s.length == 0 || !isalpha(s[0]))
        return null;

    // Scan the part in front of the '@'.
    size_t i;
    for (i = 1; ; i++)
    {
        if (i == s.length)
            return null;        // ran out of input before any '@'
        auto c = s[i];
        if (isalnum(c) || c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return null;
        i++;                    // step over the '@'
        break;
    }

    /* Now do the part past the '@'
     */
    size_t lastdot = 0;         // index of the last '.', 0 while none seen
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isalnum(c) || c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }

    // The domain needs a '.' followed by exactly 2 or 3 characters.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return null;

    return s[0 .. i];
}
4023
4024
/***************************
 * Does string s[] start with a URL?
 *
 * Only the prefixes http:// and https:// (compared without regard to
 * case) are recognized, and at least one more character must follow the
 * prefix; a bare "www." prefix is not accepted. The URL extends over the
 * following letters, digits and the characters - _ ? = % & / + # ~ and
 * '.', and must contain at least one '.' after the prefix.
 * Returns:
 *      null    it does not
 *      char[]  it does, and this is the slice of s[] that is that URL
 */

char[] isURL(char[] s)
{
    size_t i;

    // Skip the scheme; anything else is not a URL for our purposes.
    if (s.length > 7 && std.string.icmp(s[0 .. 7], "http://") == 0)
        i = 7;
    else if (s.length > 8 && std.string.icmp(s[0 .. 8], "https://") == 0)
        i = 8;
    else
        return null;

    size_t lastdot = 0;     // index of the last '.', 0 while none seen

    for (; i < s.length; i++)
    {
        auto c = s[i];
        bool accepted = isalnum(c) != 0;

        switch (c)
        {
            case '.':
                lastdot = i;
                accepted = true;
                break;

            case '-':
            case '_':
            case '?':
            case '=':
            case '%':
            case '&':
            case '/':
            case '+':
            case '#':
            case '~':
                accepted = true;
                break;

            default:
                break;
        }

        if (!accepted)
            break;          // first character that cannot be part of the URL
    }

    if (!lastdot)
        return null;

    return s[0 .. i];
}
4082
4083