Mercurial > projects > ldc
comparison tango/tango/text/Util.d @ 132:1700239cab2e trunk
[svn r136] MAJOR UNSTABLE UPDATE!!!
Initial commit after moving to Tango instead of Phobos.
Lots of bugfixes...
This build is not suitable for most things.
author | lindquist |
---|---|
date | Fri, 11 Jan 2008 17:57:40 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
131:5825d48b27d1 | 132:1700239cab2e |
---|---|
1 /******************************************************************************* | |
2 | |
3 copyright: Copyright (c) 2004 Kris Bell. All rights reserved | |
4 | |
5 license: BSD style: $(LICENSE) | |
6 | |
7 version: Apr 2004: Initial release | |
8 Dec 2006: South Seas version | |
9 | |
10 author: Kris | |
11 | |
12 | |
13 Placeholder for a variety of wee functions. These functions are all | |
14 templated with the intent of being used for arrays of char, wchar, | |
15 and dchar. However, they operate correctly with other array types | |
16 also. | |
17 | |
18 Several of these functions return an index value, representing where | |
19 some criteria was identified. When said criteria is not matched, the | |
20 functions return a value representing the array length provided to | |
21 them. That is, for those scenarios where C functions might typically | |
22 return -1 these functions return length instead. This operate nicely | |
23 with D slices: | |
24 --- | |
25 auto text = "happy:faces"; | |
26 | |
27 assert (text[0 .. locate (text, ':')] == "happy"); | |
28 | |
29 assert (text[0 .. locate (text, '!')] == "happy:faces"); | |
30 --- | |
31 | |
32 The contains() function is more convenient for trivial lookup | |
33 cases: | |
34 --- | |
35 if (contains ("fubar", '!')) | |
36 ... | |
37 --- | |
38 | |
39 Note that where some functions expect a uint as an argument, the | |
40 D template-matching algorithm will fail where an int is provided | |
41 instead. This is the typically the cause of "template not found" | |
42 errors. Also note that name overloading is not supported cleanly | |
43 by IFTI at this time, so is not applied here. | |
44 | |
45 | |
46 Applying the D "import alias" mechanism to this module is highly | |
47 recommended, in order to limit namespace pollution: | |
48 --- | |
49 import Util = tango.text.Util; | |
50 | |
51 auto s = Util.trim (" foo "); | |
52 --- | |
53 | |
54 | |
55 Function templates: | |
56 --- | |
57 trim (source) // trim whitespace | |
58 triml (source) // trim whitespace | |
59 trimr (source) // trim whitespace | |
60 strip (source, match) // trim elements | |
61 stripl (source, match) // trim left elements | |
62 stripr (source, match) // trim right elements | |
63 delimit (src, set) // split on delims | |
64 split (source, pattern) // split on pattern | |
65 splitLines (source); // split on lines | |
66 head (source, pattern, tail) // split to head & tail | |
67 join (source, postfix, output) // join text segments | |
68 repeat (source, count, output) // repeat source | |
69 replace (source, match, replacement) // replace chars | |
70 substitute (source, match, replacement) // replace patterns | |
71 contains (source, match) // has char? | |
72 containsPattern (source, match) // has pattern? | |
73 locate (source, match, start) // find char | |
74 locatePrior (source, match, start) // find prior char | |
75 locatePattern (source, match, start); // find pattern | |
76 locatePatternPrior (source, match, start); // find prior pattern | |
77 indexOf (s*, match, length) // low-level lookup | |
78 mismatch (s1*, s2*, length) // low-level compare | |
79 matching (s1*, s2*, length) // low-level compare | |
80 isSpace (match) // is whitespace? | |
81 layout (destination, format ...) // featherweight printf | |
82 lines (str) // foreach lines | |
83 quotes (str, set) // foreach quotes | |
84 delimiters (str, set) // foreach delimiters | |
85 patterns (str, pattern) // foreach patterns | |
86 --- | |
87 | |
88 *******************************************************************************/ | |
89 | |
90 module tango.text.Util; | |
91 | |
92 /****************************************************************************** | |
93 | |
94 Trim the provided array by stripping whitespace from both | |
95 ends. Returns a slice of the original content | |
96 | |
97 ******************************************************************************/ | |
98 | |
99 T[] trim(T) (T[] source) | |
100 { | |
101 T* head = source.ptr, | |
102 tail = head + source.length; | |
103 | |
104 while (head < tail && isSpace(*head)) | |
105 ++head; | |
106 | |
107 while (tail > head && isSpace(*(tail-1))) | |
108 --tail; | |
109 | |
110 return head [0 .. tail - head]; | |
111 } | |
112 | |
113 /****************************************************************************** | |
114 | |
115 Trim the provided array by stripping whitespace from the left. | |
116 Returns a slice of the original content | |
117 | |
118 ******************************************************************************/ | |
119 | |
120 T[] triml(T) (T[] source) | |
121 { | |
122 T* head = source.ptr, | |
123 tail = head + source.length; | |
124 | |
125 while (head < tail && isSpace(*head)) | |
126 ++head; | |
127 | |
128 return head [0 .. tail - head]; | |
129 } | |
130 | |
131 /****************************************************************************** | |
132 | |
133 Trim the provided array by stripping whitespace from the right. | |
134 Returns a slice of the original content | |
135 | |
136 ******************************************************************************/ | |
137 | |
138 T[] trimr(T) (T[] source) | |
139 { | |
140 T* head = source.ptr, | |
141 tail = head + source.length; | |
142 | |
143 while (tail > head && isSpace(*(tail-1))) | |
144 --tail; | |
145 | |
146 return head [0 .. tail - head]; | |
147 } | |
148 | |
149 /****************************************************************************** | |
150 | |
151 Trim the given array by stripping the provided match from | |
152 both ends. Returns a slice of the original content | |
153 | |
154 ******************************************************************************/ | |
155 | |
156 T[] strip(T) (T[] source, T match) | |
157 { | |
158 T* head = source.ptr, | |
159 tail = head + source.length; | |
160 | |
161 while (head < tail && *head is match) | |
162 ++head; | |
163 | |
164 while (tail > head && *(tail-1) is match) | |
165 --tail; | |
166 | |
167 return head [0 .. tail - head]; | |
168 } | |
169 | |
170 /****************************************************************************** | |
171 | |
172 Trim the given array by stripping the provided match from | |
173 the left hand side. Returns a slice of the original content | |
174 | |
175 ******************************************************************************/ | |
176 | |
177 T[] stripl(T) (T[] source, T match) | |
178 { | |
179 T* head = source.ptr, | |
180 tail = head + source.length; | |
181 | |
182 while (head < tail && *head is match) | |
183 ++head; | |
184 | |
185 return head [0 .. tail - head]; | |
186 } | |
187 | |
188 /****************************************************************************** | |
189 | |
190 Chop the given source by stripping the provided match from | |
191 the left hand side. Returns a slice of the original content | |
192 | |
193 ******************************************************************************/ | |
194 | |
195 T[] chopl(T) (T[] source, T[] match) | |
196 { | |
197 if (match.length <= source.length) | |
198 if (source[0 .. match.length] == match) | |
199 source = source [match.length .. $]; | |
200 | |
201 return source; | |
202 } | |
203 | |
204 /****************************************************************************** | |
205 | |
206 Chop the given source by stripping the provided match from | |
207 the right hand side. Returns a slice of the original content | |
208 | |
209 ******************************************************************************/ | |
210 | |
211 T[] chopr(T) (T[] source, T[] match) | |
212 { | |
213 if (match.length <= source.length) | |
214 if (source[$-match.length .. $] == match) | |
215 source = source [0 .. $-match.length]; | |
216 | |
217 return source; | |
218 } | |
219 | |
220 /****************************************************************************** | |
221 | |
222 Trim the given array by stripping the provided match from | |
223 the right hand side. Returns a slice of the original content | |
224 | |
225 ******************************************************************************/ | |
226 | |
227 T[] stripr(T) (T[] source, T match) | |
228 { | |
229 T* head = source.ptr, | |
230 tail = head + source.length; | |
231 | |
232 while (tail > head && *(tail-1) is match) | |
233 --tail; | |
234 | |
235 return head [0 .. tail - head]; | |
236 } | |
237 | |
238 /****************************************************************************** | |
239 | |
240 Replace all instances of one element with another (in place) | |
241 | |
242 ******************************************************************************/ | |
243 | |
244 T[] replace(T) (T[] source, T match, T replacement) | |
245 { | |
246 foreach (inout c; source) | |
247 if (c is match) | |
248 c = replacement; | |
249 return source; | |
250 } | |
251 | |
252 /****************************************************************************** | |
253 | |
254 Replace all instances of one array with another | |
255 | |
256 ******************************************************************************/ | |
257 | |
258 T[] substitute(T) (T[] source, T[] match, T[] replacement) | |
259 { | |
260 T[] output; | |
261 | |
262 foreach (s; patterns (source, match, replacement)) | |
263 output ~= s; | |
264 return output; | |
265 } | |
266 | |
267 /****************************************************************************** | |
268 | |
269 Returns whether or not the provided array contains an instance | |
270 of the given match | |
271 | |
272 ******************************************************************************/ | |
273 | |
274 bool contains(T) (T[] source, T match) | |
275 { | |
276 return indexOf (source.ptr, match, source.length) != source.length; | |
277 } | |
278 | |
279 /****************************************************************************** | |
280 | |
281 Returns whether or not the provided array contains an instance | |
282 of the given match | |
283 | |
284 ******************************************************************************/ | |
285 | |
286 bool containsPattern(T) (T[] source, T[] match) | |
287 { | |
288 return locatePattern (source, match) != source.length; | |
289 } | |
290 | |
291 /****************************************************************************** | |
292 | |
293 Return the index of the next instance of 'match' starting at | |
294 position 'start', or source.length where there is no match. | |
295 | |
296 Parameter 'start' defaults to 0 | |
297 | |
298 ******************************************************************************/ | |
299 | |
300 uint locate(T, U=uint) (T[] source, T match, U start=0) | |
301 {return locate!(T) (source, match, start);} | |
302 | |
303 uint locate(T) (T[] source, T match, uint start=0) | |
304 { | |
305 if (start > source.length) | |
306 start = source.length; | |
307 | |
308 return indexOf (source.ptr+start, match, source.length - start) + start; | |
309 } | |
310 | |
311 /****************************************************************************** | |
312 | |
313 Return the index of the prior instance of 'match' starting | |
314 just before 'start', or source.length where there is no match. | |
315 | |
316 Parameter 'start' defaults to source.length | |
317 | |
318 ******************************************************************************/ | |
319 | |
320 uint locatePrior(T, U=uint) (T[] source, T match, U start=uint.max) | |
321 {return locatePrior!(T)(source, match, start);} | |
322 | |
323 uint locatePrior(T) (T[] source, T match, uint start=uint.max) | |
324 { | |
325 if (start > source.length) | |
326 start = source.length; | |
327 | |
328 while (start > 0) | |
329 if (source[--start] is match) | |
330 return start; | |
331 return source.length; | |
332 } | |
333 | |
334 /****************************************************************************** | |
335 | |
336 Return the index of the next instance of 'match' starting at | |
337 position 'start', or source.length where there is no match. | |
338 | |
339 Parameter 'start' defaults to 0 | |
340 | |
341 ******************************************************************************/ | |
342 | |
343 uint locatePattern(T, U=uint) (T[] source, T[] match, U start=0) | |
344 {return locatePattern!(T) (source, match, start);} | |
345 | |
346 uint locatePattern(T) (T[] source, T[] match, uint start=0) | |
347 { | |
348 uint idx; | |
349 T* p = source.ptr + start; | |
350 uint extent = source.length - start - match.length + 1; | |
351 | |
352 if (match.length && extent <= source.length) | |
353 while (extent) | |
354 if ((idx = indexOf (p, match[0], extent)) is extent) | |
355 break; | |
356 else | |
357 if (matching (p+=idx, match.ptr, match.length)) | |
358 return p - source.ptr; | |
359 else | |
360 { | |
361 extent -= (idx+1); | |
362 ++p; | |
363 } | |
364 | |
365 return source.length; | |
366 } | |
367 | |
368 /****************************************************************************** | |
369 | |
370 Return the index of the prior instance of 'match' starting | |
371 just before 'start', or source.length where there is no match. | |
372 | |
373 Parameter 'start' defaults to source.length | |
374 | |
375 ******************************************************************************/ | |
376 | |
377 uint locatePatternPrior(T, U=uint) (T[] source, T[] match, U start=uint.max) | |
378 {return locatePatternPrior!(T)(source, match, start);} | |
379 | |
380 uint locatePatternPrior(T) (T[] source, T[] match, uint start=uint.max) | |
381 { | |
382 auto len = source.length; | |
383 | |
384 if (start > len) | |
385 start = len; | |
386 | |
387 if (match.length && match.length <= len) | |
388 while (start) | |
389 { | |
390 start = locatePrior (source, match[0], start); | |
391 if (start is len) | |
392 break; | |
393 else | |
394 if ((start + match.length) <= len) | |
395 if (matching (source.ptr+start, match.ptr, match.length)) | |
396 return start; | |
397 } | |
398 | |
399 return len; | |
400 } | |
401 | |
402 /****************************************************************************** | |
403 | |
404 Split the provided array on the first pattern instance, and | |
405 return the resultant head and tail. The pattern is excluded | |
406 from the two segments. | |
407 | |
408 Where a segment is not found, tail will be null and the return | |
409 value will be the original array. | |
410 | |
411 ******************************************************************************/ | |
412 | |
413 T[] head(T) (T[] src, T[] pattern, out T[] tail) | |
414 { | |
415 auto i = locatePattern (src, pattern); | |
416 if (i != src.length) | |
417 { | |
418 tail = src [i + pattern.length .. $]; | |
419 src = src [0 .. i]; | |
420 } | |
421 return src; | |
422 } | |
423 | |
424 /****************************************************************************** | |
425 | |
426 Split the provided array on the last pattern instance, and | |
427 return the resultant head and tail. The pattern is excluded | |
428 from the two segments. | |
429 | |
430 Where a segment is not found, head will be null and the return | |
431 value will be the original array. | |
432 | |
433 ******************************************************************************/ | |
434 | |
435 T[] tail(T) (T[] src, T[] pattern, out T[] head) | |
436 { | |
437 auto i = locatePatternPrior (src, pattern); | |
438 if (i != src.length) | |
439 { | |
440 head = src [0 .. i]; | |
441 src = src [i + pattern.length .. $]; | |
442 } | |
443 return src; | |
444 } | |
445 | |
446 /****************************************************************************** | |
447 | |
448 Split the provided array wherever a delimiter-set instance is | |
449 found, and return the resultant segments. The delimiters are | |
450 excluded from each of the segments. Note that delimiters are | |
451 matched as a set of alternates rather than as a pattern. | |
452 | |
453 Splitting on a single delimiter is considerably faster than | |
454 splitting upon a set of alternatives | |
455 | |
456 ******************************************************************************/ | |
457 | |
458 T[][] delimit(T) (T[] src, T[] set) | |
459 { | |
460 T[][] result; | |
461 | |
462 foreach (segment; delimiters (src, set)) | |
463 result ~= segment; | |
464 return result; | |
465 } | |
466 | |
467 /****************************************************************************** | |
468 | |
469 Split the provided array wherever a pattern instance is | |
470 found, and return the resultant segments. The pattern is | |
471 excluded from each of the segments. | |
472 | |
473 ******************************************************************************/ | |
474 | |
475 T[][] split(T) (T[] src, T[] pattern) | |
476 { | |
477 T[][] result; | |
478 | |
479 foreach (segment; patterns (src, pattern)) | |
480 result ~= segment; | |
481 return result; | |
482 } | |
483 | |
484 /****************************************************************************** | |
485 | |
486 Convert text into a set of lines, where each line is identified | |
487 by a \n or \r\n combination. The line terminator is stripped from | |
488 each resultant array | |
489 | |
490 ******************************************************************************/ | |
491 | |
492 T[][] splitLines(T) (T[] src) | |
493 { | |
494 int count; | |
495 | |
496 foreach (line; lines (src)) | |
497 ++count; | |
498 | |
499 T[][] result = new T[][count]; | |
500 | |
501 count = 0; | |
502 foreach (line; lines (src)) | |
503 result [count++] = line; | |
504 | |
505 return result; | |
506 } | |
507 | |
508 /****************************************************************************** | |
509 | |
510 Combine a series of text segments together, each appended with an | |
511 optional postfix pattern. An optional output buffer can be provided | |
512 to avoid heap activity - it should be large enough to contain the | |
513 entire output, otherwise the heap will be used instead. | |
514 | |
515 Returns a valid slice of the output, containing the concatenated | |
516 text. | |
517 | |
518 ******************************************************************************/ | |
519 | |
520 T[] join(T) (T[][] src, T[] postfix=null, T[] dst=null) | |
521 { | |
522 uint len = src.length * postfix.length; | |
523 | |
524 foreach (segment; src) | |
525 len += segment.length; | |
526 | |
527 if (dst.length < len) | |
528 dst.length = len; | |
529 | |
530 T* p = dst.ptr; | |
531 foreach (segment; src) | |
532 { | |
533 p[0 .. segment.length] = segment; | |
534 p += segment.length; | |
535 p[0 .. postfix.length] = postfix; | |
536 p += postfix.length; | |
537 } | |
538 | |
539 // remove trailing seperator | |
540 if (len) | |
541 len -= postfix.length; | |
542 return dst [0 .. len]; | |
543 } | |
544 | |
545 /****************************************************************************** | |
546 | |
547 Combine a series of text segments together, each appended with an | |
548 optional postfix pattern. An optional output buffer can be provided | |
549 to avoid heap activity - it should be large enough to contain the | |
550 entire output, otherwise the heap will be used instead. | |
551 | |
552 Returns a valid slice of the output, containing the concatenated | |
553 text. | |
554 | |
555 ******************************************************************************/ | |
556 | |
557 T[] repeat(T, U=uint) (T[] src, U count, T[] dst=null) | |
558 {return repeat!(T)(src, count, dst);} | |
559 | |
560 T[] repeat(T) (T[] src, uint count, T[] dst=null) | |
561 { | |
562 uint len = src.length * count; | |
563 if (len is 0) | |
564 return null; | |
565 | |
566 if (dst.length < len) | |
567 dst.length = len; | |
568 | |
569 for (auto p = dst.ptr; count--; p += src.length) | |
570 p[0 .. src.length] = src; | |
571 | |
572 return dst [0 .. len]; | |
573 } | |
574 | |
575 /****************************************************************************** | |
576 | |
577 Is the argument a whitespace character? | |
578 | |
579 ******************************************************************************/ | |
580 | |
581 bool isSpace(T) (T c) | |
582 { | |
583 static if (T.sizeof is 1) | |
584 return (c <= 32 && (c is ' ' | c is '\t' | c is '\r' | c is '\n' | c is '\f' | c is '\v')); | |
585 else | |
586 return (c <= 32 && (c is ' ' | c is '\t' | c is '\r' | c is '\n' | c is '\f' | c is '\v')) || (c is '\u2028' | c is '\u2029'); | |
587 } | |
588 | |
589 /****************************************************************************** | |
590 | |
591 Return whether or not the two arrays have matching content | |
592 | |
593 ******************************************************************************/ | |
594 | |
595 bool matching(T, U=uint) (T* s1, T* s2, U length) | |
596 {return matching!(T) (s1, s2, length);} | |
597 | |
598 bool matching(T) (T* s1, T* s2, uint length) | |
599 { | |
600 return mismatch(s1, s2, length) is length; | |
601 } | |
602 | |
603 /****************************************************************************** | |
604 | |
605 Returns the index of the first match in str, failing once | |
606 length is reached. Note that we return 'length' for failure | |
607 and a 0-based index on success | |
608 | |
609 ******************************************************************************/ | |
610 | |
611 uint indexOf(T, U=uint) (T* str, T match, U length) | |
612 {return indexOf!(T) (str, match, length);} | |
613 | |
614 uint indexOf(T) (T* str, T match, uint length) | |
615 { | |
616 version (D_InlineAsm_X86) | |
617 { | |
618 static if (T.sizeof == 1) | |
619 { | |
620 asm { | |
621 mov EDI, str; | |
622 mov ECX, length; | |
623 movzx EAX, match; | |
624 mov ESI, ECX; | |
625 and ESI, ESI; | |
626 jz end; | |
627 | |
628 cld; | |
629 repnz; | |
630 scasb; | |
631 jnz end; | |
632 sub ESI, ECX; | |
633 dec ESI; | |
634 end:; | |
635 mov EAX, ESI; | |
636 } | |
637 } | |
638 else static if (T.sizeof == 2) | |
639 { | |
640 asm { | |
641 mov EDI, str; | |
642 mov ECX, length; | |
643 movzx EAX, match; | |
644 mov ESI, ECX; | |
645 and ESI, ESI; | |
646 jz end; | |
647 | |
648 cld; | |
649 repnz; | |
650 scasw; | |
651 jnz end; | |
652 sub ESI, ECX; | |
653 dec ESI; | |
654 end:; | |
655 mov EAX, ESI; | |
656 } | |
657 } | |
658 else static if (T.sizeof == 4) | |
659 { | |
660 asm { | |
661 mov EDI, str; | |
662 mov ECX, length; | |
663 mov EAX, match; | |
664 mov ESI, ECX; | |
665 and ESI, ESI; | |
666 jz end; | |
667 | |
668 cld; | |
669 repnz; | |
670 scasd; | |
671 jnz end; | |
672 sub ESI, ECX; | |
673 dec ESI; | |
674 end:; | |
675 mov EAX, ESI; | |
676 } | |
677 } | |
678 else | |
679 { | |
680 auto len = length; | |
681 for (auto p=str-1; len--;) | |
682 if (*++p is match) | |
683 return p - str; | |
684 return length; | |
685 } | |
686 } | |
687 else | |
688 { | |
689 auto len = length; | |
690 for (auto p=str-1; len--;) | |
691 if (*++p is match) | |
692 return p - str; | |
693 return length; | |
694 } | |
695 } | |
696 | |
697 /****************************************************************************** | |
698 | |
699 Returns the index of a mismatch between s1 & s2, failing when | |
700 length is reached. Note that we return 'length' upon failure | |
701 (array content matches) and a 0-based index upon success. | |
702 | |
703 Use this as a faster opEquals (the assembler version). Also | |
704 provides the basis for a much faster opCmp, since the index | |
705 of the first mismatched character can be used to determine | |
706 the (greater or less than zero) return value | |
707 | |
708 ******************************************************************************/ | |
709 | |
710 uint mismatch(T, U=uint) (T* s1, T* s2, U length) | |
711 {return mismatch!(T)(s1, s2, length);} | |
712 | |
713 uint mismatch(T) (T* s1, T* s2, uint length) | |
714 { | |
715 version (D_InlineAsm_X86) | |
716 { | |
717 static if (T.sizeof == 1) | |
718 { | |
719 asm { | |
720 mov EDI, s1; | |
721 mov ESI, s2; | |
722 mov ECX, length; | |
723 mov EAX, ECX; | |
724 and EAX, EAX; | |
725 jz end; | |
726 | |
727 cld; | |
728 repz; | |
729 cmpsb; | |
730 jz end; | |
731 sub EAX, ECX; | |
732 dec EAX; | |
733 end:; | |
734 } | |
735 } | |
736 else static if (T.sizeof == 2) | |
737 { | |
738 asm { | |
739 mov EDI, s1; | |
740 mov ESI, s2; | |
741 mov ECX, length; | |
742 mov EAX, ECX; | |
743 and EAX, EAX; | |
744 jz end; | |
745 | |
746 cld; | |
747 repz; | |
748 cmpsw; | |
749 jz end; | |
750 sub EAX, ECX; | |
751 dec EAX; | |
752 sar ESI, 1; | |
753 end:; | |
754 } | |
755 } | |
756 else static if (T.sizeof == 4) | |
757 { | |
758 asm { | |
759 mov EDI, s1; | |
760 mov ESI, s2; | |
761 mov ECX, length; | |
762 mov EAX, ECX; | |
763 and EAX, EAX; | |
764 jz end; | |
765 | |
766 cld; | |
767 repz; | |
768 cmpsd; | |
769 jz end; | |
770 sub EAX, ECX; | |
771 dec EAX; | |
772 sar ESI, 2; | |
773 end:; | |
774 } | |
775 } | |
776 else | |
777 { | |
778 auto len = length; | |
779 for (auto p=s1-1; len--;) | |
780 if (*++p != *s2++) | |
781 return p - s1; | |
782 return length; | |
783 } | |
784 } | |
785 else | |
786 { | |
787 auto len = length; | |
788 for (auto p=s1-1; len--;) | |
789 if (*++p != *s2++) | |
790 return p - s1; | |
791 return length; | |
792 } | |
793 } | |
794 | |
795 /****************************************************************************** | |
796 | |
797 Freachable iterator to isolate lines. | |
798 | |
799 Converts text into a set of lines, where each line is identified | |
800 by a \n or \r\n combination. The line terminator is stripped from | |
801 each resultant array. | |
802 | |
803 --- | |
804 foreach (line; lines ("one\ntwo\nthree")) | |
805 ... | |
806 --- | |
807 | |
808 ******************************************************************************/ | |
809 | |
810 LineFreach!(T) lines(T) (T[] src) | |
811 { | |
812 LineFreach!(T) lines; | |
813 lines.src = src; | |
814 return lines; | |
815 } | |
816 | |
817 /****************************************************************************** | |
818 | |
819 Freachable iterator to isolate text elements. | |
820 | |
821 Splits the provided array wherever a delimiter-set instance is | |
822 found, and return the resultant segments. The delimiters are | |
823 excluded from each of the segments. Note that delimiters are | |
824 matched as a set of alternates rather than as a pattern. | |
825 | |
826 Splitting on a single delimiter is considerably faster than | |
827 splitting upon a set of alternatives. | |
828 | |
829 --- | |
830 foreach (segment; delimiters ("one,two;three", ",;")) | |
831 ... | |
832 --- | |
833 | |
834 ******************************************************************************/ | |
835 | |
836 DelimFreach!(T) delimiters(T) (T[] src, T[] set) | |
837 { | |
838 DelimFreach!(T) elements; | |
839 elements.set = set; | |
840 elements.src = src; | |
841 return elements; | |
842 } | |
843 | |
844 /****************************************************************************** | |
845 | |
846 Freachable iterator to isolate text elements. | |
847 | |
848 Split the provided array wherever a pattern instance is found, | |
849 and return the resultant segments. Pattern are excluded from | |
850 each of the segments, and an optional sub argument enables | |
851 replacement. | |
852 | |
853 --- | |
854 foreach (segment; patterns ("one, two, three", ", ")) | |
855 ... | |
856 --- | |
857 | |
858 ******************************************************************************/ | |
859 | |
860 PatternFreach!(T) patterns(T) (T[] src, T[] pattern, T[] sub=null) | |
861 { | |
862 PatternFreach!(T) elements; | |
863 elements.pattern = pattern; | |
864 elements.sub = sub; | |
865 elements.src = src; | |
866 return elements; | |
867 } | |
868 | |
869 /****************************************************************************** | |
870 | |
871 Freachable iterator to isolate optionally quoted text elements. | |
872 | |
873 As per elements(), but with the extension of being quote-aware; | |
874 the set of delimiters is ignored inside a pair of quotes. Note | |
875 that an unterminated quote will consume remaining content. | |
876 | |
877 --- | |
878 foreach (quote; quotes ("one two 'three four' five", " ")) | |
879 ... | |
880 --- | |
881 | |
882 ******************************************************************************/ | |
883 | |
884 QuoteFreach!(T) quotes(T) (T[] src, T[] set) | |
885 { | |
886 QuoteFreach!(T) quotes; | |
887 quotes.set = set; | |
888 quotes.src = src; | |
889 return quotes; | |
890 } | |
891 | |
892 /******************************************************************************* | |
893 | |
894 Arranges text strings in order, using indices to specify where | |
895 each particular argument should be positioned within the text. | |
896 This is handy for collating I18N components, or as a simplistic | |
897 and lightweight formatter. Indices range from zero through nine. | |
898 | |
899 --- | |
900 // write ordered text to the console | |
901 char[64] tmp; | |
902 | |
903 Cout (layout (tmp, "%1 is after %0", "zero", "one")).newline; | |
904 --- | |
905 | |
906 *******************************************************************************/ | |
907 | |
908 T[] layout(T) (T[] output, T[][] layout ...) | |
909 { | |
910 static T[] badarg = "{index out of range}"; | |
911 static T[] toosmall = "{output buffer too small}"; | |
912 | |
913 int pos, | |
914 args; | |
915 bool state; | |
916 | |
917 args = layout.length - 1; | |
918 foreach (c; layout[0]) | |
919 { | |
920 if (state) | |
921 { | |
922 state = false; | |
923 if (c >= '0' && c <= '9') | |
924 { | |
925 uint index = c - '0'; | |
926 if (index < args) | |
927 { | |
928 T[] x = layout[index+1]; | |
929 | |
930 int limit = pos + x.length; | |
931 if (limit < output.length) | |
932 { | |
933 output [pos .. limit] = x; | |
934 pos = limit; | |
935 continue; | |
936 } | |
937 else | |
938 return toosmall; | |
939 } | |
940 else | |
941 return badarg; | |
942 } | |
943 } | |
944 else | |
945 if (c is '%') | |
946 { | |
947 state = true; | |
948 continue; | |
949 } | |
950 | |
951 if (pos < output.length) | |
952 { | |
953 output[pos] = c; | |
954 ++pos; | |
955 } | |
956 else | |
957 return toosmall; | |
958 } | |
959 | |
960 return output [0..pos]; | |
961 } | |
962 | |
963 /****************************************************************************** | |
964 | |
965 jhash() -- hash a variable-length key into a 32-bit value | |
966 | |
967 k : the key (the unaligned variable-length array of bytes) | |
968 len : the length of the key, counting by bytes | |
969 level : can be any 4-byte value | |
970 | |
971 Returns a 32-bit value. Every bit of the key affects every bit of | |
972 the return value. Every 1-bit and 2-bit delta achieves avalanche. | |
973 | |
974 About 4.3*len + 80 X86 instructions, with excellent pipelining | |
975 | |
976 The best hash table sizes are powers of 2. There is no need to do | |
977 mod a prime (mod is sooo slow!). If you need less than 32 bits, | |
978 use a bitmask. For example, if you need only 10 bits, do | |
979 | |
980 h = (h & hashmask(10)); | |
981 | |
982 In which case, the hash table should have hashsize(10) elements. | |
983 If you are hashing n strings (ub1 **)k, do it like this: | |
984 | |
985 for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h); | |
986 | |
987 By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use | |
988 this code any way you wish, private, educational, or commercial. | |
989 It's free. | |
990 | |
991 See http://burlteburtle.net/bob/hash/evahash.html | |
992 Use for hash table lookup, or anything where one collision in 2^32 | |
993 is acceptable. Do NOT use for cryptographic purposes. | |
994 | |
995 ******************************************************************************/ | |
996 | |
997 uint jhash (ubyte* k, uint len, uint c = 0) | |
998 { | |
999 uint a = 0x9e3779b9, | |
1000 b = 0x9e3779b9, | |
1001 i = len; | |
1002 | |
1003 // handle most of the key | |
1004 while (i >= 12) | |
1005 { | |
1006 a += *cast(uint *)(k+0); | |
1007 b += *cast(uint *)(k+4); | |
1008 c += *cast(uint *)(k+8); | |
1009 | |
1010 a -= b; a -= c; a ^= (c>>13); | |
1011 b -= c; b -= a; b ^= (a<<8); | |
1012 c -= a; c -= b; c ^= (b>>13); | |
1013 a -= b; a -= c; a ^= (c>>12); | |
1014 b -= c; b -= a; b ^= (a<<16); | |
1015 c -= a; c -= b; c ^= (b>>5); | |
1016 a -= b; a -= c; a ^= (c>>3); | |
1017 b -= c; b -= a; b ^= (a<<10); | |
1018 c -= a; c -= b; c ^= (b>>15); | |
1019 k += 12; i -= 12; | |
1020 } | |
1021 | |
1022 // handle the last 11 bytes | |
1023 c += len; | |
1024 switch (i) | |
1025 { | |
1026 case 11: c+=(cast(uint)k[10]<<24); | |
1027 case 10: c+=(cast(uint)k[9]<<16); | |
1028 case 9 : c+=(cast(uint)k[8]<<8); | |
1029 case 8 : b+=(cast(uint)k[7]<<24); | |
1030 case 7 : b+=(cast(uint)k[6]<<16); | |
1031 case 6 : b+=(cast(uint)k[5]<<8); | |
1032 case 5 : b+=(cast(uint)k[4]); | |
1033 case 4 : a+=(cast(uint)k[3]<<24); | |
1034 case 3 : a+=(cast(uint)k[2]<<16); | |
1035 case 2 : a+=(cast(uint)k[1]<<8); | |
1036 case 1 : a+=(cast(uint)k[0]); | |
1037 default: | |
1038 } | |
1039 | |
1040 a -= b; a -= c; a ^= (c>>13); | |
1041 b -= c; b -= a; b ^= (a<<8); | |
1042 c -= a; c -= b; c ^= (b>>13); | |
1043 a -= b; a -= c; a ^= (c>>12); | |
1044 b -= c; b -= a; b ^= (a<<16); | |
1045 c -= a; c -= b; c ^= (b>>5); | |
1046 a -= b; a -= c; a ^= (c>>3); | |
1047 b -= c; b -= a; b ^= (a<<10); | |
1048 c -= a; c -= b; c ^= (b>>15); | |
1049 | |
1050 return c; | |
1051 } | |
1052 | |
1053 /// ditto | |
1054 uint jhash (void[] x, uint c = 0) | |
1055 { | |
1056 return jhash (cast(ubyte*) x.ptr, x.length, c); | |
1057 } | |
1058 | |
1059 | |
1060 /****************************************************************************** | |
1061 | |
1062 Helper struct for iterator lines() | |
1063 | |
1064 ******************************************************************************/ | |
1065 | |
1066 private struct LineFreach(T) | |
1067 { | |
1068 private T[] src; | |
1069 | |
1070 int opApply (int delegate (inout T[] line) dg) | |
1071 { | |
1072 uint ret, | |
1073 pos, | |
1074 mark; | |
1075 T[] line; | |
1076 | |
1077 const T nl = '\n'; | |
1078 const T cr = '\r'; | |
1079 | |
1080 while ((pos = locate (src, nl, mark)) < src.length) | |
1081 { | |
1082 auto end = pos; | |
1083 if (end && src[end-1] is cr) | |
1084 --end; | |
1085 | |
1086 line = src [mark .. end]; | |
1087 if ((ret = dg (line)) != 0) | |
1088 return ret; | |
1089 mark = pos + 1; | |
1090 } | |
1091 | |
1092 line = src [mark .. $]; | |
1093 if (mark < src.length) | |
1094 ret = dg (line); | |
1095 | |
1096 return ret; | |
1097 } | |
1098 } | |
1099 | |
1100 /****************************************************************************** | |
1101 | |
1102 Helper struct for iterator delimiters() | |
1103 | |
1104 ******************************************************************************/ | |
1105 | |
1106 private struct DelimFreach(T) | |
1107 { | |
1108 private T[] src; | |
1109 private T[] set; | |
1110 | |
1111 int opApply (int delegate (inout T[] token) dg) | |
1112 { | |
1113 uint ret, | |
1114 pos, | |
1115 mark; | |
1116 T[] token; | |
1117 | |
1118 // optimize for single delimiter case | |
1119 if (set.length is 1) | |
1120 while ((pos = locate (src, set[0], mark)) < src.length) | |
1121 { | |
1122 token = src [mark .. pos]; | |
1123 if ((ret = dg (token)) != 0) | |
1124 return ret; | |
1125 mark = pos + 1; | |
1126 } | |
1127 else | |
1128 if (set.length > 1) | |
1129 foreach (i, elem; src) | |
1130 if (contains (set, elem)) | |
1131 { | |
1132 token = src [mark .. i]; | |
1133 if ((ret = dg (token)) != 0) | |
1134 return ret; | |
1135 mark = i + 1; | |
1136 } | |
1137 | |
1138 token = src [mark .. $]; | |
1139 if (mark < src.length) | |
1140 ret = dg (token); | |
1141 | |
1142 return ret; | |
1143 } | |
1144 } | |
1145 | |
1146 /****************************************************************************** | |
1147 | |
1148 Helper struct for iterator patterns() | |
1149 | |
1150 ******************************************************************************/ | |
1151 | |
1152 private struct PatternFreach(T) | |
1153 { | |
1154 private T[] src, | |
1155 sub, | |
1156 pattern; | |
1157 | |
1158 int opApply (int delegate (inout T[] token) dg) | |
1159 { | |
1160 uint ret, | |
1161 pos, | |
1162 mark; | |
1163 T[] token; | |
1164 | |
1165 // optimize for single-element pattern | |
1166 if (pattern.length is 1) | |
1167 while ((pos = locate (src, pattern[0], mark)) < src.length) | |
1168 { | |
1169 token = src [mark .. pos]; | |
1170 if ((ret = dg(token)) != 0) | |
1171 return ret; | |
1172 if (sub.ptr) | |
1173 if ((ret = dg(sub)) != 0) | |
1174 return ret; | |
1175 mark = pos + 1; | |
1176 } | |
1177 else | |
1178 if (pattern.length > 1) | |
1179 while ((pos = locatePattern (src, pattern, mark)) < src.length) | |
1180 { | |
1181 token = src [mark .. pos]; | |
1182 if ((ret = dg(token)) != 0) | |
1183 return ret; | |
1184 if (sub.ptr) | |
1185 if ((ret = dg(sub)) != 0) | |
1186 return ret; | |
1187 mark = pos + pattern.length; | |
1188 } | |
1189 | |
1190 token = src [mark .. $]; | |
1191 if (mark < src.length) | |
1192 ret = dg (token); | |
1193 | |
1194 return ret; | |
1195 } | |
1196 } | |
1197 | |
1198 /****************************************************************************** | |
1199 | |
1200 Helper struct for iterator quotes() | |
1201 | |
1202 ******************************************************************************/ | |
1203 | |
1204 private struct QuoteFreach(T) | |
1205 { | |
1206 private T[] src; | |
1207 private T[] set; | |
1208 | |
1209 int opApply (int delegate (inout T[] token) dg) | |
1210 { | |
1211 int ret, | |
1212 mark; | |
1213 T[] token; | |
1214 | |
1215 if (set.length) | |
1216 for (uint i=0; i < src.length; ++i) | |
1217 { | |
1218 T c = src[i]; | |
1219 if (c is '"' || c is '\'') | |
1220 i = locate (src, c, i+1); | |
1221 else | |
1222 if (contains (set, c)) | |
1223 { | |
1224 token = src [mark .. i]; | |
1225 if ((ret = dg (token)) != 0) | |
1226 return ret; | |
1227 mark = i + 1; | |
1228 } | |
1229 } | |
1230 | |
1231 token = src [mark .. $]; | |
1232 if (mark < src.length) | |
1233 ret = dg (token); | |
1234 | |
1235 return ret; | |
1236 } | |
1237 } | |
1238 | |
1239 | |
1240 /****************************************************************************** | |
1241 | |
1242 ******************************************************************************/ | |
1243 | |
1244 debug (UnitTest) | |
1245 { | |
1246 //void main() {} | |
1247 | |
1248 unittest | |
1249 { | |
1250 char[64] tmp; | |
1251 | |
1252 assert (isSpace (' ') && !isSpace ('d')); | |
1253 | |
1254 assert (indexOf ("abc".ptr, 'a', 3) is 0); | |
1255 assert (indexOf ("abc".ptr, 'b', 3) is 1); | |
1256 assert (indexOf ("abc".ptr, 'c', 3) is 2); | |
1257 assert (indexOf ("abc".ptr, 'd', 3) is 3); | |
1258 | |
1259 assert (indexOf ("abc"d.ptr, cast(dchar)'c', 3) is 2); | |
1260 assert (indexOf ("abc"d.ptr, cast(dchar)'d', 3) is 3); | |
1261 | |
1262 assert (indexOf ("abc"w.ptr, cast(wchar)'c', 3) is 2); | |
1263 assert (indexOf ("abc"w.ptr, cast(wchar)'d', 3) is 3); | |
1264 | |
1265 assert (mismatch ("abc".ptr, "abc".ptr, 3) is 3); | |
1266 assert (mismatch ("abc".ptr, "abd".ptr, 3) is 2); | |
1267 assert (mismatch ("abc".ptr, "acc".ptr, 3) is 1); | |
1268 assert (mismatch ("abc".ptr, "ccc".ptr, 3) is 0); | |
1269 | |
1270 assert (mismatch ("abc"w.ptr, "abc"w.ptr, 3) is 3); | |
1271 assert (mismatch ("abc"w.ptr, "acc"w.ptr, 3) is 1); | |
1272 | |
1273 assert (mismatch ("abc"d.ptr, "abc"d.ptr, 3) is 3); | |
1274 assert (mismatch ("abc"d.ptr, "acc"d.ptr, 3) is 1); | |
1275 | |
1276 assert (matching ("abc".ptr, "abc".ptr, 3)); | |
1277 assert (matching ("abc".ptr, "abb".ptr, 3) is false); | |
1278 | |
1279 assert (contains ("abc", 'a')); | |
1280 assert (contains ("abc", 'b')); | |
1281 assert (contains ("abc", 'c')); | |
1282 assert (contains ("abc", 'd') is false); | |
1283 | |
1284 assert (containsPattern ("abc", "ab")); | |
1285 assert (containsPattern ("abc", "bc")); | |
1286 assert (containsPattern ("abc", "abc")); | |
1287 assert (containsPattern ("abc", "zabc") is false); | |
1288 assert (containsPattern ("abc", "abcd") is false); | |
1289 assert (containsPattern ("abc", "za") is false); | |
1290 assert (containsPattern ("abc", "cd") is false); | |
1291 | |
1292 assert (trim ("") == ""); | |
1293 assert (trim (" abc ") == "abc"); | |
1294 assert (trim (" ") == ""); | |
1295 | |
1296 assert (strip ("", '%') == ""); | |
1297 assert (strip ("%abc%%%", '%') == "abc"); | |
1298 assert (strip ("#####", '#') == ""); | |
1299 assert (stripl ("#####", '#') == ""); | |
1300 assert (stripl (" ###", ' ') == "###"); | |
1301 assert (stripl ("#####", 's') == "#####"); | |
1302 assert (stripr ("#####", '#') == ""); | |
1303 assert (stripr ("### ", ' ') == "###"); | |
1304 assert (stripr ("#####", 's') == "#####"); | |
1305 | |
1306 assert (replace ("abc".dup, 'b', ':') == "a:c"); | |
1307 assert (substitute ("abc".dup, "bc", "x") == "ax"); | |
1308 | |
1309 assert (locate ("abc", 'c', 1) is 2); | |
1310 | |
1311 assert (locate ("abc", 'c') is 2); | |
1312 assert (locate ("abc", 'a') is 0); | |
1313 assert (locate ("abc", 'd') is 3); | |
1314 assert (locate ("", 'c') is 0); | |
1315 | |
1316 assert (locatePrior ("abce", 'c') is 2); | |
1317 assert (locatePrior ("abce", 'a') is 0); | |
1318 assert (locatePrior ("abce", 'd') is 4); | |
1319 assert (locatePrior ("abce", 'c', 3) is 2); | |
1320 assert (locatePrior ("abce", 'c', 2) is 4); | |
1321 assert (locatePrior ("", 'c') is 0); | |
1322 | |
1323 auto x = delimit ("::b", ":"); | |
1324 assert (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b"); | |
1325 x = delimit ("a:bc:d", ":"); | |
1326 assert (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d"); | |
1327 x = delimit ("abcd", ":"); | |
1328 assert (x.length is 1 && x[0] == "abcd"); | |
1329 x = delimit ("abcd:", ":"); | |
1330 assert (x.length is 1 && x[0] == "abcd"); | |
1331 x = delimit ("a;b$c#d:e@f", ";:$#@"); | |
1332 assert (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" && | |
1333 x[3]=="d" && x[4]=="e" && x[5]=="f"); | |
1334 | |
1335 assert (locatePattern ("abcdefg", "") is 7); | |
1336 assert (locatePattern ("abcdefg", "g") is 6); | |
1337 assert (locatePattern ("abcdefg", "abcdefg") is 0); | |
1338 assert (locatePattern ("abcdefg", "abcdefgx") is 7); | |
1339 assert (locatePattern ("abcdefg", "cce") is 7); | |
1340 assert (locatePattern ("abcdefg", "cde") is 2); | |
1341 assert (locatePattern ("abcdefgcde", "cde", 3) is 7); | |
1342 | |
1343 assert (locatePatternPrior ("abcdefg", "") is 7); | |
1344 assert (locatePatternPrior ("abcdefg", "cce") is 7); | |
1345 assert (locatePatternPrior ("abcdefg", "cde") is 2); | |
1346 assert (locatePatternPrior ("abcdefgcde", "cde", 6) is 2); | |
1347 assert (locatePatternPrior ("abcdefgcde", "cde", 4) is 2); | |
1348 assert (locatePatternPrior ("abcdefg", "abcdefgx") is 7); | |
1349 | |
1350 x = splitLines ("a\nb\n"); | |
1351 assert (x.length is 2 && x[0] == "a" && x[1] == "b"); | |
1352 x = splitLines ("a\r\n"); | |
1353 assert (x.length is 1 && x[0] == "a"); | |
1354 x = splitLines ("a"); | |
1355 assert (x.length is 1 && x[0] == "a"); | |
1356 x = splitLines (""); | |
1357 assert (x.length is 0); | |
1358 | |
1359 char[][] q; | |
1360 foreach (element; quotes ("1 'avcc cc ' 3", " ")) | |
1361 q ~= element; | |
1362 assert (q.length is 3 && q[0] == "1" && q[1] == "'avcc cc '" && q[2] == "3"); | |
1363 | |
1364 assert (layout (tmp, "%1,%%%c %0", "abc", "efg") == "efg,%c abc"); | |
1365 | |
1366 x = split ("one, two, three", ","); | |
1367 assert (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three"); | |
1368 x = split ("one, two, three", ", "); | |
1369 assert (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three"); | |
1370 x = split ("one, two, three", ",,"); | |
1371 assert (x.length is 1 && x[0] == "one, two, three"); | |
1372 | |
1373 char[] h, t; | |
1374 h = head ("one:two:three", ":", t); | |
1375 assert (h == "one" && t == "two:three"); | |
1376 h = head ("one:::two:three", ":::", t); | |
1377 assert (h == "one" && t == "two:three"); | |
1378 h = head ("one:two:three", "*", t); | |
1379 assert (h == "one:two:three" && t is null); | |
1380 | |
1381 t = tail ("one:two:three", ":", h); | |
1382 assert (h == "one:two" && t == "three"); | |
1383 t = tail ("one:::two:three", ":::", h); | |
1384 assert (h == "one" && t == "two:three"); | |
1385 t = tail ("one:two:three", "*", h); | |
1386 assert (t == "one:two:three" && h is null); | |
1387 | |
1388 assert (chopl("hello world", "hello ") == "world"); | |
1389 assert (chopl("hello", "hello") == ""); | |
1390 assert (chopl("hello world", " ") == "hello world"); | |
1391 assert (chopl("hello world", "") == "hello world"); | |
1392 | |
1393 assert (chopr("hello world", " world") == "hello"); | |
1394 assert (chopr("hello", "hello") == ""); | |
1395 assert (chopr("hello world", " ") == "hello world"); | |
1396 assert (chopr("hello world", "") == "hello world"); | |
1397 | |
1398 char[][] foo = ["one", "two", "three"]; | |
1399 auto j = join (foo); | |
1400 assert (j == "onetwothree"); | |
1401 j = join (foo, ", "); | |
1402 assert (j == "one, two, three"); | |
1403 j = join (foo, " ", tmp); | |
1404 assert (j == "one two three"); | |
1405 assert (j.ptr is tmp.ptr); | |
1406 | |
1407 assert (repeat ("abc", 0) == ""); | |
1408 assert (repeat ("abc", 1) == "abc"); | |
1409 assert (repeat ("abc", 2) == "abcabc"); | |
1410 assert (repeat ("abc", 4) == "abcabcabcabc"); | |
1411 assert (repeat ("", 4) == ""); | |
1412 char[10] rep; | |
1413 assert (repeat ("abc", 0, rep) == ""); | |
1414 assert (repeat ("abc", 1, rep) == "abc"); | |
1415 assert (repeat ("abc", 2, rep) == "abcabc"); | |
1416 assert (repeat ("", 4, rep) == ""); | |
1417 } | |
1418 } | |
1419 | |
1420 | |
1421 |