comparison java/src/java/mangoicu/UString.d @ 16:dbfb303e8fb0

first complete successful compile (win-only)
author Frank Benoit <benoit@tionex.de>
date Wed, 18 Mar 2009 08:56:47 +0100
parents
children 9b96950f2c3c
comparison
equal deleted inserted replaced
15:c4b1a29263fc 16:dbfb303e8fb0
1 /*******************************************************************************
2
3 @file UString.d
4
5 Copyright (c) 2004 Kris Bell
6
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the authors be held liable for damages
9 of any kind arising from the use of this software.
10
11 Permission is hereby granted to anyone to use this software for any
12 purpose, including commercial applications, and to alter it and/or
13 redistribute it freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must
16 not claim that you wrote the original software. If you use this
17 software in a product, an acknowledgment within documentation of
18 said product would be appreciated but is not required.
19
20 2. Altered source versions must be plainly marked as such, and must
21 not be misrepresented as being the original software.
22
23 3. This notice may not be removed or altered from any distribution
24 of the source.
25
26 4. Derivative works are permitted, but they must carry this notice
27 in full and credit the original source.
28
29
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31
32
33 @version Initial version, October 2004
34 @author Kris
35
36 Note that this package and documentation is built around the ICU
37 project (http://oss.software.ibm.com/icu/). Below is the license
38 statement as specified by that software:
39
40
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42
43
44 ICU License - ICU 1.8.1 and later
45
46 COPYRIGHT AND PERMISSION NOTICE
47
48 Copyright (c) 1995-2003 International Business Machines Corporation and
49 others.
50
51 All rights reserved.
52
53 Permission is hereby granted, free of charge, to any person obtaining a
54 copy of this software and associated documentation files (the
55 "Software"), to deal in the Software without restriction, including
56 without limitation the rights to use, copy, modify, merge, publish,
57 distribute, and/or sell copies of the Software, and to permit persons
58 to whom the Software is furnished to do so, provided that the above
59 copyright notice(s) and this permission notice appear in all copies of
60 the Software and that both the above copyright notice(s) and this
61 permission notice appear in supporting documentation.
62
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
72
73 Except as contained in this notice, the name of a copyright holder
74 shall not be used in advertising or otherwise to promote the sale, use
75 or other dealings in this Software without prior written authorization
76 of the copyright holder.
77
78 ----------------------------------------------------------------------
79
80 All trademarks and registered trademarks mentioned herein are the
81 property of their respective owners.
82
83 *******************************************************************************/
84
85 module java.mangoicu.UString;
86
87 private import java.mangoicu.ICU,
88 java.mangoicu.UChar,
89 java.mangoicu.ULocale;
90
91 /*******************************************************************************
92
93 *******************************************************************************/
94
95 private extern (C) void memmove (void* dst, void* src, uint bytes);
96
97 /*******************************************************************************
98
99 Bind to the IReadable and IWritable interfaces if we're building
100 along with the mango.io package
101
102 *******************************************************************************/
103
104 version=Isolated;
105 version (Isolated)
106 {
107 private interface ITextOther {}
108 private interface IStringOther {}
109 }
110 else
111 {
112 private import java.mangoicu.UMango;
113
114 private import mango.io.model.IReader,
115 mango.io.model.IWriter;
116
117 private interface ITextOther : IWritable {}
118 private interface IStringOther : IReadable {}
119 }
120
121
122 /*******************************************************************************
123
124 UString is a string class that stores Unicode characters directly
125 and provides similar functionality as the Java String class.
126
127 In ICU, a Unicode string consists of 16-bit Unicode code units.
128 A Unicode character may be stored with either one code unit &#8212;
129 which is the most common case &#8212; or with a matched pair of
130 special code units ("surrogates"). The data type for code units
131 is UChar.
132
133 For single-character handling, a Unicode character code point is
134 a value in the range 0..0x10ffff. ICU uses the UChar32 type for
135 code points.
136
137 Indexes and offsets into and lengths of strings always count code
138 units, not code points. This is the same as with multi-byte char*
139 strings in traditional string handling. Operations on partial
140 strings typically do not test for code point boundaries. If necessary,
141 the user needs to take care of such boundaries by testing for the code
142 unit values or by using functions like getChar32Start()
143 and getChar32Limit()
144
145 UString methods are more lenient with regard to input parameter values
146 than other ICU APIs. In particular:
147
148 - If indexes are out of bounds for a UString object (< 0 or > length)
149 then they are "pinned" to the nearest boundary.
150
151 - If primitive string pointer values (e.g., const wchar* or char*) for
152 input strings are null, then those input string parameters are treated
153 as if they pointed to an empty string. However, this is not the case
154 for char* parameters for charset names or other IDs.
155
156 *******************************************************************************/
157
158 class UString : UStringView, IStringOther
159 {
160 alias opCat append;
161 alias opIndexAssign setCharAt;
162
163 /***********************************************************************
164
165 Create an empty UString with the specified available space
166
167 ***********************************************************************/
168
// Construct an empty, writable UString whose buffer is pre-sized to
// 'space' code units. The valid length ('len') is left at its default.
this (uint space = 0)
{
        mutable = true;
        content.length = space;
}
174
175 /***********************************************************************
176
177 Create a UString upon the provided content. If said content
178 is immutable (read-only) then you might consider setting the
179 'mutable' parameter to false. Doing so will avoid allocating
180 heap-space for the content until it is modified.
181
182 ***********************************************************************/
183
// Construct a UString over 'content'; all the work (copying when
// 'mutable' is true, aliasing otherwise) is done by setTo().
this (wchar[] content, bool mutable = true)
{
        this.setTo (content, mutable);
}
188
189 /***********************************************************************
190
191 Create a UString via the content of a UStringView. Note that the
192 default is to assume the content is immutable (read-only).
193
194 ***********************************************************************/
195
// Construct a UString from the valid content of a UStringView. By
// default the content is aliased (treated as read-only), not copied.
this (UStringView other, bool mutable = false)
{
        wchar[] text = other.get;
        this (text, mutable);
}
200
201 /***********************************************************************
202
203 Create a UString via the content of a UString. If said content
204 is immutable (read-only) then you might consider setting the
205 'mutable' parameter to false. Doing so will avoid allocating
206 heap-space for the content until it is modified via UString
207 methods.
208
209 ***********************************************************************/
210
// Construct a UString from the valid content of another UString. By
// default the content is copied, so the two strings do not share storage.
this (UString other, bool mutable = true)
{
        wchar[] text = other.get;
        this (text, mutable);
}
215
216 /***********************************************************************
217
218 Support for reading content via the IO system
219
220 ***********************************************************************/
221
222 version (Isolated){}
223 else
224 {
225 /***************************************************************
226
227 Internal adapter to handle loading and conversion
228 of UString content. Once constructed, this may be
229 used as the target for an IReader. Alternatively,
230 invoke the load() method with an IBuffer of choice.
231
232 ***************************************************************/
233
class UStringDecoder : StringDecoder16
{
        // the UString that receives the decoded content
        private UString s;

        // bind a decoder for 'bytes' of input to the target UString
        this (UConverter c, uint bytes, UString s)
        {
                super (c, bytes);
                this.s = s;
        }

        // IReadable adapter: decode straight from the reader's buffer
        protected void read (IReader r)
        {
                load (r.buffer);
        }

        // Decode from the provided buffer into the target's content.
        // While 'toGo' (presumably the outstanding input count kept by
        // the base class -- confirm) is non-zero, grow the target and
        // continue decoding after what has already been produced.
        package void load (IBuffer b)
        {
                uint total = super.read (b, s.content);

                while (toGo)
                      {
                      s.expand (toGo);
                      total += super.read (b, s.content[total..$]);
                      }

                // only now does the decoded text become the valid content
                s.len = total;
        }
}
265
266 /***************************************************************
267
268 Another constructor for loading known content length
269 into a UString.
270
271 ***************************************************************/
272
// Construct a UString with 'contentLength' units reserved, then fill
// it by running a one-off UStringDecoder over the given buffer.
this (IBuffer buffer, uint contentLength, UConverter cvt)
{
        this (contentLength);

        UStringDecoder decoder = new UStringDecoder (cvt, contentLength, this);
        decoder.load (buffer);
}
279
280 /***************************************************************
281
282 Read as many bytes from the input as is necessary
283 to produce the expected number of wchar elements.
284 This uses the default wchar handler, which can be
285 altered by binding a StringDecoder to the IReader
286 in use (see UMango for details).
287
288 We're mutable, so ensure we don't mess with the
289 IO buffers. Interestingly, changing the length
290 of a D array will account for slice assignments
291 (it checks the pointer to see if it's a starting
292 point in the pool). Unfortunately, that doesn't
293 catch the case where a slice starts at offset 0,
294 which is where IBuffer slices may come from.
295
296 To be safe, we ask the allocator in use whether
297 the content it provided can be mutated or not.
298 Note that this is not necessary for UStringView, since
299 that is a read-only construct.
300
301 ***************************************************************/
302
// Populate this UString from an IReader. The reader supplies the
// content array; its allocator is then asked whether that array may be
// modified in place, which decides our 'mutable' flag.
void read (IReader r)
{
        r.get (content);
        mutable = r.getAllocator.isMutable (content);
        len = content.length;
}
309
310 /***************************************************************
311
312 Return a streaming decoder that can be used to
313 populate this UString with a specified number of
314 input bytes.
315
316 This differs from the above read() method in the
317 way content is read: in the above case, exactly
318 the specified number of wchar elements will be
319 converted from the input, whereas in this case
320 a variable number of wchar elements are converted
321 until 'bytes' have been read from the input. This
322 is useful in those cases where the original number
323 of elements has been lost, and only the resultant
324 converted byte-count remains (a la HTTP).
325
326 The returned StringDecoder is one-shot only. You may
327 reuse it (both the converter and the byte count) via
328 its reset() method.
329
330 One applies the resultant converter directly with an
331 IReader like so:
332
333 @code
334 UString s = ...;
335 IReader r = ...;
336
337 // r >> s.createDecoder(cvt, bytes);
338 r.get (s.createDecoder(cvt, bytes));
339 @endcode
340
341 which will read the specified number of bytes from
342 the input and convert them to an appropriate number
343 of wchars within the UString.
344
345 ***************************************************************/
346
// Create a decoder that will convert 'bytes' of input through
// converter 'c' into this UString.
StringDecoder createDecoder (UConverter c, uint bytes)
{
        UStringDecoder decoder = new UStringDecoder (c, bytes, this);
        return decoder;
}
351 }
352
353 /***********************************************************************
354
355 Append text to this UString
356
357 ***********************************************************************/
358
// Append the valid content of another string; returns this UString.
UString opCat (UStringView other)
{
        wchar[] text = other.get;
        return opCat (text);
}
363
364 /***********************************************************************
365
366 Append partial text to this UString
367
368 ***********************************************************************/
369
// Append a section of another string. 'start' and 'len' are first
// pinned by the other string's pinIndices(); returns this UString.
UString opCat (UStringView other, uint start, uint len=uint.max)
{
        other.pinIndices (start, len);

        uint end = start + len;
        return opCat (other.content [start..end]);
}
375
376 /***********************************************************************
377
378 Append a single character to this UString
379
380 ***********************************************************************/
381
// Append one code unit by treating the parameter as a one-element run.
UString opCat (wchar chr)
{
        wchar* single = &chr;
        return opCat (single, 1);
}
386
387 /***********************************************************************
388
389 Append text to this UString
390
391 ***********************************************************************/
392
// Append an array of code units; returns this UString.
UString opCat (wchar[] chars)
{
        uint count = chars.length;
        return opCat (chars.ptr, count);
}
397
398 /***********************************************************************
399
400 Converts a sequence of UTF-8 bytes to UChars (UTF-16)
401
402 ***********************************************************************/
403
// Append UTF-8 text, converting it to UTF-16 via u_strFromUTF8.
// Space for chars.length units is requested up front; format() retries
// with a larger buffer if the converter reports an overflow.
UString opCat (char[] chars)
{
        // formatter handed to format(): converts into 'dst' and reports
        // how many code units the conversion produced
        uint convert (wchar* dst, uint len, inout UErrorCode e)
        {
                uint produced;

                u_strFromUTF8 (dst, len, &produced, chars.ptr, chars.length, e);
                return produced;
        }

        expand (chars.length);
        return format (&convert, "failed to append UTF char[]");
}
417
418 /***********************************************************************
419
420 Set a section of this UString to the specified character
421
422 ***********************************************************************/
423
// Fill the section [start, start+len) with 'chr'. The indices are
// pinned first, and a read-only buffer is replaced by a private copy
// before it is written to. Returns this UString.
UString setTo (wchar chr, uint start=0, uint len=uint.max)
{
        pinIndices (start, len);

        if (! mutable)
              realloc ();

        uint end = start + len;
        content [start..end] = chr;
        return this;
}
432
433 /***********************************************************************
434
435 Set the content to the provided array. Parameter 'mutable'
436 specifies whether the given array is likely to change. If
437 not, the array is aliased until such time this UString is
438 altered.
439
440 ***********************************************************************/
441
// Replace the content with 'chars'. When 'mutable' is true the array
// is duplicated; otherwise it is aliased as-is. Returns this UString.
UString setTo (wchar[] chars, bool mutable = true)
{
        this.mutable = mutable;
        len = chars.length;

        if (mutable)
            content = chars.dup;
        else
           content = chars;

        return this;
}
451
452 /***********************************************************************
453
454 Replace the content of this UString. If the new content
455 is immutable (read-only) then you might consider setting the
456 'mutable' parameter to false. Doing so will avoid allocating
457 heap-space for the content until it is modified via one of
458 these methods.
459
460 ***********************************************************************/
461
// Replace the content with the valid content of another string.
UString setTo (UStringView other, bool mutable = true)
{
        wchar[] text = other.get;
        return setTo (text, mutable);
}
466
467 /***********************************************************************
468
469 Replace the content of this UString. If the new content
470 is immutable (read-only) then you might consider setting the
471 'mutable' parameter to false. Doing so will avoid allocating
472 heap-space for the content until it is modified via one of
473 these methods.
474
475 ***********************************************************************/
476
// Replace the content with a section of another string. The indices
// are pinned by the other string's pinIndices() before slicing.
UString setTo (UStringView other, uint start, uint len, bool mutable = true)
{
        other.pinIndices (start, len);

        wchar[] section = other.content [start..start+len];
        return setTo (section, mutable);
}
482
483 /***********************************************************************
484
485 Replace the character at the specified location.
486
487 ***********************************************************************/
488
// Replace the code unit at 'index' and return this UString.
//
// The 'in' contract rejects indices at or beyond the valid length via
// exception() (declared outside this view); the message previously read
// "index of out bounds". A read-only buffer is replaced by a private
// copy before it is written to.
final UString opIndexAssign (wchar chr, uint index)
in {
   if (index >= len)
       exception ("index out of bounds");
   }
body
{
        if (! mutable)
              realloc ();
        content [index] = chr;
        return this;
}
501
502 /***********************************************************************
503
504 Remove a piece of this UString.
505
506 ***********************************************************************/
507
// Remove 'length' code units beginning at 'start' and return this
// UString. The indices are pinned first (presumably so that
// start + length never exceeds the valid length -- confirm against
// pinIndices). Nothing happens when the pinned length is zero.
UString remove (uint start, uint length=uint.max)
{
        pinIndices (start, length);
        if (length)
            if (start >= len)
                truncate (start);
            else
               {
               if (! mutable)
                     realloc ();

               // Close the gap by sliding the tail down. Pointer
               // arithmetic is used rather than &content[i]: when the
               // removed section reaches the end of an exactly-full
               // buffer, i == content.length and indexing it would fail
               // the array bounds check even though zero bytes move.
               uint   i    = start + length;
               wchar* base = content.ptr;
               memmove (base + start, base + i, (len-i) * wchar.sizeof);
               len -= length;
               }
        return this;
}
525
526 /***********************************************************************
527
528 Truncate the length of this UString.
529
530 ***********************************************************************/
531
// Shorten the valid content to 'length' code units. A request longer
// than the current length is ignored. The buffer itself is untouched.
UString truncate (uint length=0)
{
        if (length > len)
            return this;

        len = length;
        return this;
}
538
539 /***********************************************************************
540
541 Insert leading spaces in this UString
542
543 ***********************************************************************/
544
// Insert 'count' copies of 'padChar' (a space by default) in front of
// the existing content and return this UString.
UString padLeading (uint count, wchar padChar = 0x0020)
{
        expand (count);

        // Slide the existing content up to make room. Pointer arithmetic
        // is used rather than &content[count]: with an empty string and
        // a buffer of exactly 'count' units, content[count] is one past
        // the end and indexing it would fail the array bounds check even
        // though zero bytes are moved.
        memmove (content.ptr + count, content.ptr, len * wchar.sizeof);
        len += count;
        return setTo (padChar, 0, count);
}
552
553 /***********************************************************************
554
555 Append some trailing spaces to this UString.
556
557 ***********************************************************************/
558
// Append 'length' copies of 'padChar' (a space by default) and return
// this UString.
UString padTrailing (uint length, wchar padChar = 0x0020)
{
        expand (length);

        // the padding begins where the old content ended
        uint first = len;
        len = first + length;
        return setTo (padChar, first, length);
}
565
566 /***********************************************************************
567
568 Check for available space within the buffer, and expand
569 as necessary.
570
571 ***********************************************************************/
572
// Make sure 'count' more code units can be written after the valid
// content. A read-only (aliased) buffer is always replaced by a
// private copy, even when it already has enough slack: the callers
// write into the buffer right after this returns, and would otherwise
// scribble over storage this UString does not own.
package final void expand (uint count)
{
        if (! mutable || (len + count) > content.length)
              realloc (count);
}

/***********************************************************************

        Allocate memory due to a change in the content. We handle
        the distinction between mutable and immutable here: a
        mutable buffer is resized in place, whereas an immutable
        one is replaced by a fresh array holding a copy of the
        valid content, after which this UString is mutable.

        The new size covers the current buffer plus 'count' units,
        rounded up to a multiple of 64.

***********************************************************************/

private final void realloc (uint count = 0)
{
        uint size = (content.length + count + 63) & ~63;

        if (mutable)
            content.length = size;
        else
           {
           mutable = true;
           wchar[] x = content;
           content = new wchar [size];

           // copy only the valid part: the old buffer can be longer
           // than 'len' (e.g. after truncate), and an array copy needs
           // both sides to have the same length
           if (len)
               content[0..len] = x[0..len];
           }
}
601
602 /***********************************************************************
603
604 Internal method to support UString appending
605
606 ***********************************************************************/
607
// Append 'count' code units starting at 'chars'; every public opCat
// overload for wchar data ends up here. Returns this UString.
private final UString opCat (wchar* chars, uint count)
{
        expand (count);

        uint end = len + count;
        content[len..end] = chars[0..count];
        len = end;
        return this;
}
615
616 /***********************************************************************
617
618 Internal method to support formatting into this UString.
619 This is used by many of the ICU wrappers to append content
620 into a UString.
621
622 ***********************************************************************/
623
// Signature of the callback used by format(): write up to 'len' code
// units at 'dst', report failures through 'e', and return a count of
// code units (used below both as the amount written and, on overflow,
// as the amount of extra room to request).
typedef uint delegate (wchar* dst, uint len, inout UErrorCode e) Formatter;

// Append formatter output to this UString. The formatter is handed the
// free space after the valid content; whenever it reports
// BufferOverflow the buffer is expanded by the returned count and the
// call is retried. Any other error raises exception(msg). Returns this
// UString with 'len' advanced by the count the formatter returned.
package final UString format (Formatter format, char[] msg)
{
        UErrorCode e;
        uint       length;

        while (true)
              {
              e = e.OK;

              // Pointer arithmetic rather than &content[len]: when the
              // buffer is exactly full, content[len] is one past the end
              // and indexing it would fail the array bounds check before
              // the formatter gets the chance to report an overflow.
              length = format (content.ptr + len, content.length - len, e);
              if (e == e.BufferOverflow)
                  expand (length);
              else
                 break;
              }

        if (isError (e))
            exception (msg);

        len += length;
        return this;
}
647 }
648
649
650 /*******************************************************************************
651
652 Immutable (read-only) text -- use UString for mutable strings.
653
654 *******************************************************************************/
655
656 class UStringView : ICU, ITextOther
657 {
658 alias opIndex charAt;
659
660 // the core of the UStringView and UString attributes. The name 'len'
661 // is used rather than the more obvious 'length' since there is
662 // a collision with the silly array[length] syntactic sugar ...
663 package uint len;
664 package wchar[] content;
665
666 // this should probably be in UString only, but there seems to
667 // be a compiler bug where it doesn't get initialised correctly,
668 // and it's perhaps useful to have here for when a UString is
669 // passed as a UStringView argument.
670 private bool mutable;
671
672 // toFolded() argument
673 public enum CaseOption
674 {
675 Default = 0,
676 SpecialI = 1
677 }
678
679 /***********************************************************************
680
681 Hidden constructor
682
683 ***********************************************************************/
684
685 private this ()
686 {
687 }
688
689 /***********************************************************************
690
691 Construct read-only wrapper around the given content
692
693 ***********************************************************************/
694
// Wrap the given array without copying it; the whole array counts as
// valid content.
this (wchar[] content)
{
        this.len = content.length;
        this.content = content;
}
700
701 /***********************************************************************
702
703 Support for writing via the Mango IO subsystem
704
705 ***********************************************************************/
706
707 version (Isolated){}
708 else
709 {
// Emit the valid content to the given writer.
void write (IWriter w)
{
        wchar[] valid = get;
        w.put (valid);
}
714 }
715
716 /***********************************************************************
717
718 Return the valid content from this UStringView
719
720 ***********************************************************************/
721
// Return a slice over the valid content only, i.e. the first 'len'
// code units of the buffer. No copy is made.
final package wchar[] get ()
{
        uint end = len;
        return content [0..end];
}
726
727 /***********************************************************************
728
729 Is this UStringView equal to another?
730
731 ***********************************************************************/
732
// Equality: non-zero when 'o' is a UStringView that is either this
// very object or compares equal to it; zero for anything else.
final override int opEquals (Object o)
{
        UStringView that = cast(UStringView) o;

        if (that is null)
            return 0;
        if (that is this)
            return 1;
        return compare (that) == 0;
}
741
742 /***********************************************************************
743
744 Compare this UStringView to another.
745
746 ***********************************************************************/
747
// Ordering: 0 for the same object, the result of compare() for another
// UStringView, and 1 when 'o' is not a UStringView at all.
final override int opCmp (Object o)
{
        UStringView that = cast(UStringView) o;

        if (that is null)
            return 1;
        if (that is this)
            return 0;
        return compare (that);
}
759
760 /***********************************************************************
761
762 Hash this UStringView
763
764 ***********************************************************************/
765
// Hash the valid content using the runtime's wchar[] hashing.
final override uint toHash ()
{
        wchar[] valid = content[0..len];
        return typeid(wchar[]).getHash (&valid);
}
770
771 /***********************************************************************
772
773 Clone this UStringView into a UString
774
775 ***********************************************************************/
776
// Clone the valid content into a new UString.
//
// Only the first 'len' code units are copied. The buffer may be longer
// than that (a UString keeps spare capacity, and truncate() only lowers
// 'len'), and passing the raw buffer would turn that unused tail into
// part of the clone's text.
final UString copy ()
{
        return new UString (get);
}
781
782 /***********************************************************************
783
784 Clone a section of this UStringView into a UString
785
786 ***********************************************************************/
787
// Clone a section of this string into a new UString. The indices are
// pinned before the section is sliced out.
final UString extract (uint start, uint len=uint.max)
{
        pinIndices (start, len);

        wchar[] section = content [start..start+len];
        return new UString (section);
}
793
794 /***********************************************************************
795
796 Count unicode code points in the length UChar code units of
797 the string. A code point may occupy either one or two UChar
798 code units. Counting code points involves reading all code
799 units.
800
801 ***********************************************************************/
802
// Count the code points in the section [start, start+length) using
// u_countChar32. The indices are pinned first.
final uint codePoints (uint start=0, uint length=uint.max)
{
        pinIndices (start, length);

        // Pointer arithmetic rather than &content[start]: for an empty
        // string, or a start pinned to the very end of the buffer,
        // content[start] does not exist and indexing it would fail the
        // array bounds check even though no units are examined.
        return u_countChar32 (content.ptr + start, length);
}
808
809 /***********************************************************************
810
811 Return an indication whether or not there are surrogate pairs
812 within the string.
813
814 ***********************************************************************/
815
// True when the section holds fewer code points than code units, i.e.
// at least one code point occupies two units.
final bool hasSurrogates (uint start=0, uint length=uint.max)
{
        pinIndices (start, length);

        uint points = codePoints (start, length);
        return points != length;
}
821
822 /***********************************************************************
823
824 Return the character at the specified position.
825
826 ***********************************************************************/
827
// Return the code unit at 'index'.
//
// The 'in' contract rejects indices at or beyond the valid length via
// exception() (declared outside this view); the message previously read
// "index of out bounds".
final wchar opIndex (uint index)
in {
   if (index >= len)
       exception ("index out of bounds");
   }
body
{
        return content [index];
}
837
838 /***********************************************************************
839
840 Return the length of the valid content
841
842 ***********************************************************************/
843
// Number of valid code units (not the size of the underlying buffer).
final uint length ()
{
        return this.len;
}
848
849 /***********************************************************************
850
851 The comparison can be done in code unit order or in code
852 point order. They differ only in UTF-16 when comparing
853 supplementary code points (U+10000..U+10ffff) to BMP code
854 points near the end of the BMP (i.e., U+e000..U+ffff).
855
856 In code unit order, high BMP code points sort after
857 supplementary code points because they are stored as
858 pairs of surrogates which are at U+d800..U+dfff.
859
860 ***********************************************************************/
861
// Compare against the valid content of another string.
final int compare (UStringView other, bool codePointOrder=false)
{
        wchar[] theirs = other.get;
        return compare (theirs, codePointOrder);
}
866
867 /***********************************************************************
868
869 The comparison can be done in code unit order or in code
870 point order. They differ only in UTF-16 when comparing
871 supplementary code points (U+10000..U+10ffff) to BMP code
872 points near the end of the BMP (i.e., U+e000..U+ffff).
873
874 In code unit order, high BMP code points sort after
875 supplementary code points because they are stored as
876 pairs of surrogates which are at U+d800..U+dfff.
877
878 ***********************************************************************/
879
// Compare the valid content against 'other' via u_strCompare, in code
// unit order unless 'codePointOrder' is set.
final int compare (wchar[] other, bool codePointOrder=false)
{
        wchar* mine   = content.ptr;
        wchar* theirs = other.ptr;

        return u_strCompare (mine, len, theirs, other.length, codePointOrder);
}
884
885 /***********************************************************************
886
887 The comparison can be done in UTF-16 code unit order or
888 in code point order. They differ only when comparing
889 supplementary code points (U+10000..U+10ffff) to BMP code
890 points near the end of the BMP (i.e., U+e000..U+ffff).
891
892 In code unit order, high BMP code points sort after
893 supplementary code points because they are stored as
894 pairs of surrogates which are at U+d800..U+dfff.
895
896 ***********************************************************************/
897
// Compare against another string using compareFolded on its valid
// content.
//
// other.get is used rather than other.content so that only the first
// 'len' code units take part, matching compare(UStringView). The raw
// buffer of a UString can carry unused capacity beyond its text.
final int compareFolded (UStringView other, CaseOption option = CaseOption.Default)
{
        return compareFolded (other.get, option);
}
902
903 /***********************************************************************
904
905 The comparison can be done in UTF-16 code unit order or
906 in code point order. They differ only when comparing
907 supplementary code points (U+10000..U+10ffff) to BMP code
908 points near the end of the BMP (i.e., U+e000..U+ffff).
909
910 In code unit order, high BMP code points sort after
911 supplementary code points because they are stored as
912 pairs of surrogates which are at U+d800..U+dfff.
913
914 ***********************************************************************/
915
// Compare the valid content against 'other' by delegating to the
// three-argument compareFolded (declared outside this view).
final int compareFolded (wchar[] other, CaseOption option = CaseOption.Default)
{
        wchar[] mine = get;
        return compareFolded (mine, other, option);
}
920
921 /***********************************************************************
922
923 Does this UStringView start with specified string?
924
925 ***********************************************************************/
926
// Does this string start with the valid content of 'other'?
final bool startsWith (UStringView other)
{
        wchar[] prefix = other.get;
        return startsWith (prefix);
}
931
932 /***********************************************************************
933
934 Does this UStringView start with specified string?
935
936 ***********************************************************************/
937
// Does this string start with 'chars'? Always false when 'chars' is
// longer than the valid content.
// NOTE(review): the match goes through compareFolded, so it looks
// case-insensitive -- confirm against the three-argument overload.
final bool startsWith (wchar[] chars)
{
        if (len < chars.length)
            return false;

        wchar[] head = content[0..chars.length];
        return compareFolded (head, chars) == 0;
}
944
945 /***********************************************************************
946
947 Does this UStringView end with specified string?
948
949 ***********************************************************************/
950
// Does this string end with the valid content of 'other'?
final bool endsWith (UStringView other)
{
        wchar[] suffix = other.get;
        return endsWith (suffix);
}
955
956 /***********************************************************************
957
958 Does this UStringView end with specified string?
959
960 ***********************************************************************/
961
// Does this string end with 'chars'? Always false when 'chars' is
// longer than the valid content.
// NOTE(review): the match goes through compareFolded, so it looks
// case-insensitive -- confirm against the three-argument overload.
final bool endsWith (wchar[] chars)
{
        if (len < chars.length)
            return false;

        wchar[] tail = content[len-chars.length..len];
        return compareFolded (tail, chars) == 0;
}
968
969 /***********************************************************************
970
971 Find the first occurrence of a BMP code point in a string.
972 A surrogate code point is found only if its match in the
973 text is not part of a surrogate pair.
974
975 ***********************************************************************/
976
// Find the first occurrence of code unit 'c' at or after 'start',
// using u_memchr. Returns its index, or uint.max when there is none.
final uint indexOf (wchar c, uint start=0)
{
        pinIndex (start);

        // Pointer arithmetic rather than &content[start]: once 'start'
        // is pinned to the end of the content (presumably what pinIndex
        // does for large values -- confirm), content[start] may be one
        // past the buffer and indexing it would fail the array bounds
        // check instead of simply reporting "not found".
        wchar* s = u_memchr (content.ptr + start, c, len-start);
        if (s)
            return s - content.ptr;
        return uint.max;
}
985
986 /***********************************************************************
987
988 Find the first occurrence of a substring in a string.
989
990 The substring is found at code point boundaries. That means
991 that if the substring begins with a trail surrogate or ends
992 with a lead surrogate, then it is found only if these
993 surrogates stand alone in the text. Otherwise, the substring
994 edge units would be matched against halves of surrogate pairs.
995
996 ***********************************************************************/
997
// Find the first occurrence of the valid content of 'other'.
final uint indexOf (UStringView other, uint start=0)
{
        wchar[] pattern = other.get;
        return indexOf (pattern, start);
}
1002
1003 /***********************************************************************
1004
1005 Find the first occurrence of a substring in a string.
1006
1007 The substring is found at code point boundaries. That means
1008 that if the substring begins with a trail surrogate or ends
1009 with a lead surrogate, then it is found only if these
1010 surrogates stand alone in the text. Otherwise, the substring
1011 edge units would be matched against halves of surrogate pairs.
1012
1013 ***********************************************************************/
1014
// Returns the index of the first occurrence of chars at or after start,
// or uint.max when there is none. start is clamped to the text length.
final uint indexOf (wchar[] chars, uint start=0)
{
    pinIndex (start);

    // Use pointer arithmetic instead of &content[start]: after pinning,
    // start may equal len, and indexing one past the end of a tightly
    // sized array fails the bounds check even though the remaining
    // search range is empty.
    wchar* s = u_strFindFirst (content.ptr + start, len - start, chars.ptr, chars.length);
    if (s)
        return s - content.ptr;
    return uint.max;
}
1023
1024 /***********************************************************************
1025
1026 Find the last occurrence of a BMP code point in a string.
1027 A surrogate code point is found only if its match in the
1028 text is not part of a surrogate pair.
1029
1030 ***********************************************************************/
1031
// Scans the first start code units (start is clamped to the text
// length) and returns the index of the last match, or uint.max.
final uint lastIndexOf (wchar c, uint start=uint.max)
{
    pinIndex (start);

    wchar* hit = u_memrchr (content.ptr, c, start);
    if (hit is null)
        return uint.max;
    return hit - content.ptr;
}
1040
1041 /***********************************************************************
1042
1043 Find the last occurrence of a substring in a string.
1044 The search covers the text before the given start index,
1045 and the substring is matched at code point boundaries.
1046
1047 ***********************************************************************/
1048
// Convenience overload: searches for the characters of the other view
// via lastIndexOf (wchar[], uint).
final uint lastIndexOf (UStringView other, uint start=uint.max)
{
    wchar[] pattern = other.get;
    return lastIndexOf (pattern, start);
}
1053
1054 /***********************************************************************
1055
1056 Find the last occurrence of a substring in a string.
1057
1058 The substring is found at code point boundaries. That means
1059 that if the substring begins with a trail surrogate or ends
1060 with a lead surrogate, then it is found only if these
1061 surrogates stand alone in the text. Otherwise, the substring
1062 edge units would be matched against halves of surrogate pairs.
1063
1064 ***********************************************************************/
1065
// Searches the first start code units (start is clamped to the text
// length) for chars and returns the index of the last match, or
// uint.max when there is none.
final uint lastIndexOf (wchar[] chars, uint start=uint.max)
{
    pinIndex (start);

    wchar* hit = u_strFindLast (content.ptr, start, chars.ptr, chars.length);
    if (hit is null)
        return uint.max;
    return hit - content.ptr;
}
1074
1075 /***********************************************************************
1076
1077 Lowercase the characters into a separate UString.
1078
1079 Casing is locale-dependent and context-sensitive. The
1080 result may be longer or shorter than the original.
1081
1082 Note that the return value refers to the provided destination
1083 UString.
1084
1085 ***********************************************************************/
1086
// Lowercases into dst using ULocale.Default; returns dst's result as
// produced by the locale-taking overload.
final UString toLower (UString dst)
{
    UString lowered = toLower (dst, ULocale.Default);
    return lowered;
}
1091
1092 /***********************************************************************
1093
1094 Lowercase the characters into a separate UString.
1095
1096 Casing is locale-dependent and context-sensitive. The
1097 result may be longer or shorter than the original.
1098
1099 Note that the return value refers to the provided destination
1100 UString.
1101
1102 ***********************************************************************/
1103
// Lowercases this text into dst for the given locale and returns
// whatever dst.format returns.
final UString toLower (UString dst, inout ULocale locale)
{
    // callback handed to dst.format: runs the ICU conversion of this
    // text into the buffer it is given
    uint convert (wchar* target, uint capacity, inout UErrorCode status)
    {
        return u_strToLower (target, capacity, content.ptr, len, ICU.toString(locale.name), status);
    }

    // reserve some slack up front, since the result may be longer
    uint estimate = len + 32;
    dst.expand (estimate);

    return dst.format (&convert, "toLower() failed");
}
1114
1115 /***********************************************************************
1116
1117 Uppercase the characters into a separate UString.
1118
1119 Casing is locale-dependent and context-sensitive. The
1120 result may be longer or shorter than the original.
1121
1122 Note that the return value refers to the provided destination
1123 UString.
1124
1125 ***********************************************************************/
1126
// Uppercases into dst using ULocale.Default; returns dst's result as
// produced by the locale-taking overload.
final UString toUpper (UString dst)
{
    UString raised = toUpper (dst, ULocale.Default);
    return raised;
}
1131
1132 /***********************************************************************
1133
1134 Uppercase the characters into a separate UString.
1135
1136 Casing is locale-dependent and context-sensitive. The
1137 result may be longer or shorter than the original.
1138
1139 Note that the return value refers to the provided destination
1140 UString.
1141
1142 ***********************************************************************/
1143
// Uppercases this text into dst for the given locale and returns
// whatever dst.format returns.
final UString toUpper (UString dst, inout ULocale locale)
{
    // callback handed to dst.format: runs the ICU conversion of this
    // text into the buffer it is given
    uint convert (wchar* target, uint capacity, inout UErrorCode status)
    {
        return u_strToUpper (target, capacity, content.ptr, len, ICU.toString(locale.name), status);
    }

    // reserve some slack up front, since the result may be longer
    uint estimate = len + 32;
    dst.expand (estimate);

    return dst.format (&convert, "toUpper() failed");
}
1154
1155 /***********************************************************************
1156
1157 Case-fold the characters into a separate UString.
1158
1159 Case-folding is locale-independent and not context-sensitive,
1160 but there is an option for whether to include or exclude
1161 mappings for dotted I and dotless i that are marked with 'I'
1162 in CaseFolding.txt. The result may be longer or shorter than
1163 the original.
1164
1165 Note that the return value refers to the provided destination
1166 UString.
1167
1168 ***********************************************************************/
1169
// Case-folds this text into dst with the given option and returns
// whatever dst.format returns.
final UString toFolded (UString dst, CaseOption option = CaseOption.Default)
{
    // callback handed to dst.format: runs the ICU case folding of this
    // text into the buffer it is given
    uint convert (wchar* target, uint capacity, inout UErrorCode status)
    {
        return u_strFoldCase (target, capacity, content.ptr, len, option, status);
    }

    // reserve some slack up front, since the result may be longer
    uint estimate = len + 32;
    dst.expand (estimate);

    return dst.format (&convert, "toFolded() failed");
}
1180
1181 /***********************************************************************
1182
1183 Converts a sequence of wchar (UTF-16) to UTF-8 bytes. If
1184 the output array is not provided, an array of appropriate
1185 size will be allocated and returned. Where the output is
1186 provided, it must be large enough to hold potentially four
1187 bytes per character for surrogate-pairs or three bytes per
1188 character for BMP only. Consider using UConverter where
1189 streaming conversions are required.
1190
1191 Returns an array slice representing the valid UTF8 content.
1192
1193 ***********************************************************************/
1194
// Converts the text to UTF-8 in dst (allocated here when the caller
// passes none) and returns the slice of dst that was written.
final char[] toUtf8 (char[] dst = null)
{
    UErrorCode status;
    uint       produced;

    // no caller buffer: allocate four bytes per UTF-16 code unit
    if (dst.ptr is null)
        dst = new char [len * 4];

    u_strToUTF8 (dst.ptr, dst.length, &produced, content.ptr, len, status);
    testError (status, "failed to convert to UTF8");

    return dst [0 .. produced];
}
1207
1208 /***********************************************************************
1209
1210 Remove leading and trailing whitespace from this UStringView.
1211 Note that we slice the content to remove leading space.
1212
1213 ***********************************************************************/
1214
// Strips trailing and then leading whitespace in place and returns this.
// Trailing space is dropped by shrinking len; leading space is dropped
// by re-slicing content so that it starts at the first kept unit.
UStringView trim ()
{
    wchar c;
    uint  i = len;

    // cut off trailing white space
    while (i && ((c = charAt(i-1)) == 0x20 || UChar.isWhiteSpace (c)))
           --i;
    len = i;

    // now remove leading whitespace
    for (i=0; i < len && ((c = charAt(i)) == 0x20 || UChar.isWhiteSpace (c)); ++i) {}
    if (i)
    {
        len -= i;

        // Advance the start only. The former content[i..$-i] also cut
        // i units off the tail, which could leave content shorter than
        // len (or make the slice invalid when i > $-i) if the buffer
        // had no spare capacity.
        content = content[i..$];
    }

    return this;
}
1235
1236 /***********************************************************************
1237
1238 Unescape a string of characters and write the resulting
1239 Unicode characters to the destination buffer. The following
1240 escape sequences are recognized:
1241
1242 \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
1243 \\Uhhhhhhhh 8 hex digits
1244 \\xhh 1-2 hex digits
1245 \\x{h...} 1-8 hex digits
1246 \\ooo 1-3 octal digits; o in [0-7]
1247 \\cX control-X; X is masked with 0x1F
1248
1249 as well as the standard ANSI C escapes:
1250
1251 \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
1252 \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
1253 \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
1254
1255 Anything else following a backslash is generically escaped.
1256 For example, "[a\\-z]" returns "[a-z]".
1257
1258 If an escape sequence is ill-formed, this method returns an
1259 empty string. An example of an ill-formed sequence is "\\u"
1260 followed by fewer than 4 hex digits.
1261
1262 ***********************************************************************/
1263
// Builds a new UString with every backslash escape in this text
// replaced by the character u_unescapeAt decodes for it. If any escape
// fails to decode, the result is truncated and returned at once.
final UString unEscape ()
{
    UString result = new UString (len);
    uint    pos = 0;

    while (pos < len)
    {
        dchar unit = charAt (pos++);

        // ordinary character: copy through
        if (unit != 0x005C)
        {
            result.append (unit);
            continue;
        }

        // backslash: let ICU decode the escape; it advances pos itself
        unit = u_unescapeAt (&_charAt, &pos, len, cast(void*) this);

        if (unit == 0xFFFFFFFF)
        {
            // invalid escape sequence: return empty string
            result.truncate ();
            break;
        }
        result.append (unit);
    }
    return result;
}
1286
1287 /***********************************************************************
1288
1289 Is this code point a surrogate (U+d800..U+dfff)?
1290
1291 ***********************************************************************/
1292
// True for any UTF-16 surrogate code unit, lead or trail
// (0xd800 through 0xdfff inclusive).
final static bool isSurrogate (wchar c)
{
    return c >= 0xd800 && c <= 0xdfff;
}
1297
1298 /***********************************************************************
1299
1300 Is this code unit a lead surrogate (U+d800..U+dbff)?
1301
1302 ***********************************************************************/
1303
// True for a lead (high) surrogate code unit: 0xd800 through 0xdbff
// inclusive.
final static bool isLeading (wchar c)
{
    return c >= 0xd800 && c <= 0xdbff;
}
1308
1309 /***********************************************************************
1310
1311 Is this code unit a trail surrogate (U+dc00..U+dfff)?
1312
1313 ***********************************************************************/
1314
// True for a trail (low) surrogate code unit: 0xdc00 through 0xdfff
// inclusive.
final static bool isTrailing (wchar c)
{
    return c >= 0xdc00 && c <= 0xdfff;
}
1319
1320 /***********************************************************************
1321
1322 Adjust a random-access offset to a code point boundary
1323 at the start of a code point. If the offset points to
1324 the trail surrogate of a surrogate pair, then the offset
1325 is decremented. Otherwise, it is not modified.
1326
1327 ***********************************************************************/
1328
// Returns i, or i-1 when i sits on the trail half of a surrogate pair.
// The in-contract rejects any i at or beyond len.
final uint getCharStart (uint i)
in {
   if (len <= i)
       exception ("index of out bounds");
   }
body
{
    // only a trail surrogate directly preceded by a lead surrogate moves
    if (isTrailing (content[i]))
        if (i && isLeading (content[i-1]))
            return i - 1;
    return i;
}
1340
1341 /***********************************************************************
1342
1343 Adjust a random-access offset to a code point boundary
1344 after a code point. If the offset is behind the lead
1345 surrogate of a surrogate pair, then the offset is
1346 incremented. Otherwise, it is not modified.
1347
1348 ***********************************************************************/
1349
// Returns i, or i+1 when i splits a surrogate pair (lead at i-1, trail
// at i). The in-contract rejects any i at or beyond len.
final uint getCharLimit (uint i)
in {
   if (len <= i)
       exception ("index of out bounds");
   }
body
{
    // offset zero has nothing before it, so it cannot split a pair
    if (i == 0)
        return i;

    if (isLeading (content[i-1]) && isTrailing (content[i]))
        return i + 1;
    return i;
}
1361
1362 /***********************************************************************
1363
1364 Callback for C unescapeAt() function
1365
1366 ***********************************************************************/
1367
extern (C)
{
    // callback type accepted by u_unescapeAt: fetch the code unit at
    // the given offset from an opaque context
    typedef wchar function (uint offset, void* context) CharAt;

    // the context is the object passed to u_unescapeAt by unEscape();
    // it is reinterpreted as a UString to reach charAt
    private static wchar _charAt (uint offset, void* context)
    {
        UString self = cast(UString) context;
        return self.charAt (offset);
    }
}
1377
1378 /***********************************************************************
1379
1380 Pin the given index to a valid position.
1381
1382 ***********************************************************************/
1383
// Clamps x in place so that it never exceeds len.
final private void pinIndex (inout uint x)
{
    x = (x > len) ? len : x;
}
1389
1390 /***********************************************************************
1391
1392 Pin the given index and length to a valid position.
1393
1394 ***********************************************************************/
1395
// Clamps start to len, then clamps length to what remains after start.
// Both arguments are updated in place.
final private void pinIndices (inout uint start, inout uint length)
{
    start = (start > len) ? len : start;

    uint available = len - start;
    if (length > available)
        length = available;
}
1404
1405 /***********************************************************************
1406
1407 Helper for comparison methods
1408
1409 ***********************************************************************/
1410
// Compares s1 and s2 through u_strCaseCompare with the given option and
// returns its result; testError checks the ICU status before the
// result is handed back.
final private int compareFolded (wchar[] s1, wchar[] s2, CaseOption option = CaseOption.Default)
{
    UErrorCode status;

    int order = u_strCaseCompare (s1.ptr, s1.length, s2.ptr, s2.length, option, status);
    testError (status, "compareFolded failed");
    return order;
}
1419
1420
1421 /***********************************************************************
1422
1423 Bind the ICU functions from a shared library. This is
1424 complicated by the issues regarding D and DLLs on the
1425 Windows platform
1426
1427 ***********************************************************************/
1428
// handle returned by FunctionLoader.bind in the static constructor and
// passed back to FunctionLoader.unbind in the static destructor
private static void* library = null;
1430
1431 /***********************************************************************
1432
1433 ***********************************************************************/
1434
// ICU entry points, held as function pointers that are filled in by the
// static constructor through the targets table.
private static extern (C)
{
    // searching
    wchar* function (wchar* s, uint length, wchar* sub, uint subLength)  u_strFindFirst;
    wchar* function (wchar* s, uint length, wchar* sub, uint subLength)  u_strFindLast;
    wchar* function (wchar* s, wchar c, uint count)                      u_memchr;
    wchar* function (wchar* s, wchar c, uint count)                      u_memrchr;

    // comparison
    int    function (wchar* s1, uint length1, wchar* s2, uint length2, bool codePointOrder)                     u_strCompare;
    int    function (wchar* s1, uint length1, wchar* s2, uint length2, uint options, inout UErrorCode status)   u_strCaseCompare;

    // escapes and counting
    dchar  function (CharAt reader, uint* offset, uint length, void* context)  u_unescapeAt;
    uint   function (wchar* s, uint length)                                    u_countChar32;

    // case mapping
    uint   function (wchar* dest, uint capacity, wchar* src, uint srcLength, char* locale, inout UErrorCode status)  u_strToUpper;
    uint   function (wchar* dest, uint capacity, wchar* src, uint srcLength, char* locale, inout UErrorCode status)  u_strToLower;
    uint   function (wchar* dest, uint capacity, wchar* src, uint srcLength, uint options, inout UErrorCode status)  u_strFoldCase;

    // UTF-8 conversion
    wchar* function (wchar* dest, uint capacity, uint* destLength, char* src, uint srcLength, inout UErrorCode status)   u_strFromUTF8;
    char*  function (char* dest, uint capacity, uint* destLength, wchar* src, uint srcLength, inout UErrorCode status)   u_strToUTF8;
}
1451
1452 /***********************************************************************
1453
1454 ***********************************************************************/
1455
// Binding table: pairs the address of each function pointer above with
// the symbol name passed to FunctionLoader.bind.
static FunctionLoader.Bind[] targets =
    [
    {cast(void**) &u_strFindFirst,    "u_strFindFirst"},
    {cast(void**) &u_strFindLast,     "u_strFindLast"},
    {cast(void**) &u_memchr,          "u_memchr"},
    {cast(void**) &u_memrchr,         "u_memrchr"},
    {cast(void**) &u_strCompare,      "u_strCompare"},
    {cast(void**) &u_strCaseCompare,  "u_strCaseCompare"},
    {cast(void**) &u_unescapeAt,      "u_unescapeAt"},
    {cast(void**) &u_countChar32,     "u_countChar32"},
    {cast(void**) &u_strToUpper,      "u_strToUpper"},
    {cast(void**) &u_strToLower,      "u_strToLower"},
    {cast(void**) &u_strFoldCase,     "u_strFoldCase"},
    {cast(void**) &u_strFromUTF8,     "u_strFromUTF8"},
    {cast(void**) &u_strToUTF8,       "u_strToUTF8"},
    ];
1472
1473 /***********************************************************************
1474
1475 ***********************************************************************/
1476
// Static constructor: binds every entry of targets against the icuuc
// library and keeps the returned handle for the static destructor.
static this ()
{
    library = FunctionLoader.bind (icuuc, targets);

    // self-test left disabled
    //test ();
}
1482
1483 /***********************************************************************
1484
1485 ***********************************************************************/
1486
// Static destructor: hands the handle obtained in the static
// constructor back to FunctionLoader.unbind.
static ~this ()
{
    FunctionLoader.unbind (library);
}
1491
1492 /***********************************************************************
1493
1494 ***********************************************************************/
1495
// Smoke test exercising a handful of operations; its only call site in
// the static constructor is commented out. Results are not checked.
private static void test()
{
    UString sample = new UString (r"aaaqw \uabcd eaaa");
    char[]  extra  = "dssfsdff";

    // concatenation
    sample ~ extra ~ extra;

    // indexed read and write
    wchar third = sample[3];
    sample[3] = 'Q';

    // searching, unescaping, case mapping, padding and trimming
    int found = sample.indexOf ("qwe");
    sample.unEscape ();
    sample.toUpper (new UString);
    sample.padLeading(2).padTrailing(2).trim();
}
1508 }