comparison dwtx/dwtxhelper/mangoicu/URegex.d @ 89:040da1cb0d76

Add a local copy of the mango ICU binding to work out the utf8 usability. Will hopefully go back into mango.
author Frank Benoit <benoit@tionex.de>
date Sun, 22 Jun 2008 22:57:31 +0200
parents
children 11e8159caf7a
comparison
equal deleted inserted replaced
88:cd18fa3b71f1 89:040da1cb0d76
1 /*******************************************************************************
2
3 @file URegex.d
4
5 Copyright (c) 2004 Kris Bell
6
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the authors be held liable for damages
9 of any kind arising from the use of this software.
10
11 Permission is hereby granted to anyone to use this software for any
12 purpose, including commercial applications, and to alter it and/or
13 redistribute it freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must
16 not claim that you wrote the original software. If you use this
17 software in a product, an acknowledgment within documentation of
18 said product would be appreciated but is not required.
19
20 2. Altered source versions must be plainly marked as such, and must
21 not be misrepresented as being the original software.
22
23 3. This notice may not be removed or altered from any distribution
24 of the source.
25
26 4. Derivative works are permitted, but they must carry this notice
27 in full and credit the original source.
28
29
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31
32
33 @version Initial version, November 2004
34 @author Kris
35
36 Note that this package and documentation is built around the ICU
37 project (http://oss.software.ibm.com/icu/). Below is the license
38 statement as specified by that software:
39
40
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42
43
44 ICU License - ICU 1.8.1 and later
45
46 COPYRIGHT AND PERMISSION NOTICE
47
48 Copyright (c) 1995-2003 International Business Machines Corporation and
49 others.
50
51 All rights reserved.
52
53 Permission is hereby granted, free of charge, to any person obtaining a
54 copy of this software and associated documentation files (the
55 "Software"), to deal in the Software without restriction, including
56 without limitation the rights to use, copy, modify, merge, publish,
57 distribute, and/or sell copies of the Software, and to permit persons
58 to whom the Software is furnished to do so, provided that the above
59 copyright notice(s) and this permission notice appear in all copies of
60 the Software and that both the above copyright notice(s) and this
61 permission notice appear in supporting documentation.
62
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
72
73 Except as contained in this notice, the name of a copyright holder
74 shall not be used in advertising or otherwise to promote the sale, use
75 or other dealings in this Software without prior written authorization
76 of the copyright holder.
77
78 ----------------------------------------------------------------------
79
80 All trademarks and registered trademarks mentioned herein are the
81 property of their respective owners.
82
83 *******************************************************************************/
84
85 module dwtx.dwthelper.mangoicu.URegex;
86
87 private import dwtx.dwthelper.mangoicu.ICU;
88
89 public import dwtx.dwthelper.mangoicu.ULocale,
90 dwtx.dwthelper.mangoicu.UString,
91 dwtx.dwthelper.mangoicu.UCollator,
92 dwtx.dwthelper.mangoicu.UBreakIterator;
93
94
95 /*******************************************************************************
96
97 Set of slices to return for group matching. See URegex.groups()
98
99 *******************************************************************************/
100
class Groups : ICU
{
        // g0 receives the entire match; g1 through g9 receive the
        // parenthesised sub-expressions. URegex.groups() fills these by
        // stepping a pointer from g0 onwards, so the declaration order
        // must remain g0, g1, ... g9.
        public wchar[] g0;
        public wchar[] g1;
        public wchar[] g2;
        public wchar[] g3;
        public wchar[] g4;
        public wchar[] g5;
        public wchar[] g6;
        public wchar[] g7;
        public wchar[] g8;
        public wchar[] g9;
}
114
115 /*******************************************************************************
116
117 Apis for an engine that provides regular-expression searching of
118 UTF16 strings.
119
120 See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full
121 details.
122
123 *******************************************************************************/
124
125 class URegex : Groups
126 {
private
{
        // regex object obtained from uregex_open() or uregex_clone()
        Handle handle;

        // subject text most recently supplied through setText()
        UText  theText;
}
129
130 // Regex modes
public enum Flag
{
        // No special matching behaviour
        None            = 0,

        // Match without regard to letter case
        CaseInsensitive = 1 << 1,

        // Permit white space and comments inside the pattern
        Comments        = 1 << 2,

        // When set, "^" and "$" also recognise line terminators
        // inside the text; when clear they match only at the very
        // start and end of the input
        MultiLine       = 1 << 3,

        // When set, '.' matches line terminators too; when clear,
        // '.' stops matching at the end of a line
        DotAll          = 1 << 5,

        // Force normalization of both pattern and text
        CanonEq         = 1 << 7,

        // Use the Unicode TR 29 definition of word boundaries.
        // Warning: these differ considerably from traditional
        // regular-expression word boundaries. See
        // http://unicode.org/reports/tr29/#Word_Boundaries
        UWord           = 1 << 8,
}
159
160 /***********************************************************************
161
162 Compiles the regular expression in string form into an
163 internal representation using the specified match mode
164 flags. The resulting regular expression handle can then
165 be used to perform various matching operations.
166
167 ***********************************************************************/
168
this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null)
{
        Error e;

        // Compile the pattern. 'pe' is forwarded untouched so the
        // caller can inspect parse-error details when one is supplied.
        handle = uregex_open (pattern.ptr, pattern.length, flags, pe, e);
        testError (e, "failed to open regex");

        // Start out bound to an empty subject so the regex is usable
        // before setText() is called. The status of this call used to
        // be dropped silently; report it like every other ICU call.
        uregex_setText (handle, "", 0, e);
        testError (e, "failed to set regex text");
}
177
178 /***********************************************************************
179
180 Compiles the regular expression in string form into an
181 internal representation using the specified match mode
182 flags. The resulting regular expression handle can then
183 be used to perform various matching operations.
184
185 ***********************************************************************/
186
this (UText pattern, Flag flags=Flag.None, ParseError* pe=null)
{
        // delegate to the wchar[] constructor using the UText content
        this (pattern.get(), flags, pe);
}
191
192 /***********************************************************************
193
194 Internal constructor; used for cloning
195
196 ***********************************************************************/
197
private this (Handle handle)
{
        Error e;

        // Adopt an already-created regex object (see clone()).
        this.handle = handle;

        // Bind an empty subject, as the public constructor does. The
        // status of this call used to be ignored; check it so a failure
        // is reported instead of leaving a half-initialised instance.
        uregex_setText (handle, "", 0, e);
        testError (e, "failed to set regex text");
}
205
206 /***********************************************************************
207
208 Close the regular expression, recovering all resources (memory)
209 it was holding
210
211 ***********************************************************************/
212
~this ()
{
        // hand the regex object back to ICU
        Handle doomed = handle;
        uregex_close (doomed);
}
217
218 /***********************************************************************
219
220 Cloning a regular expression is faster than opening a second
221 instance from the source form of the expression, and requires
222 less memory.
223
224 Note that the current input string and the position of any
225 matched text within it are not cloned; only the pattern itself
226 and the match mode flags are copied.
227
228 Cloning can be particularly useful to threaded applications
229 that perform multiple match operations in parallel. Each
230 concurrent RE operation requires its own instance of a
231 URegularExpression.
232
233 ***********************************************************************/
234
URegex clone ()
{
        Error status;

        // duplicate the compiled pattern, then wrap the duplicate via
        // the private constructor
        Handle duplicate = uregex_clone (handle, status);
        testError (status, "failed to clone regex");

        URegex copy = new URegex (duplicate);
        return copy;
}
243
244 /***********************************************************************
245
246 Return a copy of the source form of the pattern for this
247 regular expression
248
249 ***********************************************************************/
250
UString getPattern ()
{
        Error status;
        uint  extent;

        // ICU reports the pattern as a pointer plus a length
        wchar* source = uregex_pattern (handle, extent, status);
        testError (status, "failed to extract regex pattern");

        // wrap the pointer/length pair as a slice for UString
        wchar[] text = source [0..extent];
        return new UString (text);
}
260
261 /***********************************************************************
262
263 Get the match mode flags that were specified when compiling
264 this regular expression
265
266 ***********************************************************************/
267
Flag getFlags ()
{
        Error status;

        // ICU hands the flags back as a plain integer
        uint raw = uregex_flags (handle, status);
        testError (status, "failed to get regex flags");
        return cast(Flag) raw;
}
276
277 /***********************************************************************
278
279 Set the subject text string upon which the regular expression
280 will look for matches.
281
282 This function may be called any number of times, allowing the
283 regular expression pattern to be applied to different strings.
284
285 Regular expression matching operations work directly on the
286 application's string data. No copy is made. The subject string
287 data must not be altered after calling this function until after
288 all regular expression operations involving this string data are
289 completed.
290
291 Zero length strings are permitted. In this case, no subsequent
292 match operation will dereference the text string pointer.
293
294 ***********************************************************************/
295
void setText (UText t)
{
        Error e;

        // Zero-length text is documented as permitted, but an empty D
        // array may carry a null pointer. ICU's uregex_setText is
        // understood to reject a null text pointer (TODO confirm against
        // the ICU version in use), so hand it the same empty literal the
        // constructors use.
        if (t.length)
            uregex_setText (handle, t.get.ptr, t.length, e);
        else
            uregex_setText (handle, "", 0, e);

        testError (e, "failed to set regex text");

        // Remember the subject only once ICU has accepted it. Otherwise
        // a failed call would leave getText(), groups() and split()
        // referring to text the regex is not bound to.
        theText = t;
}
304
305 /***********************************************************************
306
307 Get the subject text that is currently associated with this
308 regular expression object. This simply returns whatever was
309 previously supplied via setText().
310
311 Note that this returns a read-only reference to the text.
312
313 ***********************************************************************/
314
UText getText ()
{
        // simply the reference stored by setText(); nothing is copied
        UText current = theText;
        return current;
}
319
320 /***********************************************************************
321
322 Return a set of slices representing the parenthesised groups.
323 This can be used in the following manner:
324
325 @code
326 wchar[] msg;
327
328 if (regex.next())
329 with (regex.groups())
330 msg ~= g1 ~ ":" ~ g2;
331 @endcode
332
333 Note that g0 represents the entire match, whereas g1 through
334 g9 represent the parenthesised expressions.
335
336 ***********************************************************************/
337
Groups groups ()
{
        // Fills g0..g9 (inherited from Groups) with slices of the
        // current subject text and returns this object so callers can
        // write "with (regex.groups())".
        //
        // g0 is the whole match; g1..g9 are the first nine capture
        // groups. A slot is set to null when its group does not exist
        // in the pattern or did not take part in the last match.

        wchar[]* p = &g0;
        uint     count = groupCount();
        wchar[]  content;

        // theText stays null until setText() is called; do not
        // dereference it in that case
        if (theText !is null)
            content = theText.get();

        // only g1..g9 are available, however many groups exist
        if (count > 9)
            count = 9;

        // Walk all ten slots, not just 0..count, so slices left over
        // from an earlier match never survive. As in the original code,
        // the pointer walk relies on g0..g9 being laid out consecutively
        // in declaration order.
        for (uint i=0; i <= 9; ++p, ++i)
            {
            if (i > count)
                *p = null;
            else
               {
               uint from = start (i);
               uint to   = end (i);

               // start()/end() report -1 (uint.max here) for a group
               // that was not part of the match; slicing with that
               // value would be out of bounds
               if (from == uint.max || to == uint.max)
                   *p = null;
               else
                   *p = content [from..to];
               }
            }
        return this;
}
350
351 /***********************************************************************
352
353 Extract the string for the specified matching expression or
354 subexpression. UString 's' is the destination for the match.
355
356 Group #0 is the complete string of matched text. Group #1 is
357 the text matched by the first set of capturing parentheses.
358
359 ***********************************************************************/
360
void group (UString s, uint index)
{
        // callback given to UString.format(): asks ICU to copy the text
        // of group 'index' into the supplied buffer and returns the
        // value ICU reports
        uint extract (wchar* buffer, uint capacity, inout Error status)
        {
                return uregex_group (handle, index, buffer, capacity, status);
        }

        s.format (&extract, "failed to extract regex group text");
}
370
371 /***********************************************************************
372
373 Get the number of capturing groups in this regular
374 expression's pattern
375
376 ***********************************************************************/
377
uint groupCount ()
{
        Error status;

        // number of capturing groups in the compiled pattern
        uint total = uregex_groupCount (handle, status);
        testError (status, "failed to get regex group-count");
        return total;
}
386
387 /***********************************************************************
388
389 Returns the index in the input string of the start of the
390 text matched by the specified capture group during the
391 previous match operation.
392
393 Return -1 if the capture group was not part of the last
394 match. Group #0 refers to the complete range of matched
395 text. Group #1 refers to the text matched by the first
396 set of capturing parentheses
397
398 ***********************************************************************/
399
uint start (uint index = 0)
{
        Error status;

        // Note: the documented "-1" for a group that did not take part
        // in the match arrives here as uint.max, since the result is
        // unsigned.
        uint offset = uregex_start (handle, index, status);
        testError (status, "failed to get regex start");
        return offset;
}
408
409 /***********************************************************************
410
411 Returns the index in the input string of the position
412 following the end of the text matched by the specified
413 capture group.
414
415 Return -1 if the capture group was not part of the last
416 match. Group #0 refers to the complete range of matched
417 text. Group #1 refers to the text matched by the first
418 set of capturing parentheses.
419
420 ***********************************************************************/
421
uint end (uint index = 0)
{
        Error status;

        // Note: the documented "-1" for a group that did not take part
        // in the match arrives here as uint.max, since the result is
        // unsigned.
        uint offset = uregex_end (handle, index, status);
        testError (status, "failed to get regex end");
        return offset;
}
430
431 /***********************************************************************
432
433 Reset any saved state from the previous match.
434
435 Has the effect of causing uregex_findNext to begin at the
436 specified index, and causing uregex_start(), uregex_end()
437 and uregex_group() to return an error indicating that there
438 is no match information available.
439
440 ***********************************************************************/
441
void reset (uint startIndex)
{
        Error status;

        // discard saved match state; the next search begins at startIndex
        uregex_reset (handle, startIndex, status);
        testError (status, "failed to set regex next-index");
}
449
450 /***********************************************************************
451
452 Attempts to match the input string, beginning at startIndex,
453 against the pattern.
454
455 To succeed, the match must extend to the end of the input
456 string
457
458 ***********************************************************************/
459
bool match (uint startIndex)
{
        Error status;

        // whole-input match: must reach the end of the text
        bool matched = uregex_matches (handle, startIndex, status);
        testError (status, "failed while matching regex");
        return matched;
}
468
469 /***********************************************************************
470
471 Attempts to match the input string, starting from the
472 specified index, against the pattern.
473
474 The match may be of any length, and is not required to
475 extend to the end of the input string. Contrast with match()
476
477 ***********************************************************************/
478
bool probe (uint startIndex)
{
        Error status;

        // prefix match: need not reach the end of the text
        bool matched = uregex_lookingAt (handle, startIndex, status);
        testError (status, "failed while looking at regex");
        return matched;
}
487
488 /***********************************************************************
489
490 Returns whether the text matches the search pattern, starting
491 from the current position.
492
493 If startIndex is specified, the current position is moved to
494 the specified location before the search is initiated.
495
496 ***********************************************************************/
497
bool next (uint startIndex = uint.max)
{
        Error status;
        bool  found;

        // uint.max is the "no index given" marker: continue from the
        // current position. Any other value restarts the search there.
        if (startIndex == uint.max)
            found = uregex_findNext (handle, status);
        else
            found = uregex_find (handle, startIndex, status);

        testError (status, "failed on next regex");
        return found;
}
509
510 /***********************************************************************
511
512 Replaces every substring of the input that matches the pattern
513 with the given replacement string.
514
515 This is a convenience function that provides a complete
516 find-and-replace-all operation.
517
518 This method scans the input string looking for matches of
519 the pattern. Input that is not part of any match is copied
520 unchanged to the destination buffer. Matched regions are
521 replaced in the output buffer by the replacement string.
522 The replacement string may contain references to capture
523 groups; these take the form of $1, $2, etc.
524
525 The provided 'result' will contain the results, and should
526 be set with a length sufficient to house the entire result.
527 Upon completion, the 'result' is shortened appropriately
528 and the total extent (length) of the operation is returned.
529 Set the initial length of 'result' using the UString method
530 truncate().
531
532 The returned extent should be checked to ensure it is not
533 longer than the length of 'result'. If it is longer, then
534 the result has been truncated.
535
536 ***********************************************************************/
537
uint replaceAll (UText replace, UString result)
{
        Error status;

        // replacement text, and the caller-sized destination buffer
        wchar[] substitute = replace.get;
        wchar[] target     = result.get;

        uint extent = uregex_replaceAll (handle, substitute.ptr, replace.length,
                                         target.ptr, result.length, status);
        testError (status, "failed during regex replace");

        // cut 'result' to the extent ICU reported, and report it too
        result.truncate (extent);
        return extent;
}
547
548 /***********************************************************************
549
550 Replaces the first substring of the input that matches the
551 pattern with the given replacement string.
552
553 This is a convenience function that provides a complete
554 find-and-replace operation.
555
556 This method scans the input string looking for a match of
557 the pattern. All input that is not part of the match is
558 copied unchanged to the destination buffer. The matched
559 region is replaced in the output buffer by the replacement
560 string. The replacement string may contain references to
561 capture groups; these take the form of $1, $2, etc
562
563 The provided 'result' will contain the results, and should
564 be set with a length sufficient to house the entire result.
565 Upon completion, the 'result' is shortened appropriately
566 and the total extent (length) of the operation is returned.
567 Set the initial length of 'result' using the UString method
568 truncate().
569
570 The returned extent should be checked to ensure it is not
571 longer than the length of 'result'. If it is longer, then
572 the result has been truncated.
573
574 ***********************************************************************/
575
uint replaceFirst (UText replace, UString result)
{
        Error status;

        // replacement text, and the caller-sized destination buffer
        wchar[] substitute = replace.get;
        wchar[] target     = result.get;

        uint extent = uregex_replaceFirst (handle, substitute.ptr, replace.length,
                                           target.ptr, result.length, status);
        testError (status, "failed during regex replace");

        // cut 'result' to the extent ICU reported, and report it too
        result.truncate (extent);
        return extent;
}
585
586 /***********************************************************************
587
588 Split the text up into slices (fields), where each slice
589 represents the text situated between each pattern matched
590 within the text. The pattern is expected to represent one
591 or more slice delimiters.
592
593 ***********************************************************************/
594
uint split (wchar[][] fields)
{
        // Slices the current subject text around each match of the
        // pattern, storing up to fields.length slices in 'fields' and
        // returning how many were stored. The slices alias the subject
        // text; nothing is copied. Searching continues from the current
        // match position of the regex.

        Error   e;
        uint    pos,
                count;
        wchar[] content = theText.get;

        while (count < fields.length)
               if (uregex_findNext (handle, e) && e == e.OK)
                  {
                  uint i = start();
                  fields[count] = content[pos..i];
                  pos = end ();

                  // ignore leading delimiter: a match at offset zero has
                  // nothing in front of it, so its (empty) field is not
                  // counted
                  if (i)
                      ++count;
                  }
               else
                  break;

        testError (e, "failed during split");

        // The text following the final delimiter is a field too.
        // Previously it was silently discarded, so "a,b,c" split on ","
        // produced only "a" and "b". An empty tail (text ending in a
        // delimiter) is skipped, mirroring the leading-delimiter rule.
        if (count < fields.length && pos < content.length)
            fields[count++] = content[pos..content.length];

        return count;
}
619
620
621 /***********************************************************************
622
623 Bind the ICU functions from a shared library. This is
624 complicated by the issues regarding D and DLLs on the
625 Windows platform
626
627 ***********************************************************************/
628
// value returned by FunctionLoader.bind(); handed to
// FunctionLoader.unbind() in the static destructor
private static void* library = null;
630
631 /***********************************************************************
632
633 ***********************************************************************/
634
private static extern (C)
{
        // Function pointers filled in by FunctionLoader.bind() from the
        // 'targets' table. Every call that can fail reports its status
        // through the trailing 'inout Error' argument.

        // creation, destruction, introspection
        Handle  function (wchar*, uint, uint, ParseError*, inout Error)         uregex_open;
        void    function (Handle)                                               uregex_close;
        Handle  function (Handle, inout Error)                                  uregex_clone;
        wchar*  function (Handle, inout uint, inout Error)                      uregex_pattern;
        uint    function (Handle, inout Error)                                  uregex_flags;

        // subject text
        void    function (Handle, wchar*, uint, inout Error)                    uregex_setText;
        wchar*  function (Handle, inout uint, inout Error)                      uregex_getText;

        // match results
        uint    function (Handle, uint, wchar*, uint, inout Error)              uregex_group;
        uint    function (Handle, inout Error)                                  uregex_groupCount;
        uint    function (Handle, uint, inout Error)                            uregex_start;
        uint    function (Handle, uint, inout Error)                            uregex_end;

        // searching
        void    function (Handle, uint, inout Error)                            uregex_reset;
        bool    function (Handle, uint, inout Error)                            uregex_matches;
        bool    function (Handle, uint, inout Error)                            uregex_lookingAt;
        bool    function (Handle, uint, inout Error)                            uregex_find;
        bool    function (Handle, inout Error)                                  uregex_findNext;

        // replacement
        uint    function (Handle, wchar*, uint, wchar*, uint, inout Error)      uregex_replaceAll;
        uint    function (Handle, wchar*, uint, wchar*, uint, inout Error)      uregex_replaceFirst;
}
656
657 /***********************************************************************
658
659 ***********************************************************************/
660
// Pairs each function pointer declared above with the symbol name
// passed to FunctionLoader.bind() in the static constructor.
static FunctionLoader.Bind[] targets =
[
        {cast(void**) &uregex_open,         "uregex_open"},
        {cast(void**) &uregex_close,        "uregex_close"},
        {cast(void**) &uregex_clone,        "uregex_clone"},
        {cast(void**) &uregex_pattern,      "uregex_pattern"},
        {cast(void**) &uregex_flags,        "uregex_flags"},
        {cast(void**) &uregex_setText,      "uregex_setText"},
        {cast(void**) &uregex_getText,      "uregex_getText"},
        {cast(void**) &uregex_group,        "uregex_group"},
        {cast(void**) &uregex_groupCount,   "uregex_groupCount"},
        {cast(void**) &uregex_start,        "uregex_start"},
        {cast(void**) &uregex_end,          "uregex_end"},
        {cast(void**) &uregex_reset,        "uregex_reset"},
        {cast(void**) &uregex_matches,      "uregex_matches"},
        {cast(void**) &uregex_lookingAt,    "uregex_lookingAt"},
        {cast(void**) &uregex_find,         "uregex_find"},
        {cast(void**) &uregex_findNext,     "uregex_findNext"},
        {cast(void**) &uregex_replaceAll,   "uregex_replaceAll"},
        {cast(void**) &uregex_replaceFirst, "uregex_replaceFirst"},
];
682
683 /***********************************************************************
684
685 ***********************************************************************/
686
static this ()
{
        // bind every entry of 'targets' against the library named by
        // 'icuin', keeping the result for the static destructor
        void* bound = FunctionLoader.bind (icuin, targets);
        library = bound;
}
691
692 /***********************************************************************
693
694 ***********************************************************************/
695
static ~this ()
{
        // release what the static constructor obtained from bind()
        void* bound = library;
        FunctionLoader.unbind (bound);
}
700 }