Mercurial > projects > dwt-addons
comparison dwtx/dwtxhelper/mangoicu/URegex.d @ 89:040da1cb0d76
Add a local copy of the mango ICU binding to work out the utf8 usability. Will hopefully go back into mango.
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sun, 22 Jun 2008 22:57:31 +0200 |
parents | |
children | 11e8159caf7a |
comparison
equal
deleted
inserted
replaced
88:cd18fa3b71f1 | 89:040da1cb0d76 |
---|---|
1 /******************************************************************************* | |
2 | |
3 @file URegex.d | |
4 | |
5 Copyright (c) 2004 Kris Bell | |
6 | |
7 This software is provided 'as-is', without any express or implied | |
8 warranty. In no event will the authors be held liable for damages | |
9 of any kind arising from the use of this software. | |
10 | |
11 Permission is hereby granted to anyone to use this software for any | |
12 purpose, including commercial applications, and to alter it and/or | |
13 redistribute it freely, subject to the following restrictions: | |
14 | |
15 1. The origin of this software must not be misrepresented; you must | |
16 not claim that you wrote the original software. If you use this | |
17 software in a product, an acknowledgment within documentation of | |
18 said product would be appreciated but is not required. | |
19 | |
20 2. Altered source versions must be plainly marked as such, and must | |
21 not be misrepresented as being the original software. | |
22 | |
23 3. This notice may not be removed or altered from any distribution | |
24 of the source. | |
25 | |
26 4. Derivative works are permitted, but they must carry this notice | |
27 in full and credit the original source. | |
28 | |
29 | |
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
31 | |
32 | |
33 @version Initial version, November 2004 | |
34 @author Kris | |
35 | |
36 Note that this package and documentation is built around the ICU | |
37 project (http://oss.software.ibm.com/icu/). Below is the license | |
38 statement as specified by that software: | |
39 | |
40 | |
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
42 | |
43 | |
44 ICU License - ICU 1.8.1 and later | |
45 | |
46 COPYRIGHT AND PERMISSION NOTICE | |
47 | |
48 Copyright (c) 1995-2003 International Business Machines Corporation and | |
49 others. | |
50 | |
51 All rights reserved. | |
52 | |
53 Permission is hereby granted, free of charge, to any person obtaining a | |
54 copy of this software and associated documentation files (the | |
55 "Software"), to deal in the Software without restriction, including | |
56 without limitation the rights to use, copy, modify, merge, publish, | |
57 distribute, and/or sell copies of the Software, and to permit persons | |
58 to whom the Software is furnished to do so, provided that the above | |
59 copyright notice(s) and this permission notice appear in all copies of | |
60 the Software and that both the above copyright notice(s) and this | |
61 permission notice appear in supporting documentation. | |
62 | |
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | |
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT | |
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR | |
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL | |
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING | |
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, | |
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION | |
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
72 | |
73 Except as contained in this notice, the name of a copyright holder | |
74 shall not be used in advertising or otherwise to promote the sale, use | |
75 or other dealings in this Software without prior written authorization | |
76 of the copyright holder. | |
77 | |
78 ---------------------------------------------------------------------- | |
79 | |
80 All trademarks and registered trademarks mentioned herein are the | |
81 property of their respective owners. | |
82 | |
83 *******************************************************************************/ | |
84 | |
85 module dwtx.dwthelper.mangoicu.URegex; | |
86 | |
87 private import dwtx.dwthelper.mangoicu.ICU; | |
88 | |
89 public import dwtx.dwthelper.mangoicu.ULocale, | |
90 dwtx.dwthelper.mangoicu.UString, | |
91 dwtx.dwthelper.mangoicu.UCollator, | |
92 dwtx.dwthelper.mangoicu.UBreakIterator; | |
93 | |
94 | |
/*******************************************************************************

        Holder for the slices produced by URegex.groups(): g0 receives
        the whole match and g1 through g9 the parenthesised groups.

        URegex.groups() fills these by stepping a pointer forward from
        &g0, so the ten fields must stay declared together, in this
        order.

*******************************************************************************/

class Groups : ICU
{
        public wchar[] g0, g1, g2, g3, g4, g5, g6, g7, g8, g9;
}
114 | |
115 /******************************************************************************* | |
116 | |
117 Apis for an engine that provides regular-expression searching of | |
118 UTF16 strings. | |
119 | |
120 See http://icu.sourceforge.net/apiref/icu4c/uregex_8h.html for full | |
121 details. | |
122 | |
123 *******************************************************************************/ | |
124 | |
125 class URegex : Groups | |
126 { | |
// Native regex handle: assigned from uregex_open() or uregex_clone()
// and handed to uregex_close() by the destructor.
private Handle handle;

// The subject text most recently given to setText(); getText(),
// groups() and split() read it back from here.
private UText  theText;
129 | |
// Match-mode flags accepted by the constructors and reported by
// getFlags(). Every non-zero value occupies its own bit.
public enum Flag
{
        // No special matching behaviour
        None            = 0x000,

        // Enable case insensitive matching
        CaseInsensitive = 0x002,

        // Allow white space and comments within patterns
        Comments        = 0x004,

        // Controls "$" and "^": when set they recognise line
        // terminators inside the string; otherwise they match
        // only at the start and end of the input string
        MultiLine       = 0x008,

        // When set, '.' also matches line terminators; otherwise
        // '.' matching stops at line end
        DotAll          = 0x020,

        // Forces normalization of pattern and strings
        CanonEq         = 0x080,

        // When set, uses the Unicode TR 29 definition of word
        // boundaries. Warning: Unicode word boundaries are quite
        // different from traditional regular expression word
        // boundaries. See http://unicode.org/reports/tr29/#Word_Boundaries
        UWord           = 0x100,
}
159 | |
/***********************************************************************

        Compile 'pattern' with the given match-mode flags and keep
        the resulting handle for later matching operations.

        'pe' is forwarded unchanged to uregex_open(). The status of
        the open call goes through testError() with the message
        "failed to open regex".

        The new regex starts out with an empty subject string; the
        status of that second call is not inspected.

***********************************************************************/

this (wchar[] pattern, Flag flags=Flag.None, ParseError* pe=null)
{
        Error status;

        handle = uregex_open (pattern.ptr, pattern.length, flags, pe, status);
        testError (status, "failed to open regex");

        uregex_setText (handle, "", 0, status);
}
177 | |
/***********************************************************************

        Convenience overload: compile the pattern held in a UText.
        Delegates to the wchar[] constructor with the text's content
        and the same flags and parse-error pointer.

***********************************************************************/

this (UText pattern, Flag flags=Flag.None, ParseError* pe=null)
{
        this (pattern.get(), flags, pe);
}
191 | |
/***********************************************************************

        Internal constructor used by clone(): adopt an existing
        native handle and give it an empty subject string. The
        status of the setText call is not inspected.

***********************************************************************/

private this (Handle handle)
{
        Error status;

        this.handle = handle;
        uregex_setText (this.handle, "", 0, status);
}
205 | |
/***********************************************************************

        Close the regular expression by passing its handle to
        uregex_close(), recovering the resources (memory) it held.

***********************************************************************/

~this ()
{
        uregex_close (this.handle);
}
217 | |
/***********************************************************************

        Create a second URegex from this one via uregex_clone().

        Cloning a regular expression is faster than opening another
        instance from the source form of the expression, and needs
        less memory.

        The current input string and the position of any matched
        text within it are not cloned; only the pattern itself and
        the match mode flags are copied.

        Cloning is particularly useful to threaded applications that
        run several match operations in parallel: each concurrent
        operation requires its own URegularExpression instance.

        The clone status goes through testError() with the message
        "failed to clone regex".

***********************************************************************/

URegex clone ()
{
        Error status;

        Handle copy = uregex_clone (handle, status);
        testError (status, "failed to clone regex");
        return new URegex (copy);
}
243 | |
/***********************************************************************

        Return the source form of the pattern for this regular
        expression, wrapped in a new UString built from the pointer
        and length reported by uregex_pattern().

***********************************************************************/

UString getPattern ()
{
        Error status;
        uint  length;

        wchar* source = uregex_pattern (handle, length, status);
        testError (status, "failed to extract regex pattern");
        return new UString (source[0..length]);
}
260 | |
/***********************************************************************

        Get the match mode flags that were specified when this
        regular expression was compiled. The raw value reported by
        uregex_flags() is cast to Flag.

***********************************************************************/

Flag getFlags ()
{
        Error status;

        uint raw = uregex_flags (handle, status);
        testError (status, "failed to get regex flags");
        return cast(Flag) raw;
}
276 | |
/***********************************************************************

        Set the subject text upon which the regular expression will
        look for matches.

        May be called any number of times, so one pattern can be
        applied to different strings.

        Matching works directly on the application's string data; no
        copy is made. The subject string data must therefore not be
        altered after this call until every regular expression
        operation involving it has completed.

        Zero length strings are permitted. In that case no subsequent
        match operation will dereference the text pointer.

        The reference to 't' is stored before the native call is
        made, so getText() returns 't' even if that call reports an
        error through testError().

***********************************************************************/

void setText (UText t)
{
        Error status;

        theText = t;

        wchar[] content = t.get;
        uregex_setText (handle, content.ptr, t.length, status);
        testError (status, "failed to set regex text");
}
304 | |
/***********************************************************************

        Get the subject text currently associated with this regular
        expression object: whatever was last supplied via setText().

        The same reference is handed back, not a copy; treat it as
        read-only.

***********************************************************************/

UText getText ()
{
        return this.theText;
}
319 | |
/***********************************************************************

        Fill g0..g9 with slices of the current subject text, one per
        group of the most recent match, and return this object viewed
        as a Groups. This can be used in the following manner:

        @code
        wchar[] msg;

        if (regex.next())
            with (regex.groups())
                  msg ~= g1 ~ ":" ~ g2;
        @endcode

        g0 represents the entire match, whereas g1 through g9
        represent the parenthesised expressions. When the pattern
        has more than nine capture groups, only the first nine are
        exposed here.

        A group that exists in the pattern but took no part in the
        match is reported by start()/end() as -1, which arrives here
        as uint.max. Such a group is set to null instead of being
        used as slice bounds.

        The slices refer to the text obtained from theText; they are
        not copies.

***********************************************************************/

Groups groups ()
{
        wchar[]* slot    = &g0;
        uint     last    = groupCount();
        wchar[]  content = theText.get();

        // only g0 through g9 exist
        if (last > 9)
            last = 9;

        for (uint i=0; i <= last; ++slot, ++i)
        {
                uint from = start (i);
                uint to   = end (i);

                // -1 ("group not matched") shows up as uint.max in a uint
                if (from == uint.max || to == uint.max)
                    *slot = null;
                else
                   *slot = content [from..to];
        }
        return this;
}
350 | |
/***********************************************************************

        Extract the text of the specified matching expression or
        subexpression into UString 's'.

        Group #0 is the complete string of matched text. Group #1 is
        the text matched by the first set of capturing parentheses.

        The extraction is driven by s.format(), which is handed a
        nested function wrapping uregex_group() together with the
        message "failed to extract regex group text".

***********************************************************************/

void group (UString s, uint index)
{
        uint extract (wchar* dst, uint capacity, inout Error status)
        {
                return uregex_group (handle, index, dst, capacity, status);
        }

        s.format (&extract, "failed to extract regex group text");
}
370 | |
/***********************************************************************

        Get the number of capturing groups in this regular
        expression's pattern, as reported by uregex_groupCount().

***********************************************************************/

uint groupCount ()
{
        Error status;

        uint total = uregex_groupCount (handle, status);
        testError (status, "failed to get regex group-count");
        return total;
}
386 | |
/***********************************************************************

        Returns the index in the input string of the start of the
        text matched by the specified capture group during the
        previous match operation.

        Group #0 refers to the complete range of matched text.
        Group #1 refers to the text matched by the first set of
        capturing parentheses.

        -1 is returned when the capture group was not part of the
        last match. Since the return type is uint, that value is
        seen by callers as uint.max.

***********************************************************************/

uint start (uint index = 0)
{
        Error status;

        uint offset = uregex_start (handle, index, status);
        testError (status, "failed to get regex start");
        return offset;
}
408 | |
/***********************************************************************

        Returns the index in the input string of the position
        following the end of the text matched by the specified
        capture group.

        Group #0 refers to the complete range of matched text.
        Group #1 refers to the text matched by the first set of
        capturing parentheses.

        -1 is returned when the capture group was not part of the
        last match. Since the return type is uint, that value is
        seen by callers as uint.max.

***********************************************************************/

uint end (uint index = 0)
{
        Error status;

        uint offset = uregex_end (handle, index, status);
        testError (status, "failed to get regex end");
        return offset;
}
430 | |
/***********************************************************************

        Reset any saved state from the previous match.

        This causes uregex_findNext to begin at 'startIndex', and
        causes uregex_start(), uregex_end() and uregex_group() to
        return an error indicating that no match information is
        available.

***********************************************************************/

void reset (uint startIndex)
{
        Error status;

        uregex_reset (handle, startIndex, status);
        testError (status, "failed to set regex next-index");
}
449 | |
/***********************************************************************

        Attempts to match the input string, beginning at startIndex,
        against the pattern.

        To succeed, the match must extend to the end of the input
        string. Contrast with probe().

***********************************************************************/

bool match (uint startIndex)
{
        Error status;

        bool matched = uregex_matches (handle, startIndex, status);
        testError (status, "failed while matching regex");
        return matched;
}
468 | |
/***********************************************************************

        Attempts to match the input string, starting from the
        specified index, against the pattern.

        The match may be of any length and is not required to extend
        to the end of the input string. Contrast with match().

***********************************************************************/

bool probe (uint startIndex)
{
        Error status;

        bool matched = uregex_lookingAt (handle, startIndex, status);
        testError (status, "failed while looking at regex");
        return matched;
}
487 | |
/***********************************************************************

        Returns whether the text matches the search pattern, starting
        from the current position.

        If startIndex is specified, the current position is moved to
        that location before the search is initiated. The default of
        uint.max stands for "not specified": uregex_findNext() is
        used in that case, uregex_find() otherwise.

***********************************************************************/

bool next (uint startIndex = uint.max)
{
        Error status;
        bool  found;

        if (startIndex == uint.max)
            found = uregex_findNext (handle, status);
        else
           found = uregex_find (handle, startIndex, status);

        testError (status, "failed on next regex");
        return found;
}
509 | |
/***********************************************************************

        Replaces every substring of the input that matches the
        pattern with the given replacement string.

        This is a convenience function providing a complete
        find-and-replace-all operation: the input is scanned for
        matches of the pattern, input that is not part of any match
        is copied unchanged to the destination buffer, and matched
        regions are replaced there by the replacement string. The
        replacement string may contain references to capture groups,
        in the form $1, $2, etc.

        'result' receives the output and should be given a length
        sufficient to house the entire result beforehand (set the
        initial length using the UString method truncate()). On
        completion 'result' is passed to truncate() with the length
        reported by the native call, and that length is returned.

        The returned extent should be checked to ensure it is not
        longer than the length of 'result'. If it is longer, the
        result has been truncated.

        NOTE(review): ICU reports an undersized destination through
        a buffer-overflow status. Whether testError() lets that
        status through, so that the check above can actually be
        made by the caller, is not visible from here -- confirm.

***********************************************************************/

uint replaceAll (UText replace, UString result)
{
        Error status;

        uint extent = uregex_replaceAll (handle,
                                         replace.get.ptr, replace.length,
                                         result.get.ptr,  result.length,
                                         status);
        testError (status, "failed during regex replace");

        result.truncate (extent);
        return extent;
}
547 | |
/***********************************************************************

        Replaces the first substring of the input that matches the
        pattern with the given replacement string.

        This is a convenience function providing a complete
        find-and-replace operation: the input is scanned for a match
        of the pattern, all input that is not part of the match is
        copied unchanged to the destination buffer, and the matched
        region is replaced there by the replacement string. The
        replacement string may contain references to capture groups,
        in the form $1, $2, etc.

        'result' receives the output and should be given a length
        sufficient to house the entire result beforehand (set the
        initial length using the UString method truncate()). On
        completion 'result' is passed to truncate() with the length
        reported by the native call, and that length is returned.

        The returned extent should be checked to ensure it is not
        longer than the length of 'result'. If it is longer, the
        result has been truncated.

        NOTE(review): ICU reports an undersized destination through
        a buffer-overflow status. Whether testError() lets that
        status through, so that the check above can actually be
        made by the caller, is not visible from here -- confirm.

***********************************************************************/

uint replaceFirst (UText replace, UString result)
{
        Error status;

        uint extent = uregex_replaceFirst (handle,
                                           replace.get.ptr, replace.length,
                                           result.get.ptr,  result.length,
                                           status);
        testError (status, "failed during regex replace");

        result.truncate (extent);
        return extent;
}
585 | |
/***********************************************************************

        Split the text up into slices (fields), where each slice
        represents the text situated between pattern matches within
        the text. The pattern is expected to represent one or more
        slice delimiters.

        Slices are stored into 'fields' from index 0 upward and the
        number stored is returned; at most fields.length slices are
        produced, and anything beyond that is silently left out.

        A delimiter found at offset 0 does not produce an empty
        leading field. The text following the final delimiter is
        stored as the last field when it is non-empty and there is
        room for it; text containing no delimiter at all therefore
        yields a single field holding the whole text.

        Matching proceeds with uregex_findNext(), i.e. from the
        regex's current position, while the first slice always
        begins at offset 0 of the text.

        The slices refer to the text obtained from theText; they are
        not copies.

***********************************************************************/

uint split (wchar[][] fields)
{
        Error   e;
        uint    pos,
                count;
        wchar[] content = theText.get;

        while (count < fields.length)
        {
                // stop on "no further match" as well as on any
                // status other than OK
                if (! uregex_findNext (handle, e) || e != e.OK)
                      break;

                uint i = start ();
                fields[count] = content[pos..i];
                pos = end ();

                // ignore leading delimiter
                if (i)
                    ++count;
        }

        testError (e, "failed during split");

        // the text after the last delimiter is a field as well
        if (count < fields.length && pos < content.length)
            fields[count++] = content[pos..content.length];

        return count;
}
619 | |
620 | |
/***********************************************************************

        The ICU functions are bound from a shared library at module
        start-up; this is complicated by the issues regarding D and
        DLLs on the Windows platform.

        'library' keeps the value returned by FunctionLoader.bind()
        so the static destructor can pass it to
        FunctionLoader.unbind().

***********************************************************************/

private static void* library;
630 | |
/***********************************************************************

        C-linkage function pointers for the uregex_* entry points.
        Each one is listed by address in 'targets' and filled in by
        FunctionLoader.bind() from the static constructor.

***********************************************************************/

private static extern (C)
{
        // lifetime and introspection
        Handle  function (wchar*, uint, uint, ParseError*, inout Error)         uregex_open;
        void    function (Handle)                                               uregex_close;
        Handle  function (Handle, inout Error)                                  uregex_clone;
        wchar*  function (Handle, inout uint, inout Error)                      uregex_pattern;
        uint    function (Handle, inout Error)                                  uregex_flags;

        // subject text
        void    function (Handle, wchar*, uint, inout Error)                    uregex_setText;
        wchar*  function (Handle, inout uint, inout Error)                      uregex_getText;

        // match results
        uint    function (Handle, uint, wchar*, uint, inout Error)              uregex_group;
        uint    function (Handle, inout Error)                                  uregex_groupCount;
        uint    function (Handle, uint, inout Error)                            uregex_start;
        uint    function (Handle, uint, inout Error)                            uregex_end;

        // matching
        void    function (Handle, uint, inout Error)                            uregex_reset;
        bool    function (Handle, uint, inout Error)                            uregex_matches;
        bool    function (Handle, uint, inout Error)                            uregex_lookingAt;
        bool    function (Handle, uint, inout Error)                            uregex_find;
        bool    function (Handle, inout Error)                                  uregex_findNext;

        // replacement
        uint    function (Handle, wchar*, uint, wchar*, uint, inout Error)      uregex_replaceAll;
        uint    function (Handle, wchar*, uint, wchar*, uint, inout Error)      uregex_replaceFirst;
}
656 | |
/***********************************************************************

        Binding table handed to FunctionLoader.bind(): each entry
        pairs the address of one of the function pointers declared
        above with the name of the symbol it is to be bound to.

***********************************************************************/

static FunctionLoader.Bind[] targets =
        [
        {cast(void**) &uregex_open,             "uregex_open"},
        {cast(void**) &uregex_close,            "uregex_close"},
        {cast(void**) &uregex_clone,            "uregex_clone"},
        {cast(void**) &uregex_pattern,          "uregex_pattern"},
        {cast(void**) &uregex_flags,            "uregex_flags"},
        {cast(void**) &uregex_setText,          "uregex_setText"},
        {cast(void**) &uregex_getText,          "uregex_getText"},
        {cast(void**) &uregex_group,            "uregex_group"},
        {cast(void**) &uregex_groupCount,       "uregex_groupCount"},
        {cast(void**) &uregex_start,            "uregex_start"},
        {cast(void**) &uregex_end,              "uregex_end"},
        {cast(void**) &uregex_reset,            "uregex_reset"},
        {cast(void**) &uregex_matches,          "uregex_matches"},
        {cast(void**) &uregex_lookingAt,        "uregex_lookingAt"},
        {cast(void**) &uregex_find,             "uregex_find"},
        {cast(void**) &uregex_findNext,         "uregex_findNext"},
        {cast(void**) &uregex_replaceAll,       "uregex_replaceAll"},
        {cast(void**) &uregex_replaceFirst,     "uregex_replaceFirst"},
        ];
682 | |
/***********************************************************************

        Static constructor: bind every entry of 'targets' through
        FunctionLoader.bind(), using the library identified by
        'icuin', and remember the returned value in 'library'.

***********************************************************************/

static this ()
{
        library = FunctionLoader.bind (icuin, targets);
}
691 | |
/***********************************************************************

        Static destructor: hand the value saved in 'library' by the
        static constructor back to FunctionLoader.unbind().

***********************************************************************/

static ~this ()
{
        FunctionLoader.unbind (library);
}
700 } |