92
|
1 /*******************************************************************************
|
|
2
|
|
3 @file USearch.d
|
|
4
|
|
5 Copyright (c) 2004 Kris Bell
|
|
6
|
|
7 This software is provided 'as-is', without any express or implied
|
|
8 warranty. In no event will the authors be held liable for damages
|
|
9 of any kind arising from the use of this software.
|
|
10
|
|
11 Permission is hereby granted to anyone to use this software for any
|
|
12 purpose, including commercial applications, and to alter it and/or
|
|
13 redistribute it freely, subject to the following restrictions:
|
|
14
|
|
15 1. The origin of this software must not be misrepresented; you must
|
|
16 not claim that you wrote the original software. If you use this
|
|
17 software in a product, an acknowledgment within documentation of
|
|
18 said product would be appreciated but is not required.
|
|
19
|
|
20 2. Altered source versions must be plainly marked as such, and must
|
|
21 not be misrepresented as being the original software.
|
|
22
|
|
23 3. This notice may not be removed or altered from any distribution
|
|
24 of the source.
|
|
25
|
|
26 4. Derivative works are permitted, but they must carry this notice
|
|
27 in full and credit the original source.
|
|
28
|
|
29
|
|
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
31
|
|
32
|
|
33 @version Initial version, November 2004
|
|
34 @author Kris
|
|
35
|
|
36 Note that this package and documentation is built around the ICU
|
|
37 project (http://oss.software.ibm.com/icu/). Below is the license
|
|
38 statement as specified by that software:
|
|
39
|
|
40
|
|
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
42
|
|
43
|
|
44 ICU License - ICU 1.8.1 and later
|
|
45
|
|
46 COPYRIGHT AND PERMISSION NOTICE
|
|
47
|
|
48 Copyright (c) 1995-2003 International Business Machines Corporation and
|
|
49 others.
|
|
50
|
|
51 All rights reserved.
|
|
52
|
|
53 Permission is hereby granted, free of charge, to any person obtaining a
|
|
54 copy of this software and associated documentation files (the
|
|
55 "Software"), to deal in the Software without restriction, including
|
|
56 without limitation the rights to use, copy, modify, merge, publish,
|
|
57 distribute, and/or sell copies of the Software, and to permit persons
|
|
58 to whom the Software is furnished to do so, provided that the above
|
|
59 copyright notice(s) and this permission notice appear in all copies of
|
|
60 the Software and that both the above copyright notice(s) and this
|
|
61 permission notice appear in supporting documentation.
|
|
62
|
|
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
|
|
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
|
|
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
|
|
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
|
|
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
|
|
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
72
|
|
73 Except as contained in this notice, the name of a copyright holder
|
|
74 shall not be used in advertising or otherwise to promote the sale, use
|
|
75 or other dealings in this Software without prior written authorization
|
|
76 of the copyright holder.
|
|
77
|
|
78 ----------------------------------------------------------------------
|
|
79
|
|
80 All trademarks and registered trademarks mentioned herein are the
|
|
81 property of their respective owners.
|
|
82
|
|
83 *******************************************************************************/
|
|
84
|
|
85 module dwtx.dwtxhelper.mangoicu.USearch;
|
|
86
|
|
87 private import dwtx.dwtxhelper.mangoicu.ICU;
|
|
88
|
|
89 public import dwtx.dwtxhelper.mangoicu.ULocale,
|
|
90 dwtx.dwtxhelper.mangoicu.UString,
|
|
91 dwtx.dwtxhelper.mangoicu.UCollator,
|
|
92 dwtx.dwtxhelper.mangoicu.UBreakIterator;
|
|
93
|
|
94 /*******************************************************************************
|
|
95
|
|
96 Apis for an engine that provides language-sensitive text
|
|
97 searching based on the comparison rules defined in a UCollator
|
|
98 data struct. This ensures that language eccentricity can be handled,
|
|
99 e.g. for the German collator, characters ß and SS will be matched
|
|
100 if case is chosen to be ignored. See the "ICU Collation Design
|
|
101 Document" for more information.
|
|
102
|
|
103 The algorithm implemented is a modified form of the Boyer-Moore
|
|
104 search. See "Efficient Text Searching in Java",
|
|
105 published in Java Report in February, 1999, for further information
|
|
106 on the algorithm.
|
|
107
|
|
108 There are 2 match options for selection: Let S' be the sub-string
|
|
109 of a text string S between the offsets start and end <start, end>. A
|
|
110 pattern string P matches a text string S at the offsets <start, end> if
|
|
111
|
|
112 - option 1. Some canonical equivalent of P matches some canonical
|
|
113 equivalent of S'
|
|
114
|
|
115 - option 2. P matches S' and if P starts or ends with a combining
|
|
116 mark, there exists no non-ignorable combining mark before
|
|
117 or after S' in S respectively.
|
|
118
|
|
119 Option 2 will be the default
|
|
120
|
|
121 This search has APIs similar to that of other text iteration
|
|
122 mechanisms such as the break iterators in ubrk.h. Using these
|
|
123 APIs, it is easy to scan through text looking for all occurrences
|
|
124 of a given pattern. This search iterator allows changing of
|
|
125 direction by calling a reset followed by a next or previous.
|
|
126 Though a direction change can occur without calling reset first,
|
|
127 this operation comes with some speed penalty. Generally, match
|
|
128 results in the forward direction will match the result matches
|
|
129 in the backwards direction in the reverse order
|
|
130
|
|
131 USearch provides APIs to specify the starting position within
|
|
132 the text string to be searched, e.g. setOffset(), previous(x)
|
|
133 and next(x). Since the starting position will be set as it
|
|
134 is specified, please take note that there are some dangerous
|
|
135 positions which the search may render incorrect results:
|
|
136
|
|
137 - The midst of a substring that requires normalization.
|
|
138
|
|
139 - If the following match is to be found, the position should
|
|
140 not be the second character which requires to be swapped
|
|
141 with the preceding character. Vice versa, if the preceding
|
|
142 match is to be found, position to search from should not be
|
|
143 the first character which requires to be swapped with the
|
|
144 next character. E.g certain Thai and Lao characters require
|
|
145 swapping.
|
|
146
|
|
147 - If a following pattern match is to be found, any position
|
|
148 within a contracting sequence except the first will fail.
|
|
149 Vice versa if a preceding pattern match is to be found,
|
|
150 an invalid starting point would be any character within a
|
|
151 contracting sequence except the last.
|
|
152
|
|
153 A breakiterator can be used if only matches at logical breaks are
|
|
154 desired. Using a breakiterator will only give you results that
|
|
155 exactly match the boundaries given by the breakiterator. For
|
|
156 instance the pattern "e" will not be found in the string "\u00e9"
|
|
157 if a character break iterator is used.
|
|
158
|
|
159 Options are provided to handle overlapping matches. E.g. In
|
|
160 English, overlapping matches produce the results 0 and 2 for
|
|
161 the pattern "abab" in the text "ababab", whereas mutually
|
|
162 exclusive matches only produce the result of 0.
|
|
163
|
|
164 Though collator attributes will be taken into consideration while
|
|
165 performing matches, there are no APIs here for setting and getting
|
|
166 the attributes. These attributes can be set by getting the collator
|
|
167 from getCollator() and using the APIs in UCollator. Lastly to update
|
|
168 String Search to the new collator attributes, reset() has to be called.
|
|
169
|
|
170 See http://oss.software.ibm.com/icu/apiref/usearch_8h.html for full
|
|
171 details.
|
|
172
|
|
173 *******************************************************************************/
|
|
174
|
|
class USearch : ICU
{
        // Native search handle returned by usearch_open /
        // usearch_openFromCollator; released in the destructor.
        private Handle          handle;

        // Break iterator most recently supplied by the caller (may be
        // null). Kept only so that getIterator() can hand it back.
        private UBreakIterator* iterator;

        // Done is returned by previous() and next() after all valid
        // matches have been returned, and by first() and last() if
        // there are no matches at all.
        const uint Done = uint.max;

        // Possible types of searches. Note that this class binds no
        // attribute getter/setter function, so these enums are
        // declared here but not used by any method below.
        public enum Attribute
        {
                Overlap,
                CanonicalMatch,
                Count
        }

        public enum AttributeValue
        {
                Default = -1,
                Off,
                On,
                Count
        }

        /***********************************************************************

                Create a search iterator using the language rule set
                of the given locale.

                The optional break iterator restricts where matches
                may be detected; pass null (the default) for none.

                An error reported by usearch_open is passed to
                testError() with the message "failed to open search".

        ***********************************************************************/

        this (UStringView pattern, UStringView text, inout ULocale locale, UBreakIterator* iterator = null)
        {
                UErrorCode e;

                this.iterator = iterator;
                handle = usearch_open (pattern.get.ptr, pattern.length, text.get.ptr, text.length, toString(locale.name), ( iterator is null ) ? null : iterator.handle, e);
                testError (e, "failed to open search");
        }

        /***********************************************************************

                Create a search iterator using the rules of the given
                collator.

                The optional break iterator restricts where matches
                may be detected; pass null (the default) for none.

                An error reported by usearch_openFromCollator is passed
                to testError() with the message "failed to open search
                from collator".

        ***********************************************************************/

        this (UStringView pattern, UStringView text, UCollator col, UBreakIterator* iterator = null)
        {
                UErrorCode e;

                this.iterator = iterator;
                handle = usearch_openFromCollator (pattern.get.ptr, pattern.length, text.get.ptr, text.length, col.handle, ( iterator is null ) ? null : iterator.handle, e);
                testError (e, "failed to open search from collator");
        }

        /***********************************************************************

                Close this USearch, releasing the native search handle.

        ***********************************************************************/

        ~this ()
        {
                usearch_close (handle);
        }

        /***********************************************************************

                Sets the current position in the text string which the
                next search will start from.

        ***********************************************************************/

        void setOffset (uint position)
        {
                UErrorCode e;

                usearch_setOffset (handle, position, e);
                testError (e, "failed to set search offset");
        }

        /***********************************************************************

                Return the current index in the string text being searched

        ***********************************************************************/

        uint getOffset ()
        {
                return usearch_getOffset (handle);
        }

        /***********************************************************************

                Returns the index to the match in the text string that was
                searched

        ***********************************************************************/

        uint getMatchedStart ()
        {
                return usearch_getMatchedStart (handle);
        }

        /***********************************************************************

                Returns the length of text in the string which matches the
                search pattern

        ***********************************************************************/

        uint getMatchedLength ()
        {
                return usearch_getMatchedLength (handle);
        }

        /***********************************************************************

                Returns the text that was matched by the most recent call to
                first(), next(), previous(), or last().

                The text is written into the supplied UString via its
                format() method, which is given the error message
                "failed to extract matched text".

        ***********************************************************************/

        void getMatchedText (UString s)
        {
                // Callback handed to UString.format(); it forwards the
                // destination buffer and its length to ICU.
                uint fmt (wchar* dst, uint length, inout UErrorCode e)
                {
                        return usearch_getMatchedText (handle, dst, length, e);
                }

                s.format (&fmt, "failed to extract matched text");
        }

        /***********************************************************************

                Set the string text to be searched.

        ***********************************************************************/

        void setText (UStringView t)
        {
                UErrorCode e;

                usearch_setText (handle, t.get.ptr, t.length, e);
                testError (e, "failed to set search text");
        }

        /***********************************************************************

                Return the string text to be searched. Note that this
                returns a read-only reference to the search text: the
                view is built over the buffer returned by ICU, without
                copying it.

        ***********************************************************************/

        UStringView getText ()
        {
                uint len;

                wchar *x = usearch_getText (handle, &len);
                return new UStringView (x[0..len]);
        }

        /***********************************************************************

                Sets the pattern used for matching

        ***********************************************************************/

        void setPattern (UStringView t)
        {
                UErrorCode e;

                usearch_setPattern (handle, t.get.ptr, t.length, e);
                testError (e, "failed to set search pattern");
        }

        /***********************************************************************

                Gets the search pattern. Note that this returns a
                read-only reference to the pattern: the view is built
                over the buffer returned by ICU, without copying it.

        ***********************************************************************/

        UStringView getPattern ()
        {
                uint len;

                wchar *x = usearch_getPattern (handle, &len);
                return new UStringView (x[0..len]);
        }

        /***********************************************************************

                Set the BreakIterator that will be used to restrict the
                points at which matches are detected.

                Passing null removes any break iterator, mirroring the
                way the constructors treat a null iterator.

        ***********************************************************************/

        void setIterator (UBreakIterator* iterator)
        {
                UErrorCode e;

                // Never dereference a null iterator; hand ICU a null
                // handle instead, exactly as the constructors do.
                Handle breakHandle = (iterator is null) ? cast(Handle) null
                                                        : cast(Handle) iterator.handle;

                this.iterator = iterator;
                usearch_setBreakIterator (handle, breakHandle, e);
                testError (e, "failed to set search iterator");
        }

        /***********************************************************************

                Get the BreakIterator that will be used to restrict the
                points at which matches are detected. This is the
                pointer last given to a constructor or to setIterator(),
                and may be null.

        ***********************************************************************/

        UBreakIterator* getIterator ()
        {
                return iterator;
        }

        /***********************************************************************

                Returns the first index at which the string text matches
                the search pattern

        ***********************************************************************/

        uint first ()
        {
                UErrorCode e;

                uint x = usearch_first (handle, e);
                testError (e, "failed on first search");
                return x;
        }

        /***********************************************************************

                Returns the last index in the target text at which it
                matches the search pattern

        ***********************************************************************/

        uint last ()
        {
                UErrorCode e;

                uint x = usearch_last (handle, e);
                testError (e, "failed on last search");
                return x;
        }

        /***********************************************************************

                Returns the index of the next point at which the string
                text matches the search pattern, starting from the current
                position.

                If pos is specified, returns the first index greater than
                pos at which the string text matches the search pattern

                The default value uint.max acts as the "pos not
                specified" marker, so it cannot be used as an explicit
                position.

        ***********************************************************************/

        uint next (uint pos = uint.max)
        {
                UErrorCode e;
                uint       x;

                x = (pos == uint.max) ? usearch_next (handle, e) :
                                        usearch_following (handle, pos, e);

                testError (e, "failed on next search");
                return x;
        }

        /***********************************************************************

                Returns the index of the previous point at which the
                string text matches the search pattern, starting at
                the current position.

                If pos is specified, returns the first index less
                than pos at which the string text matches the search
                pattern.

                The default value uint.max acts as the "pos not
                specified" marker, so it cannot be used as an explicit
                position.

        ***********************************************************************/

        uint previous (uint pos = uint.max)
        {
                UErrorCode e;
                uint       x;

                x = (pos == uint.max) ? usearch_previous (handle, e) :
                                        usearch_preceding (handle, pos, e);

                // Report the failure under this method's own name; the
                // message previously read "next", copied from next().
                testError (e, "failed on previous search");
                return x;
        }

        /***********************************************************************

                Search will begin at the start of the text string if a
                forward iteration is initiated before a backwards iteration.
                Otherwise if a backwards iteration is initiated before a
                forwards iteration, the search will begin at the end of the
                text string

        ***********************************************************************/

        void reset ()
        {
                usearch_reset (handle);
        }

        /***********************************************************************

                Gets the collator used for the language rules.

                A new UCollator wrapper is created around the handle
                returned by usearch_getCollator on every call.

                NOTE(review): confirm that a UCollator built from a
                raw handle does not close that handle when it is
                destroyed, since the handle comes from this search.

        ***********************************************************************/

        UCollator getCollator ()
        {
                return new UCollator (usearch_getCollator (handle));
        }

        /***********************************************************************

                Sets the collator used for the language rules. This
                method causes internal data such as Boyer-Moore shift
                tables to be recalculated, but the iterator's position
                is unchanged

        ***********************************************************************/

        void setCollator (UCollator col)
        {
                UErrorCode e;

                usearch_setCollator (handle, col.handle, e);
                testError (e, "failed to set search collator");
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        // Value returned by FunctionLoader.bind(); handed back to
        // FunctionLoader.unbind() in the static destructor.
        private static void* library;

        /***********************************************************************

                Function pointers for the ICU usearch_* entry points.
                They are filled in by the static constructor through
                the 'targets' table below.

        ***********************************************************************/

        private static extern (C)
        {
                Handle function (wchar*, uint, wchar*, uint, char*, void*, inout UErrorCode) usearch_open;
                Handle function (wchar*, uint, wchar*, uint, Handle, void*, inout UErrorCode) usearch_openFromCollator;
                void   function (Handle) usearch_close;
                void   function (Handle, uint, inout UErrorCode) usearch_setOffset;
                uint   function (Handle) usearch_getOffset;
                uint   function (Handle) usearch_getMatchedStart;
                uint   function (Handle) usearch_getMatchedLength;
                uint   function (Handle, wchar*, uint, inout UErrorCode) usearch_getMatchedText;
                void   function (Handle, wchar*, uint, inout UErrorCode) usearch_setText;
                wchar* function (Handle, uint*) usearch_getText;
                void   function (Handle, wchar*, uint, inout UErrorCode) usearch_setPattern;
                wchar* function (Handle, uint*) usearch_getPattern;
                uint   function (Handle, inout UErrorCode) usearch_first;
                uint   function (Handle, inout UErrorCode) usearch_last;
                uint   function (Handle, inout UErrorCode) usearch_next;
                uint   function (Handle, inout UErrorCode) usearch_previous;
                uint   function (Handle, uint, inout UErrorCode) usearch_following;
                uint   function (Handle, uint, inout UErrorCode) usearch_preceding;
                void   function (Handle) usearch_reset;
                void   function (Handle, Handle, inout UErrorCode) usearch_setBreakIterator;
                Handle function (Handle) usearch_getCollator;
                void   function (Handle, Handle, inout UErrorCode) usearch_setCollator;
        }

        /***********************************************************************

                Binding table: pairs the address of each function
                pointer above with the symbol name to bind it to.

        ***********************************************************************/

        static  FunctionLoader.Bind[] targets =
                [
                {cast(void**) &usearch_open,             "usearch_open"},
                {cast(void**) &usearch_openFromCollator, "usearch_openFromCollator"},
                {cast(void**) &usearch_close,            "usearch_close"},
                {cast(void**) &usearch_setOffset,        "usearch_setOffset"},
                {cast(void**) &usearch_getOffset,        "usearch_getOffset"},
                {cast(void**) &usearch_getMatchedStart,  "usearch_getMatchedStart"},
                {cast(void**) &usearch_getMatchedLength, "usearch_getMatchedLength"},
                {cast(void**) &usearch_getMatchedText,   "usearch_getMatchedText"},
                {cast(void**) &usearch_setText,          "usearch_setText"},
                {cast(void**) &usearch_getText,          "usearch_getText"},
                {cast(void**) &usearch_setPattern,       "usearch_setPattern"},
                {cast(void**) &usearch_getPattern,       "usearch_getPattern"},
                {cast(void**) &usearch_first,            "usearch_first"},
                {cast(void**) &usearch_last,             "usearch_last"},
                {cast(void**) &usearch_next,             "usearch_next"},
                {cast(void**) &usearch_previous,         "usearch_previous"},
                {cast(void**) &usearch_following,        "usearch_following"},
                {cast(void**) &usearch_preceding,        "usearch_preceding"},
                {cast(void**) &usearch_reset,            "usearch_reset"},
                {cast(void**) &usearch_setBreakIterator, "usearch_setBreakIterator"},
                {cast(void**) &usearch_getCollator,      "usearch_getCollator"},
                {cast(void**) &usearch_setCollator,      "usearch_setCollator"},
                ];

        /***********************************************************************

                Bind every entry of 'targets' via FunctionLoader.bind
                when the module is initialised.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuin, targets);
        }

        /***********************************************************************

                Release the binding via FunctionLoader.unbind when the
                module is torn down.

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}
|