comparison base/src/java/mangoicu/USearch.d @ 27:1bf55a6eb092

Renamed java tree to base
author Frank Benoit <benoit@tionex.de>
date Sat, 21 Mar 2009 11:33:57 +0100
parents java/src/java/mangoicu/USearch.d@dbfb303e8fb0
children
comparison
equal deleted inserted replaced
26:f589fc20a5f9 27:1bf55a6eb092
1 /*******************************************************************************
2
3 @file USearch.d
4
5 Copyright (c) 2004 Kris Bell
6
7 This software is provided 'as-is', without any express or implied
8 warranty. In no event will the authors be held liable for damages
9 of any kind arising from the use of this software.
10
11 Permission is hereby granted to anyone to use this software for any
12 purpose, including commercial applications, and to alter it and/or
13 redistribute it freely, subject to the following restrictions:
14
15 1. The origin of this software must not be misrepresented; you must
16 not claim that you wrote the original software. If you use this
17 software in a product, an acknowledgment within documentation of
18 said product would be appreciated but is not required.
19
20 2. Altered source versions must be plainly marked as such, and must
21 not be misrepresented as being the original software.
22
23 3. This notice may not be removed or altered from any distribution
24 of the source.
25
26 4. Derivative works are permitted, but they must carry this notice
27 in full and credit the original source.
28
29
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31
32
33 @version Initial version, November 2004
34 @author Kris
35
36 Note that this package and documentation is built around the ICU
37 project (http://oss.software.ibm.com/icu/). Below is the license
38 statement as specified by that software:
39
40
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42
43
44 ICU License - ICU 1.8.1 and later
45
46 COPYRIGHT AND PERMISSION NOTICE
47
48 Copyright (c) 1995-2003 International Business Machines Corporation and
49 others.
50
51 All rights reserved.
52
53 Permission is hereby granted, free of charge, to any person obtaining a
54 copy of this software and associated documentation files (the
55 "Software"), to deal in the Software without restriction, including
56 without limitation the rights to use, copy, modify, merge, publish,
57 distribute, and/or sell copies of the Software, and to permit persons
58 to whom the Software is furnished to do so, provided that the above
59 copyright notice(s) and this permission notice appear in all copies of
60 the Software and that both the above copyright notice(s) and this
61 permission notice appear in supporting documentation.
62
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
72
73 Except as contained in this notice, the name of a copyright holder
74 shall not be used in advertising or otherwise to promote the sale, use
75 or other dealings in this Software without prior written authorization
76 of the copyright holder.
77
78 ----------------------------------------------------------------------
79
80 All trademarks and registered trademarks mentioned herein are the
81 property of their respective owners.
82
83 *******************************************************************************/
84
85 module java.mangoicu.USearch;
86
87 private import java.mangoicu.ICU;
88
89 public import java.mangoicu.ULocale,
90 java.mangoicu.UString,
91 java.mangoicu.UCollator,
92 java.mangoicu.UBreakIterator;
93
94 /*******************************************************************************
95
96 Apis for an engine that provides language-sensitive text
97 searching based on the comparison rules defined in a UCollator
98 data struct. This ensures that language eccentricity can be handled,
99 e.g. for the German collator, characters ß and SS will be matched
100 if case is chosen to be ignored. See the "ICU Collation Design
101 Document" for more information.
102
103 The algorithm implemented is a modified form of the Boyer-Moore
104 search. For further information on the algorithm, see "Efficient
105 Text Searching in Java", published in Java Report in February
106 1999.
107
108 There are 2 match options for selection: Let S' be the sub-string
109 of a text string S between the offsets start and end <start, end>. A
110 pattern string P matches a text string S at the offsets <start, end> if
111
112 - option 1. Some canonical equivalent of P matches some canonical
113 equivalent of S'
114
115 - option 2. P matches S' and if P starts or ends with a combining
116 mark, there exists no non-ignorable combining mark before
117 or after S' in S respectively.
118
119 Option 2 will be the default
120
121 This search has APIs similar to that of other text iteration
122 mechanisms such as the break iterators in ubrk.h. Using these
123 APIs, it is easy to scan through text looking for all occurrences
124 of a given pattern. This search iterator allows changing of
125 direction by calling a reset followed by a next or previous.
126 Though a direction change can occur without calling reset first,
127 this operation comes with some speed penalty. Generally, match
128 results in the forward direction will equal the match results
129 in the backwards direction, in reverse order.
130
131 USearch provides APIs to specify the starting position within
132 the text string to be searched, e.g. setOffset(), previous(x)
133 and next(x). Since the starting position will be set as it
134 is specified, please take note that there are some dangerous
135 positions at which the search may render incorrect results:
136
137 - The midst of a substring that requires normalization.
138
139 - If the following match is to be found, the position should
140 not be the second character which requires to be swapped
141 with the preceding character. Vice versa, if the preceding
142 match is to be found, position to search from should not be
143 the first character which requires to be swapped with the
144 next character. E.g certain Thai and Lao characters require
145 swapping.
146
147 - If a following pattern match is to be found, any position
148 within a contracting sequence except the first will fail.
149 Vice versa if a preceding pattern match is to be found,
150 an invalid starting point would be any character within a
151 contracting sequence except the last.
152
153 A breakiterator can be used if only matches at logical breaks are
154 desired. Using a breakiterator will only give you results that
155 exactly match the boundaries given by the breakiterator. For
156 instance the pattern "e" will not be found in the string "\u00e9"
157 if a character break iterator is used.
158
159 Options are provided to handle overlapping matches. E.g. In
160 English, overlapping matches produce the results 0 and 2 for
161 the pattern "abab" in the text "ababab", whereas mutually
162 exclusive matches only produce the result 0.
163
164 Though collator attributes will be taken into consideration while
165 performing matches, there are no APIs here for setting and getting
166 the attributes. These attributes can be set by getting the collator
167 from getCollator() and using the APIs in UCollator. Lastly to update
168 String Search to the new collator attributes, reset() has to be called.
169
170 See http://oss.software.ibm.com/icu/apiref/usearch_8h.html for full
171 details.
172
173 *******************************************************************************/
174
175 class USearch : ICU
176 {
177 private Handle handle;
178 private UBreakIterator* iterator;
179
180 // Done is returned by previous() and next() after all valid
181 // matches have been returned, and by first() and last() if
182 // there are no matches at all.
183 const uint Done = uint.max;
184
185 //Possible types of searches
186 public enum Attribute
187 {
188 Overlap,
189 CanonicalMatch,
190 Count
191 }
192
193 public enum AttributeValue
194 {
195 Default = -1,
196 Off,
197 On,
198 Count
199 }
200
201 /***********************************************************************
202
203 Creating a search iterator data struct using the argument
204 locale language rule set
205
206 ***********************************************************************/
207
208 this (UStringView pattern, UStringView text, inout ULocale locale, UBreakIterator* iterator = null)
209 {
210 UErrorCode e;
211
212 this.iterator = iterator;
213 handle = usearch_open (pattern.get.ptr, pattern.length, text.get.ptr, text.length, toString(locale.name), ( iterator is null ) ? null : iterator.handle, e);
214 testError (e, "failed to open search");
215 }
216
217 /***********************************************************************
218
219 Creating a search iterator data struct using the argument
220 collator language rule set
221
222 ***********************************************************************/
223
224 this (UStringView pattern, UStringView text, UCollator col, UBreakIterator* iterator = null)
225 {
226 UErrorCode e;
227
228 this.iterator = iterator;
229 handle = usearch_openFromCollator (pattern.get.ptr, pattern.length, text.get.ptr, text.length, col.handle, ( iterator is null ) ? null : iterator.handle, e);
230 testError (e, "failed to open search from collator");
231 }
232
233 /***********************************************************************
234
235 Close this USearch
236
237 ***********************************************************************/
238
239 ~this ()
240 {
241 usearch_close (handle);
242 }
243
244 /***********************************************************************
245
246 Sets the current position in the text string which the
247 next search will start from.
248
249 ***********************************************************************/
250
251 void setOffset (uint position)
252 {
253 UErrorCode e;
254
255 usearch_setOffset (handle, position, e);
256 testError (e, "failed to set search offset");
257 }
258
259 /***********************************************************************
260
261 Return the current index in the string text being searched
262
263 ***********************************************************************/
264
265 uint getOffset ()
266 {
267 return usearch_getOffset (handle);
268 }
269
270 /***********************************************************************
271
272 Returns the index to the match in the text string that was
273 searched
274
275 ***********************************************************************/
276
277 uint getMatchedStart ()
278 {
279 return usearch_getMatchedStart (handle);
280 }
281
282 /***********************************************************************
283
284 Returns the length of text in the string which matches the
285 search pattern
286
287 ***********************************************************************/
288
289 uint getMatchedLength ()
290 {
291 return usearch_getMatchedLength (handle);
292 }
293
294 /***********************************************************************
295
296 Returns the text that was matched by the most recent call to
297 first(), next(), previous(), or last().
298
299 ***********************************************************************/
300
301 void getMatchedText (UString s)
302 {
303 uint fmt (wchar* dst, uint length, inout UErrorCode e)
304 {
305 return usearch_getMatchedText (handle, dst, length, e);
306 }
307
308 s.format (&fmt, "failed to extract matched text");
309 }
310
311 /***********************************************************************
312
313 Set the string text to be searched.
314
315 ***********************************************************************/
316
317 void setText (UStringView t)
318 {
319 UErrorCode e;
320
321 usearch_setText (handle, t.get.ptr, t.length, e);
322 testError (e, "failed to set search text");
323 }
324
325 /***********************************************************************
326
327 Return the string text to be searched. Note that this
328 returns a read-only reference to the search text.
329
330 ***********************************************************************/
331
332 UStringView getText ()
333 {
334 uint len;
335
336 wchar *x = usearch_getText (handle, &len);
337 return new UStringView (x[0..len]);
338 }
339
340 /***********************************************************************
341
342 Sets the pattern used for matching
343
344 ***********************************************************************/
345
346 void setPattern (UStringView t)
347 {
348 UErrorCode e;
349
350 usearch_setPattern (handle, t.get.ptr, t.length, e);
351 testError (e, "failed to set search pattern");
352 }
353
354 /***********************************************************************
355
356 Gets the search pattern. Note that this returns a
357 read-only reference to the pattern.
358
359 ***********************************************************************/
360
361 UStringView getPattern ()
362 {
363 uint len;
364
365 wchar *x = usearch_getPattern (handle, &len);
366 return new UStringView (x[0..len]);
367 }
368
369 /***********************************************************************
370
371 Set the BreakIterator used to restrict the points at which
372 matches are detected; pass null to disable break detection.
373
374 ***********************************************************************/
375
376 void setIterator (UBreakIterator* iterator)
377 {
378 UErrorCode e;
379 this.iterator = iterator;
380 // a null iterator is legal (as in the constructors): hand ICU a null handle
381 usearch_setBreakIterator (handle, cast(Handle) (( iterator is null ) ? null : iterator.handle), e);
382 testError (e, "failed to set search iterator");
383 }
384
385 /***********************************************************************
386
387 Get the BreakIterator that will be used to restrict the
388 points at which matches are detected.
389
390 ***********************************************************************/
391
392 UBreakIterator* getIterator ()
393 {
394 return iterator;
395 }
396
397 /***********************************************************************
398
399 Returns the first index at which the string text matches
400 the search pattern
401
402 ***********************************************************************/
403
404 uint first ()
405 {
406 UErrorCode e;
407
408 uint x = usearch_first (handle, e);
409 testError (e, "failed on first search");
410 return x;
411 }
412
413 /***********************************************************************
414
415 Returns the last index in the target text at which it
416 matches the search pattern
417
418 ***********************************************************************/
419
420 uint last ()
421 {
422 UErrorCode e;
423
424 uint x = usearch_last (handle, e);
425 testError (e, "failed on last search");
426 return x;
427 }
428
429 /***********************************************************************
430
431 Returns the index of the next point at which the string
432 text matches the search pattern, starting from the current
433 position.
434
435 If pos is specified, returns the first index greater than
436 pos at which the string text matches the search pattern
437
438 ***********************************************************************/
439
440 uint next (uint pos = uint.max)
441 {
442 UErrorCode e;
443 uint x;
444
445 x = (pos == uint.max) ? usearch_next (handle, e) :
446 usearch_following (handle, pos, e);
447
448 testError (e, "failed on next search");
449 return x;
450 }
451
452 /***********************************************************************
453
454 Returns the index of the previous point at which the
455 string text matches the search pattern, starting at
456 the current position.
457
458 If pos is specified (i.e. is not uint.max), returns the
459 first index less than pos at which the string text
460 matches the search pattern.
461
462 ***********************************************************************/
463
464 uint previous (uint pos = uint.max)
465 {
466 UErrorCode e;
467 uint x;
468 // uint.max means no position was given: step back from the current one
469 x = (pos == uint.max) ? usearch_previous (handle, e) :
470 usearch_preceding (handle, pos, e);
471
472 testError (e, "failed on previous search");
473 return x;
474 }
475
476 /***********************************************************************
477
478 Search will begin at the start of the text string if a
479 forward iteration is initiated before a backwards iteration.
480 Otherwise if a backwards iteration is initiated before a
481 forwards iteration, the search will begin at the end of the
482 text string
483
484 ***********************************************************************/
485
486 void reset ()
487 {
488 usearch_reset (handle);
489 }
490
491 /***********************************************************************
492
493 Gets the collator used for the language rules.
494
495 ***********************************************************************/
496
497 UCollator getCollator ()
498 {
499 return new UCollator (usearch_getCollator (handle));
500 }
501
502 /***********************************************************************
503
504 Sets the collator used for the language rules. This
505 method causes internal data such as Boyer-Moore shift
506 tables to be recalculated, but the iterator's position
507 is unchanged
508
509 ***********************************************************************/
510
511 void setCollator (UCollator col)
512 {
513 UErrorCode e;
514
515 usearch_setCollator (handle, col.handle, e);
516 testError (e, "failed to set search collator");
517 }
518
519
520 /***********************************************************************
521
522 Bind the ICU functions from a shared library. This is
523 complicated by the issues regarding D and DLLs on the
524 Windows platform
525
526 ***********************************************************************/
527
528 private static void* library;
529
530 /***********************************************************************
531
532 ***********************************************************************/
533
534 private static extern (C)
535 {
536 Handle function (wchar*, uint, wchar*, uint, char*, void*, inout UErrorCode) usearch_open;
537 Handle function (wchar*, uint, wchar*, uint, Handle, void*, inout UErrorCode) usearch_openFromCollator;
538 void function (Handle) usearch_close;
539 void function (Handle, uint, inout UErrorCode) usearch_setOffset;
540 uint function (Handle) usearch_getOffset;
541 uint function (Handle) usearch_getMatchedStart;
542 uint function (Handle) usearch_getMatchedLength;
543 uint function (Handle, wchar*, uint, inout UErrorCode) usearch_getMatchedText;
544 void function (Handle, wchar*, uint, inout UErrorCode) usearch_setText;
545 wchar* function (Handle, uint*) usearch_getText;
546 void function (Handle, wchar*, uint, inout UErrorCode) usearch_setPattern;
547 wchar* function (Handle, uint*) usearch_getPattern;
548 uint function (Handle, inout UErrorCode) usearch_first;
549 uint function (Handle, inout UErrorCode) usearch_last;
550 uint function (Handle, inout UErrorCode) usearch_next;
551 uint function (Handle, inout UErrorCode) usearch_previous;
552 uint function (Handle, uint, inout UErrorCode) usearch_following;
553 uint function (Handle, uint, inout UErrorCode) usearch_preceding;
554 void function (Handle) usearch_reset;
555 void function (Handle, Handle, inout UErrorCode) usearch_setBreakIterator;
556 Handle function (Handle) usearch_getCollator;
557 void function (Handle, Handle, inout UErrorCode) usearch_setCollator;
558 }
559
560 /***********************************************************************
561
562 ***********************************************************************/
563
564 static FunctionLoader.Bind[] targets =
565 [
566 {cast(void**) &usearch_open, "usearch_open"},
567 {cast(void**) &usearch_openFromCollator, "usearch_openFromCollator"},
568 {cast(void**) &usearch_close, "usearch_close"},
569 {cast(void**) &usearch_setOffset, "usearch_setOffset"},
570 {cast(void**) &usearch_getOffset, "usearch_getOffset"},
571 {cast(void**) &usearch_getMatchedStart, "usearch_getMatchedStart"},
572 {cast(void**) &usearch_getMatchedLength, "usearch_getMatchedLength"},
573 {cast(void**) &usearch_getMatchedText, "usearch_getMatchedText"},
574 {cast(void**) &usearch_setText, "usearch_setText"},
575 {cast(void**) &usearch_getText, "usearch_getText"},
576 {cast(void**) &usearch_setPattern, "usearch_setPattern"},
577 {cast(void**) &usearch_getPattern, "usearch_getPattern"},
578 {cast(void**) &usearch_first, "usearch_first"},
579 {cast(void**) &usearch_last, "usearch_last"},
580 {cast(void**) &usearch_next, "usearch_next"},
581 {cast(void**) &usearch_previous, "usearch_previous"},
582 {cast(void**) &usearch_following, "usearch_following"},
583 {cast(void**) &usearch_preceding, "usearch_preceding"},
584 {cast(void**) &usearch_reset, "usearch_reset"},
585 {cast(void**) &usearch_setBreakIterator, "usearch_setBreakIterator"},
586 {cast(void**) &usearch_getCollator, "usearch_getCollator"},
587 {cast(void**) &usearch_setCollator, "usearch_setCollator"},
588 ];
589
590 /***********************************************************************
591
592 ***********************************************************************/
593
594 static this ()
595 {
596 library = FunctionLoader.bind (icuin, targets);
597 }
598
599 /***********************************************************************
600
601 ***********************************************************************/
602
603 static ~this ()
604 {
605 FunctionLoader.unbind (library);
606 }
607 }