Mercurial > projects > dwt2
comparison base/src/java/mangoicu/USearch.d @ 27:1bf55a6eb092
Renamed java tree to base
author | Frank Benoit <benoit@tionex.de> |
---|---|
date | Sat, 21 Mar 2009 11:33:57 +0100 |
parents | java/src/java/mangoicu/USearch.d@dbfb303e8fb0 |
children |
comparison
equal
deleted
inserted
replaced
26:f589fc20a5f9 | 27:1bf55a6eb092 |
---|---|
1 /******************************************************************************* | |
2 | |
3 @file USearch.d | |
4 | |
5 Copyright (c) 2004 Kris Bell | |
6 | |
7 This software is provided 'as-is', without any express or implied | |
8 warranty. In no event will the authors be held liable for damages | |
9 of any kind arising from the use of this software. | |
10 | |
11 Permission is hereby granted to anyone to use this software for any | |
12 purpose, including commercial applications, and to alter it and/or | |
13 redistribute it freely, subject to the following restrictions: | |
14 | |
15 1. The origin of this software must not be misrepresented; you must | |
16 not claim that you wrote the original software. If you use this | |
17 software in a product, an acknowledgment within documentation of | |
18 said product would be appreciated but is not required. | |
19 | |
20 2. Altered source versions must be plainly marked as such, and must | |
21 not be misrepresented as being the original software. | |
22 | |
23 3. This notice may not be removed or altered from any distribution | |
24 of the source. | |
25 | |
26 4. Derivative works are permitted, but they must carry this notice | |
27 in full and credit the original source. | |
28 | |
29 | |
30 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
31 | |
32 | |
33 @version Initial version, November 2004 | |
34 @author Kris | |
35 | |
36 Note that this package and documentation is built around the ICU | |
37 project (http://oss.software.ibm.com/icu/). Below is the license | |
38 statement as specified by that software: | |
39 | |
40 | |
41 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
42 | |
43 | |
44 ICU License - ICU 1.8.1 and later | |
45 | |
46 COPYRIGHT AND PERMISSION NOTICE | |
47 | |
48 Copyright (c) 1995-2003 International Business Machines Corporation and | |
49 others. | |
50 | |
51 All rights reserved. | |
52 | |
53 Permission is hereby granted, free of charge, to any person obtaining a | |
54 copy of this software and associated documentation files (the | |
55 "Software"), to deal in the Software without restriction, including | |
56 without limitation the rights to use, copy, modify, merge, publish, | |
57 distribute, and/or sell copies of the Software, and to permit persons | |
58 to whom the Software is furnished to do so, provided that the above | |
59 copyright notice(s) and this permission notice appear in all copies of | |
60 the Software and that both the above copyright notice(s) and this | |
61 permission notice appear in supporting documentation. | |
62 | |
63 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | |
64 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
65 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT | |
66 OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR | |
67 HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL | |
68 INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING | |
69 FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, | |
70 NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION | |
71 WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
72 | |
73 Except as contained in this notice, the name of a copyright holder | |
74 shall not be used in advertising or otherwise to promote the sale, use | |
75 or other dealings in this Software without prior written authorization | |
76 of the copyright holder. | |
77 | |
78 ---------------------------------------------------------------------- | |
79 | |
80 All trademarks and registered trademarks mentioned herein are the | |
81 property of their respective owners. | |
82 | |
83 *******************************************************************************/ | |
84 | |
85 module java.mangoicu.USearch; | |
86 | |
87 private import java.mangoicu.ICU; | |
88 | |
89 public import java.mangoicu.ULocale, | |
90 java.mangoicu.UString, | |
91 java.mangoicu.UCollator, | |
92 java.mangoicu.UBreakIterator; | |
93 | |
94 /******************************************************************************* | |
95 | |
96 Apis for an engine that provides language-sensitive text | |
97 searching based on the comparison rules defined in a UCollator | |
98 data struct. This ensures that language eccentricity can be handled, | |
99 e.g. for the German collator, characters ß and SS will be matched | |
100 if case is chosen to be ignored. See the "ICU Collation Design | |
101 Document" for more information. | |
102 | |
103 The algorithm implemented is a modified form of the Boyer-Moore | |
104 search. For further information on the algorithm, see "Efficient | |
105 Text Searching in Java", published in Java Report in February, | |
106 1999. | |
107 | |
108 There are 2 match options for selection: Let S' be the sub-string | |
109 of a text string S between the offsets start and end <start, end>. A | |
110 pattern string P matches a text string S at the offsets <start, end> if | |
111 | |
112 - option 1. Some canonical equivalent of P matches some canonical | |
113 equivalent of S' | |
114 | |
115 - option 2. P matches S' and if P starts or ends with a combining | |
116 mark, there exists no non-ignorable combining mark before | |
117 or after S' in S respectively. | |
118 | |
119 Option 2 will be the default | |
120 | |
121 This search has APIs similar to that of other text iteration | |
122 mechanisms such as the break iterators in ubrk.h. Using these | |
123 APIs, it is easy to scan through text looking for all occurrences | |
124 of a given pattern. This search iterator allows changing of | |
125 direction by calling a reset followed by a next or previous. | |
126 Though a direction change can occur without calling reset first, | |
127 this operation comes with some speed penalty. Generally, match | |
128 results in the forward direction will match the result matches | |
129 in the backwards direction in the reverse order | |
130 | |
131 USearch provides APIs to specify the starting position within | |
132 the text string to be searched, e.g. setOffset(), previous(x) | |
133 and next(x). Since the starting position will be set as it | |
134 is specified, please take note that there are some dangerous | |
135 positions at which the search may render incorrect results: | |
136 | |
137 - The midst of a substring that requires normalization. | |
138 | |
139 - If the following match is to be found, the position should | |
140 not be the second character which requires to be swapped | |
141 with the preceding character. Vice versa, if the preceding | |
142 match is to be found, position to search from should not be | |
143 the first character which requires to be swapped with the | |
144 next character. E.g certain Thai and Lao characters require | |
145 swapping. | |
146 | |
147 - If a following pattern match is to be found, any position | |
148 within a contracting sequence except the first will fail. | |
149 Vice versa if a preceding pattern match is to be found, | |
150 an invalid starting point would be any character within a | |
151 contracting sequence except the last. | |
152 | |
153 A breakiterator can be used if only matches at logical breaks are | |
154 desired. Using a breakiterator will only give you results that | |
155 exactly match the boundaries given by the breakiterator. For | |
156 instance the pattern "e" will not be found in the string "\u00e9" | |
157 if a character break iterator is used. | |
158 | |
159 Options are provided to handle overlapping matches. E.g. In | |
160 English, overlapping matches produces the result 0 and 2 for | |
161 the pattern "abab" in the text "ababab", whereas mutually | |
162 exclusive matches only produce the result of 0. | |
163 | |
164 Though collator attributes will be taken into consideration while | |
165 performing matches, there are no APIs here for setting and getting | |
166 the attributes. These attributes can be set by getting the collator | |
167 from getCollator() and using the APIs in UCollator. Lastly to update | |
168 String Search to the new collator attributes, reset() has to be called. | |
169 | |
170 See http://oss.software.ibm.com/icu/apiref/usearch_8h.html for full | |
171 details. | |
172 | |
173 *******************************************************************************/ | |
174 | |
class USearch : ICU
{
        // Native search handle produced by usearch_open() or
        // usearch_openFromCollator(); released in the destructor.
        private Handle          handle;

        // Break iterator most recently supplied by the caller (may be
        // null). Only remembered so that getIterator() can return it.
        private UBreakIterator* iterator;

        // Done is returned by previous() and next() after all valid
        // matches have been returned, and by first() and last() if
        // there are no matches at all.
        //
        // The same value (uint.max) doubles as the "no position given"
        // default for the pos argument of next() and previous().
        const uint Done = uint.max;

        // Possible types of searches
        public enum Attribute
        {
                Overlap,
                CanonicalMatch,
                Count
        }

        // Values that may be assigned to an Attribute
        public enum AttributeValue
        {
                Default = -1,
                Off,
                On,
                Count
        }

        /***********************************************************************

                Create a search iterator using the language rule set of
                the given locale.

                Params:
                pattern  = the pattern to look for
                text     = the text to be searched
                locale   = locale whose name selects the rule set
                iterator = optional break iterator restricting where
                           matches may start and end; when null, a
                           null break-iterator handle is handed to ICU

                Any error status set by usearch_open is passed to
                testError with the message "failed to open search".

        ***********************************************************************/

        this (UStringView pattern, UStringView text, inout ULocale locale, UBreakIterator* iterator = null)
        {
                UErrorCode e;

                this.iterator = iterator;
                handle = usearch_open (pattern.get.ptr, pattern.length,
                                       text.get.ptr, text.length,
                                       toString (locale.name),
                                       (iterator is null) ? null : iterator.handle,
                                       e);
                testError (e, "failed to open search");
        }

        /***********************************************************************

                Create a search iterator using the rules of an existing
                collator.

                Params:
                pattern  = the pattern to look for
                text     = the text to be searched
                col      = collator supplying the comparison rules
                iterator = optional break iterator restricting where
                           matches may start and end; when null, a
                           null break-iterator handle is handed to ICU

                Any error status set by usearch_openFromCollator is
                passed to testError with the message "failed to open
                search from collator".

        ***********************************************************************/

        this (UStringView pattern, UStringView text, UCollator col, UBreakIterator* iterator = null)
        {
                UErrorCode e;

                this.iterator = iterator;
                handle = usearch_openFromCollator (pattern.get.ptr, pattern.length,
                                                   text.get.ptr, text.length,
                                                   col.handle,
                                                   (iterator is null) ? null : iterator.handle,
                                                   e);
                testError (e, "failed to open search from collator");
        }

        /***********************************************************************

                Close this USearch, releasing the native search handle.

        ***********************************************************************/

        ~this ()
        {
                usearch_close (handle);
        }

        /***********************************************************************

                Sets the current position in the text string which the
                next search will start from.

                Params:
                position = offset into the text being searched

        ***********************************************************************/

        void setOffset (uint position)
        {
                UErrorCode e;

                usearch_setOffset (handle, position, e);
                testError (e, "failed to set search offset");
        }

        /***********************************************************************

                Return the current index in the string text being searched

        ***********************************************************************/

        uint getOffset ()
        {
                return usearch_getOffset (handle);
        }

        /***********************************************************************

                Returns the index to the match in the text string that was
                searched

        ***********************************************************************/

        uint getMatchedStart ()
        {
                return usearch_getMatchedStart (handle);
        }

        /***********************************************************************

                Returns the length of text in the string which matches the
                search pattern

        ***********************************************************************/

        uint getMatchedLength ()
        {
                return usearch_getMatchedLength (handle);
        }

        /***********************************************************************

                Retrieves the text that was matched by the most recent
                call to first(), next(), previous(), or last().

                Params:
                s = destination string; it is filled via s.format(),
                    which is handed a callback that writes the matched
                    text into the buffer it is given

        ***********************************************************************/

        void getMatchedText (UString s)
        {
                // Callback invoked by UString.format with a destination
                // buffer and its capacity.
                uint fmt (wchar* dst, uint length, inout UErrorCode e)
                {
                        return usearch_getMatchedText (handle, dst, length, e);
                }

                s.format (&fmt, "failed to extract matched text");
        }

        /***********************************************************************

                Set the string text to be searched.

                Params:
                t = the new text to search

        ***********************************************************************/

        void setText (UStringView t)
        {
                UErrorCode e;

                usearch_setText (handle, t.get.ptr, t.length, e);
                testError (e, "failed to set search text");
        }

        /***********************************************************************

                Return the string text to be searched. Note that this
                returns a read-only reference to the search text: the
                view wraps the buffer reported by ICU rather than a copy.

        ***********************************************************************/

        UStringView getText ()
        {
                uint len;

                wchar* x = usearch_getText (handle, &len);
                return new UStringView (x[0..len]);
        }

        /***********************************************************************

                Sets the pattern used for matching

                Params:
                t = the new pattern

        ***********************************************************************/

        void setPattern (UStringView t)
        {
                UErrorCode e;

                usearch_setPattern (handle, t.get.ptr, t.length, e);
                testError (e, "failed to set search pattern");
        }

        /***********************************************************************

                Gets the search pattern. Note that this returns a
                read-only reference to the pattern: the view wraps the
                buffer reported by ICU rather than a copy.

        ***********************************************************************/

        UStringView getPattern ()
        {
                uint len;

                wchar* x = usearch_getPattern (handle, &len);
                return new UStringView (x[0..len]);
        }

        /***********************************************************************

                Set the BreakIterator that will be used to restrict the
                points at which matches are detected.

                Params:
                iterator = the break iterator to use, or null to pass a
                           null break-iterator handle to ICU (the same
                           convention the constructors follow)

        ***********************************************************************/

        void setIterator (UBreakIterator* iterator)
        {
                UErrorCode e;

                this.iterator = iterator;

                // Guard the dereference: a null iterator is a legal
                // argument for the constructors, so accept it here too
                // instead of faulting on iterator.handle.
                usearch_setBreakIterator (handle,
                                          (iterator is null) ? null : cast(Handle) iterator.handle,
                                          e);
                testError (e, "failed to set search iterator");
        }

        /***********************************************************************

                Get the BreakIterator that will be used to restrict the
                points at which matches are detected. Returns whatever
                was last passed to a constructor or to setIterator(),
                which may be null.

        ***********************************************************************/

        UBreakIterator* getIterator ()
        {
                return iterator;
        }

        /***********************************************************************

                Returns the first index at which the string text matches
                the search pattern

        ***********************************************************************/

        uint first ()
        {
                UErrorCode e;

                uint x = usearch_first (handle, e);
                testError (e, "failed on first search");
                return x;
        }

        /***********************************************************************

                Returns the last index in the target text at which it
                matches the search pattern

        ***********************************************************************/

        uint last ()
        {
                UErrorCode e;

                uint x = usearch_last (handle, e);
                testError (e, "failed on last search");
                return x;
        }

        /***********************************************************************

                Returns the index of the next point at which the string
                text matches the search pattern, starting from the current
                position.

                If pos is specified, returns the first index greater than
                pos at which the string text matches the search pattern

                Params:
                pos = position to search after; the default uint.max
                      means "continue from the current position"

        ***********************************************************************/

        uint next (uint pos = uint.max)
        {
                UErrorCode e;
                uint       x;

                if (pos == uint.max)
                    x = usearch_next (handle, e);
                else
                    x = usearch_following (handle, pos, e);

                testError (e, "failed on next search");
                return x;
        }

        /***********************************************************************

                Returns the index of the previous point at which the
                string text matches the search pattern, starting at
                the current position.

                If pos is specified, returns the first index less
                than pos at which the string text matches the search
                pattern.

                Params:
                pos = position to search before; the default uint.max
                      means "continue from the current position"

        ***********************************************************************/

        uint previous (uint pos = uint.max)
        {
                UErrorCode e;
                uint       x;

                if (pos == uint.max)
                    x = usearch_previous (handle, e);
                else
                    x = usearch_preceding (handle, pos, e);

                // Message names this operation (it used to be a copy
                // of the one in next()).
                testError (e, "failed on previous search");
                return x;
        }

        /***********************************************************************

                Search will begin at the start of the text string if a
                forward iteration is initiated before a backwards iteration.
                Otherwise if a backwards iteration is initiated before a
                forwards iteration, the search will begin at the end of the
                text string

        ***********************************************************************/

        void reset ()
        {
                usearch_reset (handle);
        }

        /***********************************************************************

                Gets the collator used for the language rules.

                A new UCollator wrapper is constructed around the handle
                reported by usearch_getCollator on every call.

                NOTE(review): whether that wrapper closes the handle
                when it is destroyed is not visible from this file;
                confirm ownership against UCollator before relying on
                the lifetime of the returned object.

        ***********************************************************************/

        UCollator getCollator ()
        {
                return new UCollator (usearch_getCollator (handle));
        }

        /***********************************************************************

                Sets the collator used for the language rules. This
                method causes internal data such as Boyer-Moore shift
                tables to be recalculated, but the iterator's position
                is unchanged

                Params:
                col = collator supplying the new comparison rules

        ***********************************************************************/

        void setCollator (UCollator col)
        {
                UErrorCode e;

                usearch_setCollator (handle, col.handle, e);
                testError (e, "failed to set search collator");
        }


        /***********************************************************************

                Bind the ICU functions from a shared library. This is
                complicated by the issues regarding D and DLLs on the
                Windows platform

        ***********************************************************************/

        // Token returned by FunctionLoader.bind; handed back to
        // FunctionLoader.unbind at module shutdown.
        private static void* library;

        /***********************************************************************

                Function pointers for the ICU usearch_* entry points.
                They are filled in by the static constructor through
                the targets table below.

        ***********************************************************************/

        private static extern (C)
        {
                Handle function (wchar*, uint, wchar*, uint, char*, void*, inout UErrorCode) usearch_open;
                Handle function (wchar*, uint, wchar*, uint, Handle, void*, inout UErrorCode) usearch_openFromCollator;
                void   function (Handle) usearch_close;
                void   function (Handle, uint, inout UErrorCode) usearch_setOffset;
                uint   function (Handle) usearch_getOffset;
                uint   function (Handle) usearch_getMatchedStart;
                uint   function (Handle) usearch_getMatchedLength;
                uint   function (Handle, wchar*, uint, inout UErrorCode) usearch_getMatchedText;
                void   function (Handle, wchar*, uint, inout UErrorCode) usearch_setText;
                wchar* function (Handle, uint*) usearch_getText;
                void   function (Handle, wchar*, uint, inout UErrorCode) usearch_setPattern;
                wchar* function (Handle, uint*) usearch_getPattern;
                uint   function (Handle, inout UErrorCode) usearch_first;
                uint   function (Handle, inout UErrorCode) usearch_last;
                uint   function (Handle, inout UErrorCode) usearch_next;
                uint   function (Handle, inout UErrorCode) usearch_previous;
                uint   function (Handle, uint, inout UErrorCode) usearch_following;
                uint   function (Handle, uint, inout UErrorCode) usearch_preceding;
                void   function (Handle) usearch_reset;
                void   function (Handle, Handle, inout UErrorCode) usearch_setBreakIterator;
                Handle function (Handle) usearch_getCollator;
                void   function (Handle, Handle, inout UErrorCode) usearch_setCollator;
        }

        /***********************************************************************

                Table pairing each function pointer above with the name
                of the symbol to resolve from the shared library.

        ***********************************************************************/

        static FunctionLoader.Bind[] targets =
                [
                {cast(void**) &usearch_open,             "usearch_open"},
                {cast(void**) &usearch_openFromCollator, "usearch_openFromCollator"},
                {cast(void**) &usearch_close,            "usearch_close"},
                {cast(void**) &usearch_setOffset,        "usearch_setOffset"},
                {cast(void**) &usearch_getOffset,        "usearch_getOffset"},
                {cast(void**) &usearch_getMatchedStart,  "usearch_getMatchedStart"},
                {cast(void**) &usearch_getMatchedLength, "usearch_getMatchedLength"},
                {cast(void**) &usearch_getMatchedText,   "usearch_getMatchedText"},
                {cast(void**) &usearch_setText,          "usearch_setText"},
                {cast(void**) &usearch_getText,          "usearch_getText"},
                {cast(void**) &usearch_setPattern,       "usearch_setPattern"},
                {cast(void**) &usearch_getPattern,       "usearch_getPattern"},
                {cast(void**) &usearch_first,            "usearch_first"},
                {cast(void**) &usearch_last,             "usearch_last"},
                {cast(void**) &usearch_next,             "usearch_next"},
                {cast(void**) &usearch_previous,         "usearch_previous"},
                {cast(void**) &usearch_following,        "usearch_following"},
                {cast(void**) &usearch_preceding,        "usearch_preceding"},
                {cast(void**) &usearch_reset,            "usearch_reset"},
                {cast(void**) &usearch_setBreakIterator, "usearch_setBreakIterator"},
                {cast(void**) &usearch_getCollator,      "usearch_getCollator"},
                {cast(void**) &usearch_setCollator,      "usearch_setCollator"},
                ];

        /***********************************************************************

                Resolve every entry of targets from the library named
                by icuin and remember the returned library token.

        ***********************************************************************/

        static this ()
        {
                library = FunctionLoader.bind (icuin, targets);
        }

        /***********************************************************************

                Release the library token obtained by the static
                constructor.

        ***********************************************************************/

        static ~this ()
        {
                FunctionLoader.unbind (library);
        }
}