1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/unicode/usearch.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,836 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2001-2011 IBM and others. All rights reserved. 1.7 +********************************************************************** 1.8 +* Date Name Description 1.9 +* 06/28/2001 synwee Creation. 1.10 +********************************************************************** 1.11 +*/ 1.12 +#ifndef USEARCH_H 1.13 +#define USEARCH_H 1.14 + 1.15 +#include "unicode/utypes.h" 1.16 + 1.17 +#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 1.18 + 1.19 +#include "unicode/localpointer.h" 1.20 +#include "unicode/ucol.h" 1.21 +#include "unicode/ucoleitr.h" 1.22 +#include "unicode/ubrk.h" 1.23 + 1.24 +/** 1.25 + * \file 1.26 + * \brief C API: StringSearch 1.27 + * 1.28 + * C Apis for an engine that provides language-sensitive text searching based 1.29 + * on the comparison rules defined in a <tt>UCollator</tt> data struct, 1.30 + * see <tt>ucol.h</tt>. This ensures that language eccentricity can be 1.31 + * handled, e.g. for the German collator, characters ß and SS will be matched 1.32 + * if case is chosen to be ignored. 1.33 + * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> 1.34 + * "ICU Collation Design Document"</a> for more information. 1.35 + * <p> 1.36 + * The algorithm implemented is a modified form of the Boyer Moore's search. 1.37 + * For more information see 1.38 + * <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.html"> 1.39 + * "Efficient Text Searching in Java"</a>, published in <i>Java Report</i> 1.40 + * in February, 1999, for further information on the algorithm. 
1.41 + * <p> 1.42 + * There are 2 match options for selection:<br> 1.43 + * Let S' be the sub-string of a text string S between the offsets start and 1.44 + * end <start, end>. 1.45 + * <br> 1.46 + * A pattern string P matches a text string S at the offsets <start, end> 1.47 + * if 1.48 + * <pre> 1.49 + * option 1. Some canonical equivalent of P matches some canonical equivalent 1.50 + * of S' 1.51 + * option 2. P matches S' and if P starts or ends with a combining mark, 1.52 + * there exists no non-ignorable combining mark before or after S' 1.53 + * in S respectively. 1.54 + * </pre> 1.55 + * Option 2. will be the default. 1.56 + * <p> 1.57 + * This search has APIs similar to those of other text iteration mechanisms 1.58 + * such as the break iterators in <tt>ubrk.h</tt>. Using these 1.59 + * APIs, it is easy to scan through text looking for all occurrences of 1.60 + * a given pattern. This search iterator allows changing of direction by 1.61 + * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>. 1.62 + * Though a direction change can occur without calling <tt>reset</tt> first, 1.63 + * this operation comes with some speed penalty. 1.64 + * Generally, match results in the forward direction will match the result 1.65 + * matches in the backwards direction in the reverse order. 1.66 + * <p> 1.67 + * <tt>usearch.h</tt> provides APIs to specify the starting position 1.68 + * within the text string to be searched, e.g. <tt>usearch_setOffset</tt>, 1.69 + * <tt>usearch_preceding</tt> and <tt>usearch_following</tt>. Since the 1.70 + * starting position will be set as it is specified, please take note that 1.71 + * there are some dangerous positions at which the search may render incorrect 1.72 + * results: 1.73 + * <ul> 1.74 + * <li> The midst of a substring that requires normalization.
1.75 + * <li> If the following match is to be found, the position should not be the 1.76 + * second character which requires to be swapped with the preceding 1.77 + * character. Vice versa, if the preceding match is to be found, 1.78 + * position to search from should not be the first character which 1.79 + * requires to be swapped with the next character. E.g. certain Thai and 1.80 + * Lao characters require swapping. 1.81 + * <li> If a following pattern match is to be found, any position within a 1.82 + * contracting sequence except the first will fail. Vice versa if a 1.83 + * preceding pattern match is to be found, an invalid starting point 1.84 + * would be any character within a contracting sequence except the last. 1.85 + * </ul> 1.86 + * <p> 1.87 + * A breakiterator can be used if only matches at logical breaks are desired. 1.88 + * Using a breakiterator will only give you results that exactly match the 1.89 + * boundaries given by the breakiterator. For instance the pattern "e" will 1.90 + * not be found in the string "\u00e9" if a character break iterator is used. 1.91 + * <p> 1.92 + * Options are provided to handle overlapping matches. 1.93 + * E.g. In English, overlapping matches produce the results 0 and 2 1.94 + * for the pattern "abab" in the text "ababab", whereas mutually 1.95 + * exclusive matches only produce the result of 0. 1.96 + * <p> 1.97 + * Though collator attributes will be taken into consideration while 1.98 + * performing matches, there are no APIs here for setting and getting the 1.99 + * attributes. These attributes can be set by getting the collator 1.100 + * from <tt>usearch_getCollator</tt> and using the APIs in <tt>ucol.h</tt>. 1.101 + * Lastly, to update String Search to the new collator attributes, 1.102 + * usearch_reset() has to be called.
1.103 + * <p> 1.104 + * Restriction: <br> 1.105 + * Currently there are no composite characters that consist of a 1.106 + * character with combining class > 0 before a character with combining 1.107 + * class == 0. However, if such a character exists in the future, the 1.108 + * search mechanism does not guarantee the results for option 1. 1.109 + * 1.110 + * <p> 1.111 + * Example of use:<br> 1.112 + * <pre><code> 1.113 + * char *tgtstr = "The quick brown fox jumped over the lazy fox"; 1.114 + * char *patstr = "fox"; 1.115 + * UChar target[64]; 1.116 + * UChar pattern[16]; 1.117 + * UErrorCode status = U_ZERO_ERROR; 1.118 + * u_uastrcpy(target, tgtstr); 1.119 + * u_uastrcpy(pattern, patstr); 1.120 + * 1.121 + * UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US", 1.122 + * NULL, &status); 1.123 + * if (U_SUCCESS(status)) { 1.124 + * for (int pos = usearch_first(search, &status); 1.125 + * pos != USEARCH_DONE; 1.126 + * pos = usearch_next(search, &status)) 1.127 + * { 1.128 + * printf("Found match at %d pos, length is %d\n", pos, 1.129 + * usearch_getMatchedLength(search)); 1.130 + * } 1.131 + * } 1.132 + * 1.133 + * usearch_close(search); 1.134 + * </code></pre> 1.135 + * @stable ICU 2.4 1.136 + */ 1.137 + 1.138 +/** 1.139 +* DONE is returned by previous() and next() after all valid matches have 1.140 +* been returned, and by first() and last() if there are no matches at all. 1.141 +* @stable ICU 2.4 1.142 +*/ 1.143 +#define USEARCH_DONE -1 1.144 + 1.145 +/** 1.146 +* Data structure for searching 1.147 +* @stable ICU 2.4 1.148 +*/ 1.149 +struct UStringSearch; 1.150 +/** 1.151 +* Data structure for searching 1.152 +* @stable ICU 2.4 1.153 +*/ 1.154 +typedef struct UStringSearch UStringSearch; 1.155 + 1.156 +/** 1.157 +* @stable ICU 2.4 1.158 +*/ 1.159 +typedef enum { 1.160 + /** Option for overlapping matches */ 1.161 + USEARCH_OVERLAP, 1.162 + /** 1.163 + * Option for canonical matches. option 1 in header documentation.
1.164 + * The default value will be USEARCH_OFF 1.165 + */ 1.166 + USEARCH_CANONICAL_MATCH, 1.167 + /** 1.168 + * Option to control how collation elements are compared. 1.169 + * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON. 1.170 + * @stable ICU 4.4 1.171 + */ 1.172 + USEARCH_ELEMENT_COMPARISON, 1.173 + 1.174 + USEARCH_ATTRIBUTE_COUNT 1.175 +} USearchAttribute; 1.176 + 1.177 +/** 1.178 +* @stable ICU 2.4 1.179 +*/ 1.180 +typedef enum { 1.181 + /** Default value for any USearchAttribute */ 1.182 + USEARCH_DEFAULT = -1, 1.183 + /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ 1.184 + USEARCH_OFF, 1.185 + /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ 1.186 + USEARCH_ON, 1.187 + /** 1.188 + * Value (default) for USEARCH_ELEMENT_COMPARISON; 1.189 + * standard collation element comparison at the specified collator 1.190 + * strength. 1.191 + * @stable ICU 4.4 1.192 + */ 1.193 + USEARCH_STANDARD_ELEMENT_COMPARISON, 1.194 + /** 1.195 + * Value for USEARCH_ELEMENT_COMPARISON; 1.196 + * collation element comparison is modified to effectively provide 1.197 + * behavior between the specified strength and strength - 1. Collation 1.198 + * elements in the pattern that have the base weight for the specified 1.199 + * strength are treated as "wildcards" that match an element with any 1.200 + * other weight at that collation level in the searched text. For 1.201 + * example, with a secondary-strength English collator, a plain 'e' in 1.202 + * the pattern will match a plain e or an e with any diacritic in the 1.203 + * searched text, but an e with diacritic in the pattern will only 1.204 + * match an e with the same diacritic in the searched text. 1.205 + * @stable ICU 4.4 1.206 + */ 1.207 + USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, 1.208 + /** 1.209 + * Value for USEARCH_ELEMENT_COMPARISON. 1.210 + * collation element comparison is modified to effectively provide 1.211 + * behavior between the specified strength and strength - 1. 
Collation 1.212 + * elements in either the pattern or the searched text that have the 1.213 + * base weight for the specified strength are treated as "wildcards" 1.214 + * that match an element with any other weight at that collation level. 1.215 + * For example, with a secondary-strength English collator, a plain 'e' 1.216 + * in the pattern will match a plain e or an e with any diacritic in the 1.217 + * searched text, but an e with diacritic in the pattern will only 1.218 + * match an e with the same diacritic or a plain e in the searched text. 1.219 + * @stable ICU 4.4 1.220 + */ 1.221 + USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, 1.222 + 1.223 + USEARCH_ATTRIBUTE_VALUE_COUNT 1.224 +} USearchAttributeValue; 1.225 + 1.226 +/* open and close ------------------------------------------------------ */ 1.227 + 1.228 +/** 1.229 +* Creating a search iterator data struct using the argument locale language 1.230 +* rule set. A collator will be created in the process, which will be owned by 1.231 +* this search and will be deleted in <tt>usearch_close</tt>. 1.232 +* @param pattern for matching 1.233 +* @param patternlength length of the pattern, -1 for null-termination 1.234 +* @param text text string 1.235 +* @param textlength length of the text string, -1 for null-termination 1.236 +* @param locale name of locale for the rules to be used 1.237 +* @param breakiter A BreakIterator that will be used to restrict the points 1.238 +* at which matches are detected. If a match is found, but 1.239 +* the match's start or end index is not a boundary as 1.240 +* determined by the <tt>BreakIterator</tt>, the match will 1.241 +* be rejected and another will be searched for. 1.242 +* If this parameter is <tt>NULL</tt>, no break detection is 1.243 +* attempted. 1.244 +* @param status for errors if it occurs. If pattern or text is NULL, or if 1.245 +* patternlength or textlength is 0 then an 1.246 +* U_ILLEGAL_ARGUMENT_ERROR is returned. 
1.247 +* @return search iterator data structure, or NULL if there is an error. 1.248 +* @stable ICU 2.4 1.249 +*/ 1.250 +U_STABLE UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 1.251 + int32_t patternlength, 1.252 + const UChar *text, 1.253 + int32_t textlength, 1.254 + const char *locale, 1.255 + UBreakIterator *breakiter, 1.256 + UErrorCode *status); 1.257 + 1.258 +/** 1.259 +* Creating a search iterator data struct using the argument collator language 1.260 +* rule set. Note, user retains the ownership of this collator, thus the 1.261 +* responsibility of deletion lies with the user. 1.262 +* NOTE: string search cannot be instantiated from a collator that has 1.263 +* collate digits as numbers (CODAN) turned on. 1.264 +* @param pattern for matching 1.265 +* @param patternlength length of the pattern, -1 for null-termination 1.266 +* @param text text string 1.267 +* @param textlength length of the text string, -1 for null-termination 1.268 +* @param collator used for the language rules 1.269 +* @param breakiter A BreakIterator that will be used to restrict the points 1.270 +* at which matches are detected. If a match is found, but 1.271 +* the match's start or end index is not a boundary as 1.272 +* determined by the <tt>BreakIterator</tt>, the match will 1.273 +* be rejected and another will be searched for. 1.274 +* If this parameter is <tt>NULL</tt>, no break detection is 1.275 +* attempted. 1.276 +* @param status for errors if it occurs. If collator, pattern or text is NULL, 1.277 +* or if patternlength or textlength is 0 then an 1.278 +* U_ILLEGAL_ARGUMENT_ERROR is returned. 1.279 +* @return search iterator data structure, or NULL if there is an error. 
1.280 +* @stable ICU 2.4 1.281 +*/ 1.282 +U_STABLE UStringSearch * U_EXPORT2 usearch_openFromCollator( 1.283 + const UChar *pattern, 1.284 + int32_t patternlength, 1.285 + const UChar *text, 1.286 + int32_t textlength, 1.287 + const UCollator *collator, 1.288 + UBreakIterator *breakiter, 1.289 + UErrorCode *status); 1.290 + 1.291 +/** 1.292 +* Destroying and cleaning up the search iterator data struct. 1.293 +* If a collator is created in <tt>usearch_open</tt>, it will be destroyed here. 1.294 +* @param searchiter data struct to clean up 1.295 +* @stable ICU 2.4 1.296 +*/ 1.297 +U_STABLE void U_EXPORT2 usearch_close(UStringSearch *searchiter); 1.298 + 1.299 +#if U_SHOW_CPLUSPLUS_API 1.300 + 1.301 +U_NAMESPACE_BEGIN 1.302 + 1.303 +/** 1.304 + * \class LocalUStringSearchPointer 1.305 + * "Smart pointer" class, closes a UStringSearch via usearch_close(). 1.306 + * For most methods see the LocalPointerBase base class. 1.307 + * 1.308 + * @see LocalPointerBase 1.309 + * @see LocalPointer 1.310 + * @stable ICU 4.4 1.311 + */ 1.312 +U_DEFINE_LOCAL_OPEN_POINTER(LocalUStringSearchPointer, UStringSearch, usearch_close); 1.313 + 1.314 +U_NAMESPACE_END 1.315 + 1.316 +#endif 1.317 + 1.318 +/* get and set methods -------------------------------------------------- */ 1.319 + 1.320 +/** 1.321 +* Sets the current position in the text string which the next search will 1.322 +* start from. Clears previous states. 1.323 +* This method takes the argument index and sets the position in the text 1.324 +* string accordingly without checking if the index is pointing to a 1.325 +* valid starting point to begin searching. 1.326 +* Search positions that may render incorrect results are highlighted in the 1.327 +* header comments 1.328 +* @param strsrch search iterator data struct 1.329 +* @param position position to start next search from. 
If position is less 1.330 +* than or greater than the text range for searching, 1.331 +* an U_INDEX_OUTOFBOUNDS_ERROR will be returned 1.332 +* @param status error status if any. 1.333 +* @stable ICU 2.4 1.334 +*/ 1.335 +U_STABLE void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 1.336 + int32_t position, 1.337 + UErrorCode *status); 1.338 + 1.339 +/** 1.340 +* Return the current index in the string text being searched. 1.341 +* If the iteration has gone past the end of the text (or past the beginning 1.342 +* for a backwards search), <tt>USEARCH_DONE</tt> is returned. 1.343 +* @param strsrch search iterator data struct 1.344 +* @see #USEARCH_DONE 1.345 +* @stable ICU 2.4 1.346 +*/ 1.347 +U_STABLE int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch); 1.348 + 1.349 +/** 1.350 +* Sets the text searching attributes located in the enum USearchAttribute 1.351 +* with values from the enum USearchAttributeValue. 1.352 +* <tt>USEARCH_DEFAULT</tt> can be used for all attributes for resetting. 1.353 +* @param strsrch search iterator data struct 1.354 +* @param attribute text attribute to be set 1.355 +* @param value text attribute value 1.356 +* @param status for errors if it occurs 1.357 +* @see #usearch_getAttribute 1.358 +* @stable ICU 2.4 1.359 +*/ 1.360 +U_STABLE void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 1.361 + USearchAttribute attribute, 1.362 + USearchAttributeValue value, 1.363 + UErrorCode *status); 1.364 + 1.365 +/** 1.366 +* Gets the text searching attributes. 1.367 +* @param strsrch search iterator data struct 1.368 +* @param attribute text attribute to be retrieve 1.369 +* @return text attribute value 1.370 +* @see #usearch_setAttribute 1.371 +* @stable ICU 2.4 1.372 +*/ 1.373 +U_STABLE USearchAttributeValue U_EXPORT2 usearch_getAttribute( 1.374 + const UStringSearch *strsrch, 1.375 + USearchAttribute attribute); 1.376 + 1.377 +/** 1.378 +* Returns the index to the match in the text string that was searched. 
1.379 +* This call returns a valid result only after a successful call to 1.380 +* <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, 1.381 +* or <tt>usearch_last</tt>. 1.382 +* Just after construction, or after a searching method returns 1.383 +* <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. 1.384 +* <p> 1.385 +* Use <tt>usearch_getMatchedLength</tt> to get the matched string length. 1.386 +* @param strsrch search iterator data struct 1.387 +* @return index to a substring within the text string that is being 1.388 +* searched. 1.389 +* @see #usearch_first 1.390 +* @see #usearch_next 1.391 +* @see #usearch_previous 1.392 +* @see #usearch_last 1.393 +* @see #USEARCH_DONE 1.394 +* @stable ICU 2.4 1.395 +*/ 1.396 +U_STABLE int32_t U_EXPORT2 usearch_getMatchedStart( 1.397 + const UStringSearch *strsrch); 1.398 + 1.399 +/** 1.400 +* Returns the length of text in the string which matches the search pattern. 1.401 +* This call returns a valid result only after a successful call to 1.402 +* <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, 1.403 +* or <tt>usearch_last</tt>. 1.404 +* Just after construction, or after a searching method returns 1.405 +* <tt>USEARCH_DONE</tt>, this method will return 0. 1.406 +* @param strsrch search iterator data struct 1.407 +* @return The length of the match in the string text, or 0 if there is no 1.408 +* match currently. 1.409 +* @see #usearch_first 1.410 +* @see #usearch_next 1.411 +* @see #usearch_previous 1.412 +* @see #usearch_last 1.413 +* @see #USEARCH_DONE 1.414 +* @stable ICU 2.4 1.415 +*/ 1.416 +U_STABLE int32_t U_EXPORT2 usearch_getMatchedLength( 1.417 + const UStringSearch *strsrch); 1.418 + 1.419 +/** 1.420 +* Returns the text that was matched by the most recent call to 1.421 +* <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, 1.422 +* or <tt>usearch_last</tt>. 1.423 +* If the iterator is not pointing at a valid match (e.g. 
just after 1.424 +* construction or after <tt>USEARCH_DONE</tt> has been returned), returns 1.425 +* an empty string. If result is not large enough to store the matched text, 1.426 +* result will be filled with the partial text and a U_BUFFER_OVERFLOW_ERROR 1.427 +* will be returned in status. result will be null-terminated whenever 1.428 +* possible. If the buffer fits the matched text exactly, null-termination 1.429 +* is not possible and a U_STRING_NOT_TERMINATED_WARNING is set in status. 1.430 +* Pre-flighting can be either done with length = 0 or the API 1.431 +* <tt>usearch_getMatchedLength</tt>. 1.432 +* @param strsrch search iterator data struct 1.433 +* @param result UChar buffer to store the matched string 1.434 +* @param resultCapacity length of the result buffer 1.435 +* @param status error returned if result is not large enough 1.436 +* @return exact length of the matched text, not counting the null-termination 1.437 +* @see #usearch_first 1.438 +* @see #usearch_next 1.439 +* @see #usearch_previous 1.440 +* @see #usearch_last 1.441 +* @see #USEARCH_DONE 1.442 +* @stable ICU 2.4 1.443 +*/ 1.444 +U_STABLE int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 1.445 + UChar *result, 1.446 + int32_t resultCapacity, 1.447 + UErrorCode *status); 1.448 + 1.449 +#if !UCONFIG_NO_BREAK_ITERATION 1.450 + 1.451 +/** 1.452 +* Set the BreakIterator that will be used to restrict the points at which 1.453 +* matches are detected. 1.454 +* @param strsrch search iterator data struct 1.455 +* @param breakiter A BreakIterator that will be used to restrict the points 1.456 +* at which matches are detected. If a match is found, but 1.457 +* the match's start or end index is not a boundary as 1.458 +* determined by the <tt>BreakIterator</tt>, the match will 1.459 +* be rejected and another will be searched for. 1.460 +* If this parameter is <tt>NULL</tt>, no break detection is 1.461 +* attempted.
1.462 +* @param status for errors if it occurs 1.463 +* @see #usearch_getBreakIterator 1.464 +* @stable ICU 2.4 1.465 +*/ 1.466 +U_STABLE void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 1.467 + UBreakIterator *breakiter, 1.468 + UErrorCode *status); 1.469 + 1.470 +/** 1.471 +* Returns the BreakIterator that is used to restrict the points at which 1.472 +* matches are detected. This will be the same object that was passed to the 1.473 +* constructor or to <tt>usearch_setBreakIterator</tt>. Note that 1.474 +* <tt>NULL</tt> 1.475 +* is a legal value; it means that break detection should not be attempted. 1.476 +* @param strsrch search iterator data struct 1.477 +* @return break iterator used 1.478 +* @see #usearch_setBreakIterator 1.479 +* @stable ICU 2.4 1.480 +*/ 1.481 +U_STABLE const UBreakIterator * U_EXPORT2 usearch_getBreakIterator( 1.482 + const UStringSearch *strsrch); 1.483 + 1.484 +#endif 1.485 + 1.486 +/** 1.487 +* Set the string text to be searched. Text iteration will hence begin at the 1.488 +* start of the text string. This method is useful if you want to re-use an 1.489 +* iterator to search for the same pattern within a different body of text. 1.490 +* @param strsrch search iterator data struct 1.491 +* @param text new string to look for match 1.492 +* @param textlength length of the new string, -1 for null-termination 1.493 +* @param status for errors if it occurs. If text is NULL, or textlength is 0 1.494 +* then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change 1.495 +* done to strsrch. 1.496 +* @see #usearch_getText 1.497 +* @stable ICU 2.4 1.498 +*/ 1.499 +U_STABLE void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 1.500 + const UChar *text, 1.501 + int32_t textlength, 1.502 + UErrorCode *status); 1.503 + 1.504 +/** 1.505 +* Return the string text to be searched. 
1.506 +* @param strsrch search iterator data struct 1.507 +* @param length returned string text length 1.508 +* @return string text 1.509 +* @see #usearch_setText 1.510 +* @stable ICU 2.4 1.511 +*/ 1.512 +U_STABLE const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 1.513 + int32_t *length); 1.514 + 1.515 +/** 1.516 +* Gets the collator used for the language rules. 1.517 +* <p> 1.518 +* Deleting the returned <tt>UCollator</tt> before calling 1.519 +* <tt>usearch_close</tt> would cause the string search to fail. 1.520 +* <tt>usearch_close</tt> will delete the collator if this search owns it. 1.521 +* @param strsrch search iterator data struct 1.522 +* @return collator 1.523 +* @stable ICU 2.4 1.524 +*/ 1.525 +U_STABLE UCollator * U_EXPORT2 usearch_getCollator( 1.526 + const UStringSearch *strsrch); 1.527 + 1.528 +/** 1.529 +* Sets the collator used for the language rules. User retains the ownership 1.530 +* of this collator, thus the responsibility of deletion lies with the user. 1.531 +* This method causes internal data such as Boyer-Moore shift tables to 1.532 +* be recalculated, but the iterator's position is unchanged. 1.533 +* @param strsrch search iterator data struct 1.534 +* @param collator to be used 1.535 +* @param status for errors if it occurs 1.536 +* @stable ICU 2.4 1.537 +*/ 1.538 +U_STABLE void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 1.539 + const UCollator *collator, 1.540 + UErrorCode *status); 1.541 + 1.542 +/** 1.543 +* Sets the pattern used for matching. 1.544 +* Internal data like the Boyer-Moore table will be recalculated, but the 1.545 +* iterator's position is unchanged. 1.546 +* @param strsrch search iterator data struct 1.547 +* @param pattern string 1.548 +* @param patternlength pattern length, -1 for null-terminated string 1.549 +* @param status for errors if it occurs. If pattern is NULL, or patternlength is 0 1.550 +* then a U_ILLEGAL_ARGUMENT_ERROR is returned with no change 1.551 +* done to strsrch.
1.552 +* @stable ICU 2.4 1.553 +*/ 1.554 +U_STABLE void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 1.555 + const UChar *pattern, 1.556 + int32_t patternlength, 1.557 + UErrorCode *status); 1.558 + 1.559 +/** 1.560 +* Gets the search pattern 1.561 +* @param strsrch search iterator data struct 1.562 +* @param length return length of the pattern, -1 indicates that the pattern 1.563 +* is null-terminated 1.564 +* @return pattern string 1.565 +* @stable ICU 2.4 1.566 +*/ 1.567 +U_STABLE const UChar * U_EXPORT2 usearch_getPattern( 1.568 + const UStringSearch *strsrch, 1.569 + int32_t *length); 1.570 + 1.571 +/* methods ------------------------------------------------------------- */ 1.572 + 1.573 +/** 1.574 +* Returns the first index at which the string text matches the search 1.575 +* pattern. 1.576 +* The iterator is adjusted so that its current index (as returned by 1.577 +* <tt>usearch_getOffset</tt>) is the match position if one was found. 1.578 +* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 1.579 +* the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 1.580 +* @param strsrch search iterator data struct 1.581 +* @param status for errors if it occurs 1.582 +* @return The character index of the first match, or 1.583 +* <tt>USEARCH_DONE</tt> if there are no matches. 1.584 +* @see #usearch_getOffset 1.585 +* @see #USEARCH_DONE 1.586 +* @stable ICU 2.4 1.587 +*/ 1.588 +U_STABLE int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 1.589 + UErrorCode *status); 1.590 + 1.591 +/** 1.592 +* Returns the first index equal or greater than <tt>position</tt> at which 1.593 +* the string text 1.594 +* matches the search pattern. The iterator is adjusted so that its current 1.595 +* index (as returned by <tt>usearch_getOffset</tt>) is the match position if 1.596 +* one was found. 
1.597 +* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 1.598 +* the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 1.599 +* <p> 1.600 +* Search positions that may render incorrect results are highlighted in the 1.601 +* header comments. If position lies outside the text range 1.602 +* for searching, a U_INDEX_OUTOFBOUNDS_ERROR will be returned. 1.603 +* @param strsrch search iterator data struct 1.604 +* @param position to start the search at 1.605 +* @param status for errors if it occurs 1.606 +* @return The character index of the first match following <tt>position</tt>, 1.607 +* or <tt>USEARCH_DONE</tt> if there are no matches. 1.608 +* @see #usearch_getOffset 1.609 +* @see #USEARCH_DONE 1.610 +* @stable ICU 2.4 1.611 +*/ 1.612 +U_STABLE int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 1.613 + int32_t position, 1.614 + UErrorCode *status); 1.615 + 1.616 +/** 1.617 +* Returns the last index in the target text at which it matches the search 1.618 +* pattern. The iterator is adjusted so that its current 1.619 +* index (as returned by <tt>usearch_getOffset</tt>) is the match position if 1.620 +* one was found. 1.621 +* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 1.622 +* the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 1.623 +* @param strsrch search iterator data struct 1.624 +* @param status for errors if it occurs 1.625 +* @return The index of the last match, or <tt>USEARCH_DONE</tt> if there 1.626 +* are no matches. 1.627 +* @see #usearch_getOffset 1.628 +* @see #USEARCH_DONE 1.629 +* @stable ICU 2.4 1.630 +*/ 1.631 +U_STABLE int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 1.632 + UErrorCode *status); 1.633 + 1.634 +/** 1.635 +* Returns the first index less than <tt>position</tt> at which the string text 1.636 +* matches the search pattern.
The iterator is adjusted so that its current 1.637 +* index (as returned by <tt>usearch_getOffset</tt>) is the match position if 1.638 +* one was found. 1.639 +* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 1.640 +* the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 1.641 +* <p> 1.642 +* Search positions that may render incorrect results are highlighted in the 1.643 +* header comments. If position lies outside the text range 1.644 +* for searching, a U_INDEX_OUTOFBOUNDS_ERROR will be returned. 1.645 +* <p> 1.646 +* When the <tt>USEARCH_OVERLAP</tt> option is off, the last index of the 1.647 +* result match is always less than <tt>position</tt>. 1.648 +* When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across 1.649 +* <tt>position</tt>. 1.650 +* @param strsrch search iterator data struct 1.651 +* @param position index position the search is to begin at 1.652 +* @param status for errors if it occurs 1.653 +* @return The character index of the first match preceding <tt>position</tt>, 1.654 +* or <tt>USEARCH_DONE</tt> if there are no matches. 1.655 +* @see #usearch_getOffset 1.656 +* @see #USEARCH_DONE 1.657 +* @stable ICU 2.4 1.658 +*/ 1.659 +U_STABLE int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 1.660 + int32_t position, 1.661 + UErrorCode *status); 1.662 + 1.663 +/** 1.664 +* Returns the index of the next point at which the string text matches the 1.665 +* search pattern, starting from the current position. 1.666 +* The iterator is adjusted so that its current 1.667 +* index (as returned by <tt>usearch_getOffset</tt>) is the match position if 1.668 +* one was found.
1.669 +* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 1.670 +* the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> 1.671 +* @param strsrch search iterator data struct 1.672 +* @param status for errors if it occurs 1.673 +* @return The index of the next match after the current position, or 1.674 +* <tt>USEARCH_DONE</tt> if there are no more matches. 1.675 +* @see #usearch_first 1.676 +* @see #usearch_getOffset 1.677 +* @see #USEARCH_DONE 1.678 +* @stable ICU 2.4 1.679 +*/ 1.680 +U_STABLE int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 1.681 + UErrorCode *status); 1.682 + 1.683 +/** 1.684 +* Returns the index of the previous point at which the string text matches 1.685 +* the search pattern, starting at the current position. 1.686 +* The iterator is adjusted so that its current 1.687 +* index (as returned by <tt>usearch_getOffset</tt>) is the match position if 1.688 +* one was found. 1.689 +* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 1.690 +* the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> 1.691 +* @param strsrch search iterator data struct 1.692 +* @param status for errors if it occurs 1.693 +* @return The index of the previous match before the current position, 1.694 +* or <tt>USEARCH_DONE</tt> if there are no more matches. 1.695 +* @see #usearch_last 1.696 +* @see #usearch_getOffset 1.697 +* @see #USEARCH_DONE 1.698 +* @stable ICU 2.4 1.699 +*/ 1.700 +U_STABLE int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 1.701 + UErrorCode *status); 1.702 + 1.703 +/** 1.704 +* Reset the iteration. 1.705 +* Search will begin at the start of the text string if a forward iteration 1.706 +* is initiated before a backwards iteration. Otherwise if a backwards 1.707 +* iteration is initiated before a forwards iteration, the search will begin 1.708 +* at the end of the text string. 
1.709 +* @param strsrch search iterator data struct 1.710 +* @see #usearch_first 1.711 +* @stable ICU 2.4 1.712 +*/ 1.713 +U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch); 1.714 + 1.715 +#ifndef U_HIDE_INTERNAL_API 1.716 +/** 1.717 + * Simple forward search for the pattern, starting at a specified index, 1.718 + * and using a default set of search options. 1.719 + * 1.720 + * This is an experimental function, and is not an official part of the 1.721 + * ICU API. 1.722 + * 1.723 + * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored. 1.724 + * 1.725 + * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and 1.726 + * any Break Iterator are ignored. 1.727 + * 1.728 + * Matches obey the following constraints: 1.729 + * 1.730 + * Characters at the start or end positions of a match that are ignorable 1.731 + * for collation are not included as part of the match, unless they 1.732 + * are part of a combining sequence, as described below. 1.733 + * 1.734 + * A match will not include a partial combining sequence. Combining 1.735 + * character sequences are considered to be inseparable units, 1.736 + * and either match the pattern completely, or are considered to not match 1.737 + * at all. Thus, for example, an A followed by a combining accent mark will 1.738 + * not be found when searching for a plain (unaccented) A (unless 1.739 + * the collation strength has been set to ignore all accents). 1.740 + * 1.741 + * When beginning a search, the initial starting position, startIdx, 1.742 + * is assumed to be an acceptable match boundary with respect to 1.743 + * combining characters. A combining sequence that spans across the 1.744 + * starting point will not suppress a match beginning at startIdx. 1.745 + * 1.746 + * Characters that expand to multiple collation elements 1.747 + * (German sharp-S becoming 'ss', or the composed forms of accented 1.748 + * characters, for example) also must match completely.
1.749 + * Searching for a single 's' in a string containing only a sharp-s will 1.750 + * find no match. 1.751 + * 1.752 + * 1.753 + * @param strsrch the UStringSearch struct, which references both 1.754 + * the text to be searched and the pattern being sought. 1.755 + * @param startIdx The index into the text to begin the search. 1.756 + * @param matchStart An out parameter, the starting index of the matched text. 1.757 + * This parameter may be NULL. 1.758 + * A value of -1 will be returned if no match was found. 1.759 + * @param matchLimit An out parameter, the index of the first position following the matched text. 1.760 + * The matchLimit will be at a suitable position for beginning a subsequent search 1.761 + * in the input text. 1.762 + * This parameter may be NULL. 1.763 + * A value of -1 will be returned if no match was found. 1.764 + * 1.765 + * @param status Report any errors. Note that no match found is not an error. 1.766 + * @return TRUE if a match was found, FALSE otherwise. 1.767 + * 1.768 + * @internal 1.769 + */ 1.770 +U_INTERNAL UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 1.771 + int32_t startIdx, 1.772 + int32_t *matchStart, 1.773 + int32_t *matchLimit, 1.774 + UErrorCode *status); 1.775 + 1.776 +/** 1.777 + * Simple backwards search for the pattern, starting at a specified index, 1.778 + * and using a default set of search options. 1.779 + * 1.780 + * This is an experimental function, and is not an official part of the 1.781 + * ICU API. 1.782 + * 1.783 + * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored. 1.784 + * 1.785 + * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and 1.786 + * any Break Iterator are ignored.
1.787 + * 1.788 + * Matches obey the following constraints: 1.789 + * 1.790 + * Characters at the start or end positions of a match that are ignorable 1.791 + * for collation are not included as part of the match, unless they 1.792 + * are part of a combining sequence, as described below. 1.793 + * 1.794 + * A match will not include a partial combining sequence. Combining 1.795 + * character sequences are considered to be inseparable units, 1.796 + * and either match the pattern completely, or are considered to not match 1.797 + * at all. Thus, for example, an A followed by a combining accent mark will 1.798 + * not be found when searching for a plain (unaccented) A (unless 1.799 + * the collation strength has been set to ignore all accents). 1.800 + * 1.801 + * When beginning a search, the initial starting position, startIdx, 1.802 + * is assumed to be an acceptable match boundary with respect to 1.803 + * combining characters. A combining sequence that spans across the 1.804 + * starting point will not suppress a match beginning at startIdx. 1.805 + * 1.806 + * Characters that expand to multiple collation elements 1.807 + * (German sharp-S becoming 'ss', or the composed forms of accented 1.808 + * characters, for example) also must match completely. 1.809 + * Searching for a single 's' in a string containing only a sharp-s will 1.810 + * find no match. 1.811 + * 1.812 + * 1.813 + * @param strsrch the UStringSearch struct, which references both 1.814 + * the text to be searched and the pattern being sought. 1.815 + * @param startIdx The index into the text to begin the search. 1.816 + * @param matchStart An out parameter, the starting index of the matched text. 1.817 + * This parameter may be NULL. 1.818 + * A value of -1 will be returned if no match was found. 1.819 + * @param matchLimit An out parameter, the index of the first position following the matched text.
1.820 + * The matchLimit will be at a suitable position for beginning a subsequent search 1.821 + * in the input text. 1.822 + * This parameter may be NULL. 1.823 + * A value of -1 will be returned if no match was found. 1.824 + * 1.825 + * @param status Report any errors. Note that no match found is not an error. 1.826 + * @return TRUE if a match was found, FALSE otherwise. 1.827 + * 1.828 + * @internal 1.829 + */ 1.830 +U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 1.831 + int32_t startIdx, 1.832 + int32_t *matchStart, 1.833 + int32_t *matchLimit, 1.834 + UErrorCode *status); 1.835 +#endif /* U_HIDE_INTERNAL_API */ 1.836 + 1.837 +#endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ 1.838 + 1.839 +#endif