/*
 ********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1996-2011, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************
 */

#ifndef NORMLZR_H
#define NORMLZR_H

#include "unicode/utypes.h"

/**
 * \file
 * \brief C++ API: Unicode Normalization
 */

#if !UCONFIG_NO_NORMALIZATION

#include "unicode/chariter.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "unicode/uobject.h"

U_NAMESPACE_BEGIN
/**
 * The Normalizer class supports the standard normalization forms described in
 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
 * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
 *
 * Note: This API has been replaced by the Normalizer2 class and is only available
 * for backward compatibility. This class simply delegates to the Normalizer2 class.
 * There is one exception: The new API does not provide a replacement for Normalizer::compare().
 *
 * The Normalizer class consists of two parts:
 * - static functions that normalize strings or test if strings are normalized
 * - a Normalizer object is an iterator that takes any kind of text and
 *   provides iteration over its normalized form
 *
 * The Normalizer class is not suitable for subclassing.
 *
 * For basic information about normalization forms and details about the C API
 * please see the documentation in unorm.h.
 *
 * The iterator API with the Normalizer constructors and the non-static functions
 * use a CharacterIterator as input. It is possible to pass a string which
 * is then internally wrapped in a CharacterIterator.
 * The input text is not normalized all at once, but incrementally where needed
 * (providing efficient random access).
 * This allows passing in a large text while spending only a small amount of time
 * normalizing a small part of that text.
 * However, if the entire text is normalized, then the iterator will be
 * slower than normalizing the entire text at once and iterating over the result.
 * A possible use of the Normalizer iterator is also to report an index into the
 * original text that is close to where the normalized characters come from.
 *
 * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
 * The earlier implementation reported the getIndex() inconsistently,
 * and previous() could not be used after setIndex(), next(), first(), and current().
 *
 * Normalizer allows you to start normalizing from anywhere in the input text by
 * calling setIndexOnly(), first(), or last().
 * Without calling any of these, the iterator will start at the beginning of the text.
 *
 * At any time, next() returns the next normalized code point (UChar32),
 * with post-increment semantics (like CharacterIterator::next32PostInc()).
 * previous() returns the previous normalized code point (UChar32),
 * with pre-decrement semantics (like CharacterIterator::previous32()).
 *
 * current() returns the current code point
 * (respectively the one at the newly set index) without moving
 * the getIndex(). Note that if the text at the current position
 * needs to be normalized, then this function will do that.
 * (This is why current() is not const.)
 * It is more efficient to call setIndexOnly() instead, which does not
 * normalize.
 *
 * getIndex() always refers to the position in the input text where the normalized
 * code points are returned from. It does not always change with each returned
 * code point.
 * The code point that is returned from any of the functions
 * corresponds to text at or after getIndex(), according to the
 * function's iteration semantics (post-increment or pre-decrement).
 *
 * next() returns a code point from at or after the getIndex()
 * from before the next() call. After the next() call, the getIndex()
 * might have moved to where the next code point will be returned from
 * (from a next() or current() call).
 * This is semantically equivalent to array access with array[index++]
 * (post-increment semantics).
 *
 * previous() returns a code point from at or after the getIndex()
 * from after the previous() call.
 * This is semantically equivalent to array access with array[--index]
 * (pre-decrement semantics).
 *
 * Internally, the Normalizer iterator normalizes a small piece of text
 * starting at the getIndex() and ending at a following "safe" index.
 * The normalized result is stored in an internal string buffer, and
 * the code points are iterated from there.
 * With multiple iteration calls, this is repeated until the next piece
 * of text needs to be normalized, and the getIndex() needs to be moved.
 *
 * The following "safe" index, the internal buffer, and the secondary
 * iteration index into that buffer are not exposed on the API.
 * This also means that it is currently not practical to return to
 * a particular, arbitrary position in the text because one would need to
 * know, and be able to set, in addition to the getIndex(), at least also the
 * current index into the internal buffer.
 * It is currently only possible to observe when getIndex() changes
 * (with careful consideration of the iteration semantics),
 * at which time the internal index will be 0.
 * For example, if getIndex() is different after next() than before it,
 * then the internal index is 0 and one can return to this getIndex()
 * later with setIndexOnly().
 *
 * Note: While setIndexOnly() and getIndex() refer to indices in the
 * underlying Unicode input text, the next() and previous() methods
 * iterate through characters in the normalized output.
 * This means that there is not necessarily a one-to-one correspondence
 * between characters returned by next() and previous() and the indices
 * passed to and returned from setIndexOnly() and getIndex().
 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
 *
 * @author Laura Werner, Mark Davis, Markus Scherer
 * @stable ICU 2.0
 */
class U_COMMON_API Normalizer : public UObject {
public:
    /**
     * If DONE is returned from an iteration function that returns a code point,
     * then there are no more normalization results available.
     * @stable ICU 2.0
     */
    enum {
        DONE=0xffff
    };

    // Constructors

    /**
     * Creates a new <code>Normalizer</code> object for iterating over the
     * normalized form of a given string.
     * <p>
     * @param str   The string to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     * @stable ICU 2.0
     */
    Normalizer(const UnicodeString& str, UNormalizationMode mode);

    /**
     * Creates a new <code>Normalizer</code> object for iterating over the
     * normalized form of a given string.
     * <p>
     * @param str   The string to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param length Length of the string, or -1 if NUL-terminated.
     * @param mode  The normalization mode.
     * @stable ICU 2.0
     */
    Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);

    /**
     * Creates a new <code>Normalizer</code> object for iterating over the
     * normalized form of the given text.
     * <p>
     * @param iter  The input text to be normalized.  The normalization
     *              will start at the beginning of the string.
     *
     * @param mode  The normalization mode.
     * @stable ICU 2.0
     */
    Normalizer(const CharacterIterator& iter, UNormalizationMode mode);

    /**
     * Copy constructor.
     * @param copy The object to be copied.
     * @stable ICU 2.0
     */
    Normalizer(const Normalizer& copy);

    /**
     * Destructor
     * @stable ICU 2.0
     */
    virtual ~Normalizer();


    //-------------------------------------------------------------------------
    // Static utility methods
    //-------------------------------------------------------------------------

    /**
     * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
     * This is a wrapper for unorm_normalize(), using UnicodeString's.
     *
     * The <code>options</code> parameter specifies which optional
     * <code>Normalizer</code> features are to be enabled for this operation.
     *
     * @param source    the input string to be normalized.
     * @param mode      the normalization mode
     * @param options   the optional features to be enabled (0 for no options)
     * @param result    The normalized string (on output).
     * @param status    The error code.
     * @stable ICU 2.0
     */
    static void U_EXPORT2 normalize(const UnicodeString& source,
                                    UNormalizationMode mode, int32_t options,
                                    UnicodeString& result,
                                    UErrorCode &status);

    /**
     * Compose a <code>UnicodeString</code>.
     * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
     * This is a wrapper for unorm_normalize(), using UnicodeString's.
     *
     * The <code>options</code> parameter specifies which optional
     * <code>Normalizer</code> features are to be enabled for this operation.
     *
     * @param source    the string to be composed.
     * @param compat    Perform compatibility decomposition before composition.
     *                  If this argument is <code>FALSE</code>, only canonical
     *                  decomposition will be performed.
     * @param options   the optional features to be enabled (0 for no options)
     * @param result    The composed string (on output).
     * @param status    The error code.
     * @stable ICU 2.0
     */
    static void U_EXPORT2 compose(const UnicodeString& source,
                                  UBool compat, int32_t options,
                                  UnicodeString& result,
                                  UErrorCode &status);

    /**
     * Static method to decompose a <code>UnicodeString</code>.
     * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
     * This is a wrapper for unorm_normalize(), using UnicodeString's.
     *
     * The <code>options</code> parameter specifies which optional
     * <code>Normalizer</code> features are to be enabled for this operation.
     *
     * @param source    the string to be decomposed.
     * @param compat    Perform compatibility decomposition.
     *                  If this argument is <code>FALSE</code>, only canonical
     *                  decomposition will be performed.
     * @param options   the optional features to be enabled (0 for no options)
     * @param result    The decomposed string (on output).
     * @param status    The error code.
     * @stable ICU 2.0
     */
    static void U_EXPORT2 decompose(const UnicodeString& source,
                                    UBool compat, int32_t options,
                                    UnicodeString& result,
                                    UErrorCode &status);

    /**
     * Performs a quick check on a string, to quickly determine if the string is
     * in a particular normalization format.
     * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
     *
     * Three types of result can be returned: UNORM_YES, UNORM_NO or
     * UNORM_MAYBE. Result UNORM_YES indicates that the argument
     * string is in the desired normalized format, UNORM_NO indicates that the
     * argument string is not in the desired normalized format. A
     * UNORM_MAYBE result indicates that a more thorough check is required;
     * the user may have to put the string in its normalized form and compare the
     * results.
     * @param source       string for determining if it is in a normalized format
     * @param mode         normalization format
     * @param status A reference to a UErrorCode to receive any errors
     * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
     *
     * @see isNormalized
     * @stable ICU 2.0
     */
    static inline UNormalizationCheckResult
    quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);

    /**
     * Performs a quick check on a string; same as the other version of quickCheck
     * but takes an extra options parameter like most normalization functions.
     *
     * @param source       string for determining if it is in a normalized format
     * @param mode         normalization format
     * @param options      the optional features to be enabled (0 for no options)
     * @param status A reference to a UErrorCode to receive any errors
     * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
     *
     * @see isNormalized
     * @stable ICU 2.6
     */
    static UNormalizationCheckResult
    quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);

    /**
     * Test if a string is in a given normalization form.
     * This is semantically equivalent to source.equals(normalize(source, mode)) .
     *
     * Unlike unorm_quickCheck(), this function returns a definitive result,
     * never a "maybe".
     * For NFD, NFKD, and FCD, both functions work exactly the same.
     * For NFC and NFKC where quickCheck may return "maybe", this function will
     * perform further tests to arrive at a TRUE/FALSE result.
     *
     * @param src        String that is to be tested if it is in a normalization format.
     * @param mode       Which normalization form to test for.
     * @param errorCode  ICU error code in/out parameter.
     *                   Must fulfill U_SUCCESS before the function call.
     * @return Boolean value indicating whether the source string is in the
     *         "mode" normalization form.
     *
     * @see quickCheck
     * @stable ICU 2.2
     */
    static inline UBool
    isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);

    /**
     * Test if a string is in a given normalization form; same as the other version of isNormalized
     * but takes an extra options parameter like most normalization functions.
     *
     * @param src        String that is to be tested if it is in a normalization format.
     * @param mode       Which normalization form to test for.
     * @param options      the optional features to be enabled (0 for no options)
     * @param errorCode  ICU error code in/out parameter.
     *                   Must fulfill U_SUCCESS before the function call.
     * @return Boolean value indicating whether the source string is in the
     *         "mode" normalization form.
     *
     * @see quickCheck
     * @stable ICU 2.6
     */
    static UBool
    isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);

    /**
     * Concatenate normalized strings, making sure that the result is normalized as well.
     *
     * If both the left and the right strings are in
     * the normalization form according to "mode/options",
     * then the result will be
     *
     * \code
     *     dest=normalize(left+right, mode, options)
     * \endcode
     *
     * For details see unorm_concatenate in unorm.h.
     *
     * @param left Left source string.
     * @param right Right source string.
     * @param result The output string.
     * @param mode The normalization mode.
     * @param options A bit set of normalization options.
     * @param errorCode ICU error code in/out parameter.
     *                   Must fulfill U_SUCCESS before the function call.
     * @return result
     *
     * @see unorm_concatenate
     * @see normalize
     * @see unorm_next
     * @see unorm_previous
     *
     * @stable ICU 2.1
     */
    static UnicodeString &
    U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right,
                          UnicodeString &result,
                          UNormalizationMode mode, int32_t options,
                          UErrorCode &errorCode);

    /**
     * Compare two strings for canonical equivalence.
     * Further options include case-insensitive comparison and
     * code point order (as opposed to code unit order).
     *
     * Canonical equivalence between two strings is defined as their normalized
     * forms (NFD or NFC) being identical.
     * This function compares strings incrementally instead of normalizing
     * (and optionally case-folding) both strings entirely,
     * improving performance significantly.
     *
     * Bulk normalization is only necessary if the strings do not fulfill the FCD
     * conditions. Only in this case, and only if the strings are relatively long,
     * is memory allocated temporarily.
     * For FCD strings and short non-FCD strings there is no memory allocation.
     *
     * Semantically, this is equivalent to
     *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
     * where code point order and foldCase are all optional.
     *
     * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
     * the case folding must be performed first, then the normalization.
     *
     * @param s1 First source string.
     * @param s2 Second source string.
     *
     * @param options A bit set of options:
     *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
     *     Case-sensitive comparison in code unit order, and the input strings
     *     are quick-checked for FCD.
     *
     *   - UNORM_INPUT_IS_FCD
     *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
     *     If not set, the function will quickCheck for FCD
     *     and normalize if necessary.
     *
     *   - U_COMPARE_CODE_POINT_ORDER
     *     Set to choose code point order instead of code unit order
     *     (see u_strCompare for details).
     *
     *   - U_COMPARE_IGNORE_CASE
     *     Set to compare strings case-insensitively using case folding,
     *     instead of case-sensitively.
     *     If set, then the following case folding options are used.
     *
     *   - Options as used with case-insensitive comparisons, currently:
     *
     *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
     *    (see u_strCaseCompare for details)
     *
     *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
     *
     * @param errorCode ICU error code in/out parameter.
     *                  Must fulfill U_SUCCESS before the function call.
     * @return <0 or 0 or >0 as usual for string comparisons
     *
     * @see unorm_compare
     * @see normalize
     * @see UNORM_FCD
     * @see u_strCompare
     * @see u_strCaseCompare
     *
     * @stable ICU 2.2
     */
    static inline int32_t
    compare(const UnicodeString &s1, const UnicodeString &s2,
            uint32_t options,
            UErrorCode &errorCode);

    //-------------------------------------------------------------------------
    // Iteration API
    //-------------------------------------------------------------------------

    /**
     * Return the current character in the normalized text.
     * current() may need to normalize some text at getIndex().
     * The getIndex() is not changed.
     *
     * @return the current normalized code point
     * @stable ICU 2.0
     */
    UChar32 current(void);

    /**
     * Return the first character in the normalized text.
     * This is equivalent to setIndexOnly(startIndex()) followed by next().
     * (Post-increment semantics.)
     *
     * @return the first normalized code point
     * @stable ICU 2.0
     */
    UChar32 first(void);

    /**
     * Return the last character in the normalized text.
     * This is equivalent to setIndexOnly(endIndex()) followed by previous().
     * (Pre-decrement semantics.)
     *
     * @return the last normalized code point
     * @stable ICU 2.0
     */
    UChar32 last(void);

    /**
     * Return the next character in the normalized text.
     * (Post-increment semantics.)
     * If the end of the text has already been reached, DONE is returned.
     * The DONE value could be confused with a U+FFFF non-character code point
     * in the text. If this is possible, you can test getIndex()<endIndex()
     * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
     * after calling next(). (Calling last() will change the iterator state!)
     *
     * The C API unorm_next() is more efficient and does not have this ambiguity.
     *
     * @return the next normalized code point
     * @stable ICU 2.0
     */
    UChar32 next(void);

    /**
     * Return the previous character in the normalized text and decrement.
     * (Pre-decrement semantics.)
     * If the beginning of the text has already been reached, DONE is returned.
     * The DONE value could be confused with a U+FFFF non-character code point
     * in the text. If this is possible, you can test
     * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
     * the iterator state!)
     *
     * The C API unorm_previous() is more efficient and does not have this ambiguity.
     *
     * @return the previous normalized code point
     * @stable ICU 2.0
     */
    UChar32 previous(void);

    /**
     * Set the iteration position in the input text that is being normalized,
     * without any immediate normalization.
     * After setIndexOnly(), getIndex() will return the same index that is
     * specified here.
     *
     * @param index the desired index in the input text.
     * @stable ICU 2.0
     */
    void setIndexOnly(int32_t index);

    /**
     * Reset the index to the beginning of the text.
     * This is equivalent to setIndexOnly(startIndex()).
     * @stable ICU 2.0
     */
    void reset(void);

    /**
     * Retrieve the current iteration position in the input text that is
     * being normalized.
     *
     * A following call to next() will return a normalized code point from
     * the input text at or after this index.
     *
     * After a call to previous(), getIndex() will point at or before the
     * position in the input text where the normalized code point
     * was returned from with previous().
     *
     * @return the current index in the input text
     * @stable ICU 2.0
     */
    int32_t getIndex(void) const;

    /**
     * Retrieve the index of the start of the input text. This is the begin index
     * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
     * over which this <code>Normalizer</code> is iterating.
     *
     * @return the smallest index in the input text where the Normalizer operates
     * @stable ICU 2.0
     */
    int32_t startIndex(void) const;

    /**
     * Retrieve the index of the end of the input text. This is the end index
     * of the <code>CharacterIterator</code> or the length of the string
     * over which this <code>Normalizer</code> is iterating.
     * This end index is exclusive, i.e., the Normalizer operates only on characters
     * before this index.
     *
     * @return the first index in the input text where the Normalizer does not operate
     * @stable ICU 2.0
     */
    int32_t endIndex(void) const;

    /**
     * Returns TRUE when both iterators refer to the same character in the same
     * input text.
     *
     * @param that a Normalizer object to compare this one to
     * @return comparison result
     * @stable ICU 2.0
     */
    UBool operator==(const Normalizer& that) const;

    /**
     * Returns FALSE when both iterators refer to the same character in the same
     * input text.
     *
     * @param that a Normalizer object to compare this one to
     * @return comparison result
     * @stable ICU 2.0
     */
    inline UBool operator!=(const Normalizer& that) const;

    /**
     * Returns a pointer to a new Normalizer that is a clone of this one.
     * The caller is responsible for deleting the new clone.
     * @return a pointer to a new Normalizer
     * @stable ICU 2.0
     */
    Normalizer* clone(void) const;

    /**
     * Generates a hash code for this iterator.
     *
     * @return the hash code
     * @stable ICU 2.0
     */
    int32_t hashCode(void) const;

    //-------------------------------------------------------------------------
    // Property access methods
    //-------------------------------------------------------------------------

    /**
     * Set the normalization mode for this object.
     * <p>
     * <b>Note:</b> If the normalization mode is changed while iterating
     * over a string, calls to {@link #next() } and {@link #previous() } may
     * return previously buffered characters in the old normalization mode
     * until the iteration is able to re-sync at the next base character.
     * It is safest to call {@link #setIndexOnly }, {@link #reset() },
     * {@link #setText }, {@link #first() },
     * {@link #last() }, etc. after calling <code>setMode</code>.
     * <p>
     * @param newMode the new mode for this <code>Normalizer</code>.
     * @see #getUMode
     * @stable ICU 2.0
     */
    void setMode(UNormalizationMode newMode);

    /**
     * Return the normalization mode for this object.
     *
     * This is an unusual name because there used to be a getMode() that
     * returned a different type.
     *
     * @return the mode for this <code>Normalizer</code>
     * @see #setMode
     * @stable ICU 2.0
     */
    UNormalizationMode getUMode(void) const;

    /**
     * Set options that affect this <code>Normalizer</code>'s operation.
     * Options do not change the basic composition or decomposition operation
     * that is being performed, but they control whether
     * certain optional portions of the operation are done.
     * Currently the only available option is obsolete.
     *
     * It is possible to specify multiple options that are all turned on or off.
     *
     * @param   option  the option(s) whose value is/are to be set.
     * @param   value   the new setting for the option.  Use <code>TRUE</code> to
     *                  turn the option(s) on and <code>FALSE</code> to turn it/them off.
     *
     * @see #getOption
     * @stable ICU 2.0
     */
    void setOption(int32_t option,
                   UBool value);

    /**
     * Determine whether an option is turned on or off.
     * If multiple options are specified, then the result is TRUE if any
     * of them are set.
     * <p>
     * @param option the option(s) that are to be checked
     * @return TRUE if any of the option(s) are set
     * @see #setOption
     * @stable ICU 2.0
     */
    UBool getOption(int32_t option) const;

    /**
     * Set the input text over which this <code>Normalizer</code> will iterate.
     * The iteration position is set to the beginning.
     *
     * @param newText a string that replaces the current input text
     * @param status a UErrorCode
     * @stable ICU 2.0
     */
    void setText(const UnicodeString& newText,
                 UErrorCode &status);

    /**
     * Set the input text over which this <code>Normalizer</code> will iterate.
     * The iteration position is set to the beginning.
     *
     * @param newText a CharacterIterator object that replaces the current input text
     * @param status a UErrorCode
     * @stable ICU 2.0
     */
    void setText(const CharacterIterator& newText,
                 UErrorCode &status);

    /**
     * Set the input text over which this <code>Normalizer</code> will iterate.
     * The iteration position is set to the beginning.
     *
     * @param newText a string that replaces the current input text
     * @param length the length of the string, or -1 if NUL-terminated
     * @param status a UErrorCode
     * @stable ICU 2.0
     */
    void setText(const UChar* newText,
                 int32_t length,
                 UErrorCode &status);

    /**
     * Copies the input text into the UnicodeString argument.
     *
     * @param result Receives a copy of the text under iteration.
     * @stable ICU 2.0
     */
    void getText(UnicodeString& result);

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     * @return a UClassID for this class.
     * @stable ICU 2.2
     */
    static UClassID U_EXPORT2 getStaticClassID();

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     * @return a UClassID for the actual class.
     * @stable ICU 2.2
     */
    virtual UClassID getDynamicClassID() const;

private:
    //-------------------------------------------------------------------------
    // Private functions
    //-------------------------------------------------------------------------

    Normalizer(); // default constructor not implemented
    Normalizer &operator=(const Normalizer &that); // assignment operator not implemented

    // Private utility methods for iteration
    // For documentation, see the source code
    UBool nextNormalize();
    UBool previousNormalize();

    void init();
    void clearBuffer(void);

    //-------------------------------------------------------------------------
    // Private data
    //-------------------------------------------------------------------------

    FilteredNormalizer2*fFilteredNorm2;  // owned if not NULL
    const Normalizer2 *fNorm2;  // not owned; may be equal to fFilteredNorm2
    UNormalizationMode fUMode;
    int32_t fOptions;

    // The input text and our position in it
    CharacterIterator *text;

    // The normalization buffer is the result of normalization
    // of the source in [currentIndex..nextIndex[ .
    int32_t currentIndex, nextIndex;

    // A buffer for holding intermediate results
    UnicodeString buffer;
    int32_t bufferPos;
};

//-------------------------------------------------------------------------
// Inline implementations
//-------------------------------------------------------------------------

// Inequality is defined as the logical negation of operator==.
inline UBool
Normalizer::operator!= (const Normalizer& other) const
{ return ! operator==(other); }

// Convenience overload: forwards to the options-taking quickCheck() with options=0.
inline UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       UNormalizationMode mode,
                       UErrorCode &status) {
    return quickCheck(source, mode, 0, status);
}

// Convenience overload: forwards to the options-taking isNormalized() with options=0.
inline UBool
Normalizer::isNormalized(const UnicodeString& source,
                         UNormalizationMode mode,
                         UErrorCode &status) {
    return isNormalized(source, mode, 0, status);
}

// Thin wrapper: passes the raw buffers and lengths of both strings to the C API.
inline int32_t
Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
                    uint32_t options,
                    UErrorCode &errorCode) {
    // all argument checking is done in unorm_compare
    return unorm_compare(s1.getBuffer(), s1.length(),
                         s2.getBuffer(), s2.length(),
                         options,
                         &errorCode);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_NORMALIZATION */

#endif // NORMLZR_H