The Tor Browser: diff intl/icu/source/common/unicode/normlzr.h

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/unicode/normlzr.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,797 @@
     1.4 +/*
     1.5 + ********************************************************************
     1.6 + * COPYRIGHT:
     1.7 + * Copyright (c) 1996-2011, International Business Machines Corporation and
     1.8 + * others. All Rights Reserved.
     1.9 + ********************************************************************
    1.10 + */
    1.11 +
    1.12 +#ifndef NORMLZR_H
    1.13 +#define NORMLZR_H
    1.14 +
    1.15 +#include "unicode/utypes.h"
    1.16 +
    1.17 +/**
    1.18 + * \file 
    1.19 + * \brief C++ API: Unicode Normalization
    1.20 + */
    1.21 + 
    1.22 +#if !UCONFIG_NO_NORMALIZATION
    1.23 +
    1.24 +#include "unicode/chariter.h"
    1.25 +#include "unicode/normalizer2.h"
    1.26 +#include "unicode/unistr.h"
    1.27 +#include "unicode/unorm.h"
    1.28 +#include "unicode/uobject.h"
    1.29 +
    1.30 +U_NAMESPACE_BEGIN
    1.31 +/**
    1.32 + * The Normalizer class supports the standard normalization forms described in
    1.33 + * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
    1.34 + * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
    1.35 + *
    1.36 + * Note: This API has been replaced by the Normalizer2 class and is only available
    1.37 + * for backward compatibility. This class simply delegates to the Normalizer2 class.
    1.38 + * There is one exception: The new API does not provide a replacement for Normalizer::compare().
    1.39 + *
    1.40 + * The Normalizer class consists of two parts:
    1.41 + * - static functions that normalize strings or test if strings are normalized
    1.42 + * - a Normalizer object is an iterator that takes any kind of text and
    1.43 + *   provides iteration over its normalized form
    1.44 + *
    1.45 + * The Normalizer class is not suitable for subclassing.
    1.46 + *
    1.47 + * For basic information about normalization forms and details about the C API
    1.48 + * please see the documentation in unorm.h.
    1.49 + *
    1.50 + * The iterator API with the Normalizer constructors and the non-static functions
    1.51 + * use a CharacterIterator as input. It is possible to pass a string which
    1.52 + * is then internally wrapped in a CharacterIterator.
    1.53 + * The input text is not normalized all at once, but incrementally where needed
    1.54 + * (providing efficient random access).
    1.55 + * This allows passing in a large text while spending only a small amount of time
    1.56 + * normalizing a small part of that text.
    1.57 + * However, if the entire text is normalized, then the iterator will be
    1.58 + * slower than normalizing the entire text at once and iterating over the result.
    1.59 + * A possible use of the Normalizer iterator is also to report an index into the
    1.60 + * original text that is close to where the normalized characters come from.
    1.61 + *
    1.62 + * <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
    1.63 + * The earlier implementation reported the getIndex() inconsistently,
    1.64 + * and previous() could not be used after setIndex(), next(), first(), and current().
    1.65 + *
    1.66 + * Normalizer allows normalization to start from anywhere in the input text by
    1.67 + * calling setIndexOnly(), first(), or last().
    1.68 + * Without calling any of these, the iterator will start at the beginning of the text.
    1.69 + *
    1.70 + * At any time, next() returns the next normalized code point (UChar32),
    1.71 + * with post-increment semantics (like CharacterIterator::next32PostInc()).
    1.72 + * previous() returns the previous normalized code point (UChar32),
    1.73 + * with pre-decrement semantics (like CharacterIterator::previous32()).
    1.74 + *
    1.75 + * current() returns the current code point
    1.76 + * (respectively the one at the newly set index) without moving
    1.77 + * the getIndex(). Note that if the text at the current position
    1.78 + * needs to be normalized, then these functions will do that.
    1.79 + * (This is why current() is not const.)
    1.80 + * It is more efficient to call setIndexOnly() instead, which does not
    1.81 + * normalize.
    1.82 + *
    1.83 + * getIndex() always refers to the position in the input text where the normalized
    1.84 + * code points are returned from. It does not always change with each returned
    1.85 + * code point.
    1.86 + * The code point that is returned from any of the functions
    1.87 + * corresponds to text at or after getIndex(), according to the
    1.88 + * function's iteration semantics (post-increment or pre-decrement).
    1.89 + *
    1.90 + * next() returns a code point from at or after the getIndex()
    1.91 + * from before the next() call. After the next() call, the getIndex()
    1.92 + * might have moved to where the next code point will be returned from
    1.93 + * (from a next() or current() call).
    1.94 + * This is semantically equivalent to array access with array[index++]
    1.95 + * (post-increment semantics).
    1.96 + *
    1.97 + * previous() returns a code point from at or after the getIndex()
    1.98 + * from after the previous() call.
    1.99 + * This is semantically equivalent to array access with array[--index]
   1.100 + * (pre-decrement semantics).
   1.101 + *
   1.102 + * Internally, the Normalizer iterator normalizes a small piece of text
   1.103 + * starting at the getIndex() and ending at a following "safe" index.
   1.104 + * The normalized result is stored in an internal string buffer, and
   1.105 + * the code points are iterated from there.
   1.106 + * With multiple iteration calls, this is repeated until the next piece
   1.107 + * of text needs to be normalized, and the getIndex() needs to be moved.
   1.108 + *
   1.109 + * The following "safe" index, the internal buffer, and the secondary
   1.110 + * iteration index into that buffer are not exposed on the API.
   1.111 + * This also means that it is currently not practical to return to
   1.112 + * a particular, arbitrary position in the text because one would need to
   1.113 + * know, and be able to set, in addition to the getIndex(), at least also the
   1.114 + * current index into the internal buffer.
   1.115 + * It is currently only possible to observe when getIndex() changes
   1.116 + * (with careful consideration of the iteration semantics),
   1.117 + * at which time the internal index will be 0.
   1.118 + * For example, if getIndex() is different after next() than before it,
   1.119 + * then the internal index is 0 and one can return to this getIndex()
   1.120 + * later with setIndexOnly().
   1.121 + *
   1.122 + * Note: While the setIndex() and getIndex() refer to indices in the
   1.123 + * underlying Unicode input text, the next() and previous() methods
   1.124 + * iterate through characters in the normalized output.
   1.125 + * This means that there is not necessarily a one-to-one correspondence
   1.126 + * between characters returned by next() and previous() and the indices
   1.127 + * passed to and returned from setIndex() and getIndex().
   1.128 + * It is for this reason that Normalizer does not implement the CharacterIterator interface.
   1.129 + *
   1.130 + * @author Laura Werner, Mark Davis, Markus Scherer
   1.131 + * @stable ICU 2.0
   1.132 + */
   1.133 +class U_COMMON_API Normalizer : public UObject {
   1.134 +public:
   1.135 +  /**
   1.136 +   * If DONE is returned from an iteration function that returns a code point,
   1.137 +   * then there are no more normalization results available.
   1.138 +   * @stable ICU 2.0
   1.139 +   */
   1.140 +  enum {
   1.141 +      DONE=0xffff
   1.142 +  };
   1.143 +
   1.144 +  // Constructors
   1.145 +
   1.146 +  /**
   1.147 +   * Creates a new <code>Normalizer</code> object for iterating over the
   1.148 +   * normalized form of a given string.
   1.149 +   * <p>
   1.150 +   * @param str   The string to be normalized.  The normalization
   1.151 +   *              will start at the beginning of the string.
   1.152 +   *
   1.153 +   * @param mode  The normalization mode.
   1.154 +   * @stable ICU 2.0
   1.155 +   */
   1.156 +  Normalizer(const UnicodeString& str, UNormalizationMode mode);
   1.157 +
   1.158 +  /**
   1.159 +   * Creates a new <code>Normalizer</code> object for iterating over the
   1.160 +   * normalized form of a given string.
   1.161 +   * <p>
   1.162 +   * @param str   The string to be normalized.  The normalization
   1.163 +   *              will start at the beginning of the string.
   1.164 +   *
   1.165 +   * @param length Length of the string, or -1 if NUL-terminated.
   1.166 +   * @param mode  The normalization mode.
   1.167 +   * @stable ICU 2.0
   1.168 +   */
   1.169 +  Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
   1.170 +
   1.171 +  /**
   1.172 +   * Creates a new <code>Normalizer</code> object for iterating over the
   1.173 +   * normalized form of the given text.
   1.174 +   * <p>
   1.175 +   * @param iter  The input text to be normalized.  The normalization
   1.176 +   *              will start at the beginning of the string.
   1.177 +   *
   1.178 +   * @param mode  The normalization mode.
   1.179 +   * @stable ICU 2.0
   1.180 +   */
   1.181 +  Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
   1.182 +
   1.183 +  /**
   1.184 +   * Copy constructor.
   1.185 +   * @param copy The object to be copied.
   1.186 +   * @stable ICU 2.0
   1.187 +   */
   1.188 +  Normalizer(const Normalizer& copy);
   1.189 +
   1.190 +  /**
   1.191 +   * Destructor
   1.192 +   * @stable ICU 2.0
   1.193 +   */
   1.194 +  virtual ~Normalizer();
   1.195 +
   1.196 +
   1.197 +  //-------------------------------------------------------------------------
   1.198 +  // Static utility methods
   1.199 +  //-------------------------------------------------------------------------
   1.200 +
   1.201 +  /**
   1.202 +   * Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
   1.203 +   * This is a wrapper for unorm_normalize(), using UnicodeString's.
   1.204 +   *
   1.205 +   * The <code>options</code> parameter specifies which optional
   1.206 +   * <code>Normalizer</code> features are to be enabled for this operation.
   1.207 +   *
   1.208 +   * @param source    the input string to be normalized.
   1.209 +   * @param mode      the normalization mode
   1.210 +   * @param options   the optional features to be enabled (0 for no options)
   1.211 +   * @param result    The normalized string (on output).
   1.212 +   * @param status    The error code.
   1.213 +   * @stable ICU 2.0
   1.214 +   */
   1.215 +  static void U_EXPORT2 normalize(const UnicodeString& source,
   1.216 +                        UNormalizationMode mode, int32_t options,
   1.217 +                        UnicodeString& result,
   1.218 +                        UErrorCode &status);
   1.219 +
   1.220 +  /**
   1.221 +   * Compose a <code>UnicodeString</code>.
   1.222 +   * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
   1.223 +   * This is a wrapper for unorm_normalize(), using UnicodeString's.
   1.224 +   *
   1.225 +   * The <code>options</code> parameter specifies which optional
   1.226 +   * <code>Normalizer</code> features are to be enabled for this operation.
   1.227 +   *
   1.228 +   * @param source    the string to be composed.
   1.229 +   * @param compat    Perform compatibility decomposition before composition.
   1.230 +   *                  If this argument is <code>FALSE</code>, only canonical
   1.231 +   *                  decomposition will be performed.
   1.232 +   * @param options   the optional features to be enabled (0 for no options)
   1.233 +   * @param result    The composed string (on output).
   1.234 +   * @param status    The error code.
   1.235 +   * @stable ICU 2.0
   1.236 +   */
   1.237 +  static void U_EXPORT2 compose(const UnicodeString& source,
   1.238 +                      UBool compat, int32_t options,
   1.239 +                      UnicodeString& result,
   1.240 +                      UErrorCode &status);
   1.241 +
   1.242 +  /**
   1.243 +   * Static method to decompose a <code>UnicodeString</code>.
   1.244 +   * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
   1.245 +   * This is a wrapper for unorm_normalize(), using UnicodeString's.
   1.246 +   *
   1.247 +   * The <code>options</code> parameter specifies which optional
   1.248 +   * <code>Normalizer</code> features are to be enabled for this operation.
   1.249 +   *
   1.250 +   * @param source    the string to be decomposed.
   1.251 +   * @param compat    Perform compatibility decomposition.
   1.252 +   *                  If this argument is <code>FALSE</code>, only canonical
   1.253 +   *                  decomposition will be performed.
   1.254 +   * @param options   the optional features to be enabled (0 for no options)
   1.255 +   * @param result    The decomposed string (on output).
   1.256 +   * @param status    The error code.
   1.257 +   * @stable ICU 2.0
   1.258 +   */
   1.259 +  static void U_EXPORT2 decompose(const UnicodeString& source,
   1.260 +                        UBool compat, int32_t options,
   1.261 +                        UnicodeString& result,
   1.262 +                        UErrorCode &status);
   1.263 +
   1.264 +  /**
   1.265 +   * Performing quick check on a string, to quickly determine if the string is
   1.266 +   * in a particular normalization format.
   1.267 +   * This is a wrapper for unorm_quickCheck(), using a UnicodeString.
   1.268 +   *
   1.269 +   * Three types of result can be returned: UNORM_YES, UNORM_NO or
   1.270 +   * UNORM_MAYBE. Result UNORM_YES indicates that the argument
   1.271 +   * string is in the desired normalized format, UNORM_NO indicates that the
   1.272 +   * argument string is not in the desired normalized format. A
   1.273 +   * UNORM_MAYBE result indicates that a more thorough check is required,
   1.274 +   * the user may have to put the string in its normalized form and compare the
   1.275 +   * results.
   1.276 +   * @param source       string for determining if it is in a normalized format
   1.277 +   * @param mode         normalization format
   1.278 +   * @param status A reference to a UErrorCode to receive any errors
   1.279 +   * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
   1.280 +   *
   1.281 +   * @see isNormalized
   1.282 +   * @stable ICU 2.0
   1.283 +   */
   1.284 +  static inline UNormalizationCheckResult
   1.285 +  quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
   1.286 +
   1.287 +  /**
   1.288 +   * Performing quick check on a string; same as the other version of quickCheck
   1.289 +   * but takes an extra options parameter like most normalization functions.
   1.290 +   *
   1.291 +   * @param source       string for determining if it is in a normalized format
   1.292 +   * @param mode         normalization format
   1.293 +   * @param options      the optional features to be enabled (0 for no options)
   1.294 +   * @param status A reference to a UErrorCode to receive any errors
   1.295 +   * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
   1.296 +   *
   1.297 +   * @see isNormalized
   1.298 +   * @stable ICU 2.6
   1.299 +   */
   1.300 +  static UNormalizationCheckResult
   1.301 +  quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
   1.302 +
   1.303 +  /**
   1.304 +   * Test if a string is in a given normalization form.
   1.305 +   * This is semantically equivalent to source.equals(normalize(source, mode)) .
   1.306 +   *
   1.307 +   * Unlike unorm_quickCheck(), this function returns a definitive result,
   1.308 +   * never a "maybe".
   1.309 +   * For NFD, NFKD, and FCD, both functions work exactly the same.
   1.310 +   * For NFC and NFKC where quickCheck may return "maybe", this function will
   1.311 +   * perform further tests to arrive at a TRUE/FALSE result.
   1.312 +   *
   1.313 +   * @param src        String that is to be tested if it is in a normalization format.
   1.314 +   * @param mode       Which normalization form to test for.
   1.315 +   * @param errorCode  ICU error code in/out parameter.
   1.316 +   *                   Must fulfill U_SUCCESS before the function call.
   1.317 +   * @return Boolean value indicating whether the source string is in the
   1.318 +   *         "mode" normalization form.
   1.319 +   *
   1.320 +   * @see quickCheck
   1.321 +   * @stable ICU 2.2
   1.322 +   */
   1.323 +  static inline UBool
   1.324 +  isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
   1.325 +
   1.326 +  /**
   1.327 +   * Test if a string is in a given normalization form; same as the other version of isNormalized
   1.328 +   * but takes an extra options parameter like most normalization functions.
   1.329 +   *
   1.330 +   * @param src        String that is to be tested if it is in a normalization format.
   1.331 +   * @param mode       Which normalization form to test for.
   1.332 +   * @param options      the optional features to be enabled (0 for no options)
   1.333 +   * @param errorCode  ICU error code in/out parameter.
   1.334 +   *                   Must fulfill U_SUCCESS before the function call.
   1.335 +   * @return Boolean value indicating whether the source string is in the
   1.336 +   *         "mode" normalization form.
   1.337 +   *
   1.338 +   * @see quickCheck
   1.339 +   * @stable ICU 2.6
   1.340 +   */
   1.341 +  static UBool
   1.342 +  isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
   1.343 +
   1.344 +  /**
   1.345 +   * Concatenate normalized strings, making sure that the result is normalized as well.
   1.346 +   *
   1.347 +   * If both the left and the right strings are in
   1.348 +   * the normalization form according to "mode/options",
   1.349 +   * then the result will be
   1.350 +   *
   1.351 +   * \code
   1.352 +   *     dest=normalize(left+right, mode, options)
   1.353 +   * \endcode
   1.354 +   *
   1.355 +   * For details see unorm_concatenate in unorm.h.
   1.356 +   *
   1.357 +   * @param left Left source string.
   1.358 +   * @param right Right source string.
   1.359 +   * @param result The output string.
   1.360 +   * @param mode The normalization mode.
   1.361 +   * @param options A bit set of normalization options.
   1.362 +   * @param errorCode ICU error code in/out parameter.
   1.363 +   *                   Must fulfill U_SUCCESS before the function call.
   1.364 +   * @return result
   1.365 +   *
   1.366 +   * @see unorm_concatenate
   1.367 +   * @see normalize
   1.368 +   * @see unorm_next
   1.369 +   * @see unorm_previous
   1.370 +   *
   1.371 +   * @stable ICU 2.1
   1.372 +   */
   1.373 +  static UnicodeString &
   1.374 +  U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right,
   1.375 +              UnicodeString &result,
   1.376 +              UNormalizationMode mode, int32_t options,
   1.377 +              UErrorCode &errorCode);
   1.378 +
   1.379 +  /**
   1.380 +   * Compare two strings for canonical equivalence.
   1.381 +   * Further options include case-insensitive comparison and
   1.382 +   * code point order (as opposed to code unit order).
   1.383 +   *
   1.384 +   * Canonical equivalence between two strings is defined as their normalized
   1.385 +   * forms (NFD or NFC) being identical.
   1.386 +   * This function compares strings incrementally instead of normalizing
   1.387 +   * (and optionally case-folding) both strings entirely,
   1.388 +   * improving performance significantly.
   1.389 +   *
   1.390 +   * Bulk normalization is only necessary if the strings do not fulfill the FCD
   1.391 +   * conditions. Only in this case, and only if the strings are relatively long,
   1.392 +   * is memory allocated temporarily.
   1.393 +   * For FCD strings and short non-FCD strings there is no memory allocation.
   1.394 +   *
   1.395 +   * Semantically, this is equivalent to
   1.396 +   *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
   1.397 +   * where code point order and foldCase are all optional.
   1.398 +   *
   1.399 +   * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
   1.400 +   * the case folding must be performed first, then the normalization.
   1.401 +   *
   1.402 +   * @param s1 First source string.
   1.403 +   * @param s2 Second source string.
   1.404 +   *
   1.405 +   * @param options A bit set of options:
   1.406 +   *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
   1.407 +   *     Case-sensitive comparison in code unit order, and the input strings
   1.408 +   *     are quick-checked for FCD.
   1.409 +   *
   1.410 +   *   - UNORM_INPUT_IS_FCD
   1.411 +   *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
   1.412 +   *     If not set, the function will quickCheck for FCD
   1.413 +   *     and normalize if necessary.
   1.414 +   *
   1.415 +   *   - U_COMPARE_CODE_POINT_ORDER
   1.416 +   *     Set to choose code point order instead of code unit order
   1.417 +   *     (see u_strCompare for details).
   1.418 +   *
   1.419 +   *   - U_COMPARE_IGNORE_CASE
   1.420 +   *     Set to compare strings case-insensitively using case folding,
   1.421 +   *     instead of case-sensitively.
   1.422 +   *     If set, then the following case folding options are used.
   1.423 +   *
   1.424 +   *   - Options as used with case-insensitive comparisons, currently:
   1.425 +   *
   1.426 +   *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
   1.427 +   *    (see u_strCaseCompare for details)
   1.428 +   *
   1.429 +   *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
   1.430 +   *
   1.431 +   * @param errorCode ICU error code in/out parameter.
   1.432 +   *                  Must fulfill U_SUCCESS before the function call.
   1.433 +   * @return <0 or 0 or >0 as usual for string comparisons
   1.434 +   *
   1.435 +   * @see unorm_compare
   1.436 +   * @see normalize
   1.437 +   * @see UNORM_FCD
   1.438 +   * @see u_strCompare
   1.439 +   * @see u_strCaseCompare
   1.440 +   *
   1.441 +   * @stable ICU 2.2
   1.442 +   */
   1.443 +  static inline int32_t
   1.444 +  compare(const UnicodeString &s1, const UnicodeString &s2,
   1.445 +          uint32_t options,
   1.446 +          UErrorCode &errorCode);
   1.447 +
   1.448 +  //-------------------------------------------------------------------------
   1.449 +  // Iteration API
   1.450 +  //-------------------------------------------------------------------------
   1.451 +
   1.452 +  /**
   1.453 +   * Return the current character in the normalized text.
   1.454 +   * current() may need to normalize some text at getIndex().
   1.455 +   * The getIndex() is not changed.
   1.456 +   *
   1.457 +   * @return the current normalized code point
   1.458 +   * @stable ICU 2.0
   1.459 +   */
   1.460 +  UChar32              current(void);
   1.461 +
   1.462 +  /**
   1.463 +   * Return the first character in the normalized text.
   1.464 +   * This is equivalent to setIndexOnly(startIndex()) followed by next().
   1.465 +   * (Post-increment semantics.)
   1.466 +   *
   1.467 +   * @return the first normalized code point
   1.468 +   * @stable ICU 2.0
   1.469 +   */
   1.470 +  UChar32              first(void);
   1.471 +
   1.472 +  /**
   1.473 +   * Return the last character in the normalized text.
   1.474 +   * This is equivalent to setIndexOnly(endIndex()) followed by previous().
   1.475 +   * (Pre-decrement semantics.)
   1.476 +   *
   1.477 +   * @return the last normalized code point
   1.478 +   * @stable ICU 2.0
   1.479 +   */
   1.480 +  UChar32              last(void);
   1.481 +
   1.482 +  /**
   1.483 +   * Return the next character in the normalized text.
   1.484 +   * (Post-increment semantics.)
   1.485 +   * If the end of the text has already been reached, DONE is returned.
   1.486 +   * The DONE value could be confused with a U+FFFF non-character code point
   1.487 +   * in the text. If this is possible, you can test getIndex()<endIndex()
   1.488 +   * before calling next(), or (getIndex()<endIndex() || last()!=DONE)
   1.489 +   * after calling next(). (Calling last() will change the iterator state!)
   1.490 +   *
   1.491 +   * The C API unorm_next() is more efficient and does not have this ambiguity.
   1.492 +   *
   1.493 +   * @return the next normalized code point
   1.494 +   * @stable ICU 2.0
   1.495 +   */
   1.496 +  UChar32              next(void);
   1.497 +
   1.498 +  /**
   1.499 +   * Return the previous character in the normalized text and decrement.
   1.500 +   * (Pre-decrement semantics.)
   1.501 +   * If the beginning of the text has already been reached, DONE is returned.
   1.502 +   * The DONE value could be confused with a U+FFFF non-character code point
   1.503 +   * in the text. If this is possible, you can test
   1.504 +   * (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
   1.505 +   * the iterator state!)
   1.506 +   *
   1.507 +   * The C API unorm_previous() is more efficient and does not have this ambiguity.
   1.508 +   *
   1.509 +   * @return the previous normalized code point
   1.510 +   * @stable ICU 2.0
   1.511 +   */
   1.512 +  UChar32              previous(void);
   1.513 +
   1.514 +  /**
   1.515 +   * Set the iteration position in the input text that is being normalized,
   1.516 +   * without any immediate normalization.
   1.517 +   * After setIndexOnly(), getIndex() will return the same index that is
   1.518 +   * specified here.
   1.519 +   *
   1.520 +   * @param index the desired index in the input text.
   1.521 +   * @stable ICU 2.0
   1.522 +   */
   1.523 +  void                 setIndexOnly(int32_t index);
   1.524 +
   1.525 +  /**
   1.526 +   * Reset the index to the beginning of the text.
   1.527 +   * This is equivalent to setIndexOnly(startIndex()).
   1.528 +   * @stable ICU 2.0
   1.529 +   */
   1.530 +  void                reset(void);
   1.531 +
   1.532 +  /**
   1.533 +   * Retrieve the current iteration position in the input text that is
   1.534 +   * being normalized.
   1.535 +   *
   1.536 +   * A following call to next() will return a normalized code point from
   1.537 +   * the input text at or after this index.
   1.538 +   *
   1.539 +   * After a call to previous(), getIndex() will point at or before the
   1.540 +   * position in the input text where the normalized code point
   1.541 +   * was returned from with previous().
   1.542 +   *
   1.543 +   * @return the current index in the input text
   1.544 +   * @stable ICU 2.0
   1.545 +   */
   1.546 +  int32_t            getIndex(void) const;
   1.547 +
   1.548 +  /**
   1.549 +   * Retrieve the index of the start of the input text. This is the begin index
   1.550 +   * of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
   1.551 +   * over which this <code>Normalizer</code> is iterating.
   1.552 +   *
   1.553 +   * @return the smallest index in the input text where the Normalizer operates
   1.554 +   * @stable ICU 2.0
   1.555 +   */
   1.556 +  int32_t            startIndex(void) const;
   1.557 +
   1.558 +  /**
   1.559 +   * Retrieve the index of the end of the input text. This is the end index
   1.560 +   * of the <code>CharacterIterator</code> or the length of the string
   1.561 +   * over which this <code>Normalizer</code> is iterating.
   1.562 +   * This end index is exclusive, i.e., the Normalizer operates only on characters
   1.563 +   * before this index.
   1.564 +   *
   1.565 +   * @return the first index in the input text where the Normalizer does not operate
   1.566 +   * @stable ICU 2.0
   1.567 +   */
   1.568 +  int32_t            endIndex(void) const;
   1.569 +
   1.570 +  /**
   1.571 +   * Returns TRUE when both iterators refer to the same character in the same
   1.572 +   * input text.
   1.573 +   *
   1.574 +   * @param that a Normalizer object to compare this one to
   1.575 +   * @return comparison result
   1.576 +   * @stable ICU 2.0
   1.577 +   */
   1.578 +  UBool        operator==(const Normalizer& that) const;
   1.579 +
   1.580 +  /**
   1.581 +   * Returns FALSE when both iterators refer to the same character in the same
   1.582 +   * input text.
   1.583 +   *
   1.584 +   * @param that a Normalizer object to compare this one to
   1.585 +   * @return comparison result
   1.586 +   * @stable ICU 2.0
   1.587 +   */
   1.588 +  inline UBool        operator!=(const Normalizer& that) const;
   1.589 +
   1.590 +  /**
   1.591 +   * Returns a pointer to a new Normalizer that is a clone of this one.
   1.592 +   * The caller is responsible for deleting the new clone.
   1.593 +   * @return a pointer to a new Normalizer
   1.594 +   * @stable ICU 2.0
   1.595 +   */
   1.596 +  Normalizer*        clone(void) const;
   1.597 +
   1.598 +  /**
   1.599 +   * Generates a hash code for this iterator.
   1.600 +   *
   1.601 +   * @return the hash code
   1.602 +   * @stable ICU 2.0
   1.603 +   */
   1.604 +  int32_t                hashCode(void) const;
   1.605 +
   1.606 +  //-------------------------------------------------------------------------
   1.607 +  // Property access methods
   1.608 +  //-------------------------------------------------------------------------
   1.609 +
   1.610 +  /**
   1.611 +   * Set the normalization mode for this object.
   1.612 +   * <p>
   1.613 +   * <b>Note:</b> If the normalization mode is changed while iterating
   1.614 +   * over a string, calls to {@link #next() } and {@link #previous() } may
   1.615 +   * return previously buffered characters in the old normalization mode
   1.616 +   * until the iteration is able to re-sync at the next base character.
   1.617 +   * It is safest to call {@link #setIndexOnly }, {@link #reset() },
   1.618 +   * {@link #setText }, {@link #first() },
   1.619 +   * {@link #last() }, etc. after calling <code>setMode</code>.
   1.620 +   * <p>
   1.621 +   * @param newMode the new mode for this <code>Normalizer</code>.
   1.622 +   * @see #getUMode
   1.623 +   * @stable ICU 2.0
   1.624 +   */
   1.625 +  void setMode(UNormalizationMode newMode);
   1.626 +
   1.627 +  /**
   1.628 +   * Return the normalization mode for this object.
   1.629 +   *
   1.630 +   * This is an unusual name because there used to be a getMode() that
   1.631 +   * returned a different type.
   1.632 +   *
   1.633 +   * @return the mode for this <code>Normalizer</code>
   1.634 +   * @see #setMode
   1.635 +   * @stable ICU 2.0
   1.636 +   */
   1.637 +  UNormalizationMode getUMode(void) const;
   1.638 +
   1.639 +  /**
   1.640 +   * Set options that affect this <code>Normalizer</code>'s operation.
   1.641 +   * Options do not change the basic composition or decomposition operation
   1.642 +   * that is being performed, but they control whether
   1.643 +   * certain optional portions of the operation are done.
   1.644 +   * Currently the only available option is obsolete.
   1.645 +   *
   1.646 +   * Several options may be specified at once; all of them get the same setting.
   1.647 +   *
   1.648 +   * @param   option  the option(s) whose value is/are to be set.
   1.649 +   * @param   value   the new setting for the option(s).  Use <code>TRUE</code> to
   1.650 +   *                  turn the option(s) on and <code>FALSE</code> to turn it/them off.
   1.651 +   *
   1.652 +   * @see #getOption
   1.653 +   * @stable ICU 2.0
   1.654 +   */
   1.655 +  void setOption(int32_t option,
   1.656 +         UBool value);
   1.657 +
   1.658 +  /**
   1.659 +   * Determine whether an option is turned on or off.
   1.660 +   * If multiple options are specified, then the result is TRUE if at least
   1.661 +   * one of them is set.
   1.662 +   *
   1.663 +   * @param option the option(s) that are to be checked
   1.664 +   * @return TRUE if at least one of the option(s) is set
   1.665 +   * @see #setOption
   1.666 +   * @stable ICU 2.0
   1.667 +   */
   1.668 +  UBool getOption(int32_t option) const;
   1.669 +
   1.670 +  /**
   1.671 +   * Set the input text (given as a UnicodeString) over which this <code>Normalizer</code> will iterate.
   1.672 +   * The iteration position is set to the beginning.
   1.673 +   *
   1.674 +   * @param newText a UnicodeString that replaces the current input text
   1.675 +   * @param status a UErrorCode, taken by non-const reference
   1.676 +   * @stable ICU 2.0
   1.677 +   */
   1.678 +  void setText(const UnicodeString& newText,
   1.679 +           UErrorCode &status);
   1.680 +
   1.681 +  /**
   1.682 +   * Set the input text (given as a CharacterIterator) over which this <code>Normalizer</code> will iterate.
   1.683 +   * The iteration position is set to the beginning.
   1.684 +   *
   1.685 +   * @param newText a CharacterIterator object that replaces the current input text
   1.686 +   * @param status a UErrorCode, taken by non-const reference
   1.687 +   * @stable ICU 2.0
   1.688 +   */
   1.689 +  void setText(const CharacterIterator& newText,
   1.690 +           UErrorCode &status);
   1.691 +
   1.692 +  /**
   1.693 +   * Set the input text (given as a UChar array) over which this <code>Normalizer</code> will iterate.
   1.694 +   * The iteration position is set to the beginning.
   1.695 +   *
   1.696 +   * @param newText a UChar string that replaces the current input text
   1.697 +   * @param length the length of the string, or -1 if NUL-terminated
   1.698 +   * @param status a UErrorCode, taken by non-const reference
   1.699 +   * @stable ICU 2.0
   1.700 +   */
   1.701 +  void setText(const UChar* newText,
   1.702 +                    int32_t length,
   1.703 +            UErrorCode &status);
   1.704 +  /**
   1.705 +   * Copies the input text into the UnicodeString argument; nothing is returned.
   1.706 +   *
   1.707 +   * @param result Output parameter: receives a copy of the text under iteration.
   1.708 +   * @stable ICU 2.0
   1.709 +   */
   1.710 +  void            getText(UnicodeString&  result);
   1.711 +
   1.712 +  /**
   1.713 +   * ICU "poor man's RTTI", returns a UClassID for this class.
   1.714 +   * @return a UClassID for this class.
   1.715 +   * @stable ICU 2.2
   1.716 +   */
   1.717 +  static UClassID U_EXPORT2 getStaticClassID();
   1.718 +
   1.719 +  /**
   1.720 +   * ICU "poor man's RTTI": virtual counterpart of getStaticClassID(), returns a UClassID for the actual class.
   1.721 +   * @return a UClassID for the actual class.
   1.722 +   * @stable ICU 2.2
   1.723 +   */
   1.724 +  virtual UClassID getDynamicClassID() const;
   1.725 +
   1.726 +private:
   1.727 +  //-------------------------------------------------------------------------
   1.728 +  // Private functions
   1.729 +  //-------------------------------------------------------------------------
   1.730 +
   1.731 +  Normalizer(); // default constructor: declared private and not implemented
   1.732 +  Normalizer &operator=(const Normalizer &that); // assignment operator: declared private and not implemented
   1.733 +
   1.734 +  // Private utility methods for iteration
   1.735 +  // For documentation, see the implementation source code
   1.736 +  UBool nextNormalize();
   1.737 +  UBool previousNormalize();
   1.738 +
   1.739 +  void    init();  // NOTE(review): presumably shared set-up of the private state -- confirm in the implementation
   1.740 +  void    clearBuffer(void);  // NOTE(review): presumably resets `buffer` and `bufferPos` -- confirm in the implementation
   1.741 +
   1.742 +  //-------------------------------------------------------------------------
   1.743 +  // Private data
   1.744 +  //-------------------------------------------------------------------------
   1.745 +
   1.746 +  FilteredNormalizer2*fFilteredNorm2;  // owned if not NULL
   1.747 +  const Normalizer2  *fNorm2;  // not owned; may be equal to fFilteredNorm2
   1.748 +  UNormalizationMode  fUMode;  // NOTE(review): presumably the mode behind setMode()/getUMode() -- confirm
   1.749 +  int32_t             fOptions;  // NOTE(review): presumably the option bits behind setOption()/getOption() -- confirm
   1.750 +
   1.751 +  // The input text and our position in it
   1.752 +  CharacterIterator  *text;
   1.753 +
   1.754 +  // The normalization buffer is the result of normalization
   1.755 +  // of the source in [currentIndex..nextIndex[ (start inclusive, end exclusive).
   1.756 +  int32_t         currentIndex, nextIndex;
   1.757 +
   1.758 +  // A buffer for holding intermediate results
   1.759 +  UnicodeString       buffer;
   1.760 +  int32_t         bufferPos;  // NOTE(review): presumably the current position within `buffer` -- confirm
   1.761 +};
   1.762 +
   1.763 +//-------------------------------------------------------------------------
   1.764 +// Inline implementations
   1.765 +//-------------------------------------------------------------------------
   1.766 +
   1.767 +inline UBool  // inequality, defined as the logical negation of operator==
   1.768 +Normalizer::operator!= (const Normalizer& other) const
   1.769 +{ return ! operator==(other); }
   1.770 +
   1.771 +inline UNormalizationCheckResult  // three-argument convenience overload
   1.772 +Normalizer::quickCheck(const UnicodeString& source,
   1.773 +                       UNormalizationMode mode,
   1.774 +                       UErrorCode &status) {
   1.775 +    return quickCheck(source, mode, 0, status);  // forwards to the four-argument overload with 0 as third argument (presumably "no options" -- confirm against its declaration)
   1.776 +}
   1.777 +
   1.778 +inline UBool  // three-argument convenience overload
   1.779 +Normalizer::isNormalized(const UnicodeString& source,
   1.780 +                         UNormalizationMode mode,
   1.781 +                         UErrorCode &status) {
   1.782 +    return isNormalized(source, mode, 0, status);  // forwards to the four-argument overload with 0 as third argument (presumably "no options" -- confirm against its declaration)
   1.783 +}
   1.784 +
   1.785 +inline int32_t  // returns unorm_compare()'s result unchanged
   1.786 +Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
   1.787 +                    uint32_t options,
   1.788 +                    UErrorCode &errorCode) {
   1.789 +  // Thin wrapper: all argument checking is done in unorm_compare
   1.790 +  return unorm_compare(s1.getBuffer(), s1.length(),  // s1 as buffer pointer + length
   1.791 +                       s2.getBuffer(), s2.length(),  // s2 likewise
   1.792 +                       options,
   1.793 +                       &errorCode);  // unorm_compare receives the error code by address
   1.794 +}
   1.795 +
   1.796 +U_NAMESPACE_END
   1.797 +
   1.798 +#endif /* #if !UCONFIG_NO_NORMALIZATION */
   1.799 +
   1.800 +#endif // NORMLZR_H
The Tor Browser / file diff

diff: intl/icu/source/common/unicode/normlzr.h

intl/icu/source/common/unicode/normlzr.h