intl/icu/source/common/unicode/unorm.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/unicode/unorm.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,561 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +* Copyright (c) 1996-2010, International Business Machines Corporation
     1.7 +*               and others. All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +* File unorm.h
    1.10 +*
    1.11 +* Created by: Vladimir Weinstein 12052000
    1.12 +*
    1.13 +* Modification history :
    1.14 +*
    1.15 +* Date        Name        Description
    1.16 +* 02/01/01    synwee      Added normalization quickcheck enum and method.
    1.17 +*/
    1.18 +#ifndef UNORM_H
    1.19 +#define UNORM_H
    1.20 +
    1.21 +#include "unicode/utypes.h"
    1.22 +
    1.23 +#if !UCONFIG_NO_NORMALIZATION
    1.24 +
    1.25 +#include "unicode/uiter.h"
    1.26 +#include "unicode/unorm2.h"
    1.27 +
    1.28 +/**
    1.29 + * \file
    1.30 + * \brief C API: Unicode Normalization 
    1.31 + *
    1.32 + * <h2>Unicode normalization API</h2>
    1.33 + *
    1.34 + * Note: This API has been replaced by the unorm2.h API and is only available
    1.35 + * for backward compatibility. The functions here simply delegate to the
    1.36 + * unorm2.h functions, for example unorm2_getInstance() and unorm2_normalize().
    1.37 + * There is one exception: The new API does not provide a replacement for unorm_compare().
    1.38 + *
    1.39 + * <code>unorm_normalize</code> transforms Unicode text into an equivalent composed or
    1.40 + * decomposed form, allowing for easier sorting and searching of text.
    1.41 + * <code>unorm_normalize</code> supports the standard normalization forms described in
    1.42 + * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
    1.43 + * Unicode Standard Annex #15: Unicode Normalization Forms</a>.
    1.44 + *
    1.45 + * Characters with accents or other adornments can be encoded in
    1.46 + * several different ways in Unicode.  For example, take the character A-acute.
    1.47 + * In Unicode, this can be encoded as a single character (the
    1.48 + * "composed" form):
    1.49 + *
    1.50 + * \code
    1.51 + *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
    1.52 + * \endcode
    1.53 + *
    1.54 + * or as two separate characters (the "decomposed" form):
    1.55 + *
    1.56 + * \code
    1.57 + *      0041    LATIN CAPITAL LETTER A
    1.58 + *      0301    COMBINING ACUTE ACCENT
    1.59 + * \endcode
    1.60 + *
    1.61 + * To a user of your program, however, both of these sequences should be
    1.62 + * treated as the same "user-level" character "A with acute accent".  When you are searching or
    1.63 + * comparing text, you must ensure that these two sequences are treated 
    1.64 + * equivalently.  In addition, you must handle characters with more than one
    1.65 + * accent.  Sometimes the order of a character's combining accents is
    1.66 + * significant, while in other cases accent sequences in different orders are
    1.67 + * really equivalent.
    1.68 + *
    1.69 + * Similarly, the string "ffi" can be encoded as three separate letters:
    1.70 + *
    1.71 + * \code
    1.72 + *      0066    LATIN SMALL LETTER F
    1.73 + *      0066    LATIN SMALL LETTER F
    1.74 + *      0069    LATIN SMALL LETTER I
    1.75 + * \endcode
    1.76 + *
    1.77 + * or as the single character
    1.78 + *
    1.79 + * \code
    1.80 + *      FB03    LATIN SMALL LIGATURE FFI
    1.81 + * \endcode
    1.82 + *
    1.83 + * The ffi ligature is not a distinct semantic character, and strictly speaking
    1.84 + * it shouldn't be in Unicode at all, but it was included for compatibility
    1.85 + * with existing character sets that already provided it.  The Unicode standard
    1.86 + * identifies such characters by giving them "compatibility" decompositions
    1.87 + * into the corresponding semantic characters.  When sorting and searching, you
    1.88 + * will often want to use these mappings.
    1.89 + *
    1.90 + * <code>unorm_normalize</code> helps solve these problems by transforming text into the
    1.91 + * canonical composed and decomposed forms as shown in the first example above.  
    1.92 + * In addition, you can have it perform compatibility decompositions so that 
    1.93 + * you can treat compatibility characters the same as their equivalents.
    1.94 + * Finally, <code>unorm_normalize</code> rearranges accents into the proper canonical
    1.95 + * order, so that you do not have to worry about accent rearrangement on your
    1.96 + * own.
    1.97 + *
    1.98 + * Form FCD, "Fast C or D", is also designed for collation.
    1.99 + * It allows working on strings that are not necessarily normalized
   1.100 + * with an algorithm (like in collation) that works under "canonical closure", i.e., it treats precomposed
   1.101 + * characters and their decomposed equivalents the same.
   1.102 + *
   1.103 + * It is not a normalization form because it does not provide for uniqueness of representation. Multiple strings
   1.104 + * may be canonically equivalent (their NFDs are identical) and may all conform to FCD without being identical
   1.105 + * themselves.
   1.106 + *
   1.107 + * The form is defined such that the "raw decomposition", the recursive canonical decomposition of each character,
   1.108 + * results in a string that is canonically ordered. This means that precomposed characters are allowed as long
   1.109 + * as their decompositions do not need canonical reordering.
   1.110 + *
   1.111 + * Its advantage for a process like collation is that all NFD and most NFC texts - and many unnormalized texts -
   1.112 + * already conform to FCD and do not need to be normalized (NFD) for such a process. The FCD quick check will
   1.113 + * return UNORM_YES for most strings in practice.
   1.114 + *
   1.115 + * unorm_normalize(UNORM_FCD) may be implemented with UNORM_NFD.
   1.116 + *
   1.117 + * For more details on FCD see the collation design document:
   1.118 + * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
   1.119 + *
   1.120 + * ICU collation performs either NFD or FCD normalization automatically if normalization
   1.121 + * is turned on for the collator object.
   1.122 + * Beyond collation and string search, normalized strings may be useful for string equivalence comparisons,
   1.123 + * transliteration/transcription, unique representations, etc.
   1.124 + *
   1.125 + * The W3C generally recommends exchanging text in NFC.
   1.126 + * Note also that most legacy character encodings use only precomposed forms and often do not
   1.127 + * encode any combining marks by themselves. For conversion to such character encodings the
   1.128 + * Unicode text needs to be normalized to NFC.
   1.129 + * For more usage examples, see Unicode Standard Annex #15.
   1.130 + */
   1.131 +
   1.132 +/**
   1.133 + * Constants for normalization modes (the values start at 1, not 0).
   1.134 + * @stable ICU 2.0
   1.135 + */
   1.136 +typedef enum {
   1.137 +  /** No decomposition/composition. @stable ICU 2.0 */
   1.138 +  UNORM_NONE = 1, 
   1.139 +  /** Canonical decomposition. @stable ICU 2.0 */
   1.140 +  UNORM_NFD = 2,
   1.141 +  /** Compatibility decomposition. @stable ICU 2.0 */
   1.142 +  UNORM_NFKD = 3,
   1.143 +  /** Canonical decomposition followed by canonical composition. @stable ICU 2.0 */
   1.144 +  UNORM_NFC = 4,
   1.145 +  /** Default normalization; an alias for UNORM_NFC. @stable ICU 2.0 */
   1.146 +  UNORM_DEFAULT = UNORM_NFC, 
   1.147 +  /** Compatibility decomposition followed by canonical composition. @stable ICU 2.0 */
   1.148 +  UNORM_NFKC =5,
   1.149 +  /** "Fast C or D" form. @stable ICU 2.0 */
   1.150 +  UNORM_FCD = 6,
   1.151 +
   1.152 +  /** One more than the highest normalization mode constant. @stable ICU 2.0 */
   1.153 +  UNORM_MODE_COUNT
   1.154 +} UNormalizationMode;
   1.155 +
   1.156 +/**
   1.157 + * Option bit flags for the options parameters of the normalization functions.
   1.158 + * Use 0 for default options,
   1.159 + * including normalization according to the Unicode version
   1.160 + * that is currently supported by ICU (see u_getUnicodeVersion).
   1.161 + * @stable ICU 2.6
   1.162 + */
   1.163 +enum {
   1.164 +    /**
   1.165 +     * Options bit set value to select Unicode 3.2 normalization
   1.166 +     * (except NormalizationCorrections).
   1.167 +     * At most one Unicode version can be selected at a time.
   1.168 +     * @stable ICU 2.6
   1.169 +     */
   1.170 +    UNORM_UNICODE_3_2=0x20
   1.171 +};
   1.172 +
   1.173 +/**
   1.174 + * Bit number of the lowest-order bit in the unorm_compare() options that
   1.175 + * corresponds to the normalization options bits.
   1.176 + *
   1.177 + * The options parameter for unorm_compare() uses most bits for
   1.178 + * itself and for various comparison and folding flags.
   1.179 + * The most significant bits, however, are shifted down and passed on
   1.180 + * to the normalization implementation.
   1.181 + * (That is, from unorm_compare(..., options, ...),
   1.182 + * options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT will be passed on to the internal
   1.183 + * normalization functions, so callers shift normalization options left by it.)
   1.184 + *
   1.185 + * @see unorm_compare
   1.186 + * @stable ICU 2.6
   1.187 + */
   1.188 +#define UNORM_COMPARE_NORM_OPTIONS_SHIFT 20
   1.189 +
   1.190 +/**
   1.191 + * Normalize a string.
   1.192 + * The string will be normalized according to the specified normalization mode
   1.193 + * and options.
   1.194 + * The source and result buffers must not be the same, nor overlap.
   1.195 + *
   1.196 + * @param source The string to normalize.
   1.197 + * @param sourceLength The length of source, or -1 if NUL-terminated.
   1.198 + * @param mode The normalization mode; one of UNORM_NONE, UNORM_NFD,
   1.199 + *             UNORM_NFC, UNORM_NFKC, UNORM_NFKD, UNORM_FCD, UNORM_DEFAULT.
   1.200 + * @param options The normalization options, ORed together (0 for no options).
   1.201 + * @param result A pointer to a buffer to receive the result string.
   1.202 + *               The result string is NUL-terminated if possible.
   1.203 + * @param resultLength The maximum size of result.
   1.204 + * @param status A pointer to a UErrorCode to receive any errors.
   1.205 + * @return The total buffer size needed; if greater than resultLength,
   1.206 + *         the output was truncated, and the error code is set to U_BUFFER_OVERFLOW_ERROR.
   1.207 + * @stable ICU 2.0
   1.208 + */
   1.209 +U_STABLE int32_t U_EXPORT2 
   1.210 +unorm_normalize(const UChar *source, int32_t sourceLength,
   1.211 +                UNormalizationMode mode, int32_t options,
   1.212 +                UChar *result, int32_t resultLength,
   1.213 +                UErrorCode *status);
   1.214 +
   1.215 +/**
   1.216 + * Performs a quick check on a string, to quickly determine if the string is
   1.217 + * in a particular normalization form.
   1.218 + * Three types of result can be returned: UNORM_YES, UNORM_NO or
   1.219 + * UNORM_MAYBE. Result UNORM_YES indicates that the argument
   1.220 + * string is in the desired normalization form; UNORM_NO indicates that the
   1.221 + * argument string is not in the desired normalization form. A
   1.222 + * UNORM_MAYBE result indicates that a more thorough check is required;
   1.223 + * the user may have to put the string in its normalized form and compare the
   1.224 + * results.
   1.225 + *
   1.226 + * @param source       string to be tested for being in the given normalization form
   1.227 + * @param sourcelength length of source to test, or -1 if NUL-terminated
   1.228 + * @param mode         which normalization form to test for
   1.229 + * @param status       a pointer to a UErrorCode to receive any errors
   1.230 + * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
   1.231 + *
   1.232 + * @see unorm_isNormalized
   1.233 + * @stable ICU 2.0
   1.234 + */
   1.235 +U_STABLE UNormalizationCheckResult U_EXPORT2
   1.236 +unorm_quickCheck(const UChar *source, int32_t sourcelength,
   1.237 +                 UNormalizationMode mode,
   1.238 +                 UErrorCode *status);
   1.239 +
   1.240 +/**
   1.241 + * Performs a quick check on a string; same as unorm_quickCheck() but
   1.242 + * takes an extra options parameter like most normalization functions.
   1.243 + *
   1.244 + * @param src        String to be tested for being in the given normalization form.
   1.245 + * @param srcLength  Length of source to test, or -1 if NUL-terminated.
   1.246 + * @param mode       Which normalization form to test for.
   1.247 + * @param options    The normalization options, ORed together (0 for no options).
   1.248 + * @param pErrorCode ICU error code in/out parameter.
   1.249 + *                   Must fulfill U_SUCCESS before the function call.
   1.250 + * @return UNORM_YES, UNORM_NO or UNORM_MAYBE
   1.251 + *
   1.252 + * @see unorm_quickCheck
   1.253 + * @see unorm_isNormalized
   1.254 + * @stable ICU 2.6
   1.255 + */
   1.256 +U_STABLE UNormalizationCheckResult U_EXPORT2
   1.257 +unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength, 
   1.258 +                            UNormalizationMode mode, int32_t options,
   1.259 +                            UErrorCode *pErrorCode);
   1.260 +
   1.261 +/**
   1.262 + * Test if a string is in a given normalization form.
   1.263 + * This is semantically equivalent to source.equals(normalize(source, mode)).
   1.264 + *
   1.265 + * Unlike unorm_quickCheck(), this function returns a definitive result,
   1.266 + * never a "maybe".
   1.267 + * For NFD, NFKD, and FCD, both functions work exactly the same.
   1.268 + * For NFC and NFKC where quickCheck may return "maybe", this function will
   1.269 + * perform further tests to arrive at a TRUE/FALSE result.
   1.270 + *
   1.271 + * @param src        String to be tested for being in the given normalization form.
   1.272 + * @param srcLength  Length of source to test, or -1 if NUL-terminated.
   1.273 + * @param mode       Which normalization form to test for.
   1.274 + * @param pErrorCode ICU error code in/out parameter.
   1.275 + *                   Must fulfill U_SUCCESS before the function call.
   1.276 + * @return Boolean value indicating whether the source string is in the
   1.277 + *         "mode" normalization form.
   1.278 + *
   1.279 + * @see unorm_quickCheck
   1.280 + * @stable ICU 2.2
   1.281 + */
   1.282 +U_STABLE UBool U_EXPORT2
   1.283 +unorm_isNormalized(const UChar *src, int32_t srcLength,
   1.284 +                   UNormalizationMode mode,
   1.285 +                   UErrorCode *pErrorCode);
   1.286 +
   1.287 +/**
   1.288 + * Test if a string is in a given normalization form; same as unorm_isNormalized() but
   1.289 + * takes an extra options parameter like most normalization functions.
   1.290 + *
   1.291 + * @param src        String to be tested for being in the given normalization form.
   1.292 + * @param srcLength  Length of source to test, or -1 if NUL-terminated.
   1.293 + * @param mode       Which normalization form to test for.
   1.294 + * @param options    The normalization options, ORed together (0 for no options).
   1.295 + * @param pErrorCode ICU error code in/out parameter.
   1.296 + *                   Must fulfill U_SUCCESS before the function call.
   1.297 + * @return Boolean value indicating whether the source string is in the
   1.298 + *         "mode/options" normalization form.
   1.299 + *
   1.300 + * @see unorm_quickCheck
   1.301 + * @see unorm_isNormalized
   1.302 + * @stable ICU 2.6
   1.303 + */
   1.304 +U_STABLE UBool U_EXPORT2
   1.305 +unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
   1.306 +                              UNormalizationMode mode, int32_t options,
   1.307 +                              UErrorCode *pErrorCode);
   1.308 +
   1.309 +/**
   1.310 + * Iterative normalization forward.
   1.311 + * This function (together with unorm_previous) is somewhat
   1.312 + * similar to the C++ Normalizer class (see its non-static functions).
   1.313 + *
   1.314 + * Iterative normalization is useful when only a small portion of a longer
   1.315 + * string/text needs to be processed.
   1.316 + *
   1.317 + * For example, the likelihood may be high that processing the first 10% of some
   1.318 + * text will be sufficient to find certain data.
   1.319 + * Another example: When one wants to concatenate two normalized strings and get a
   1.320 + * normalized result, it is much more efficient to normalize just a small part of
   1.321 + * the result around the concatenation place instead of re-normalizing everything.
   1.322 + *
   1.323 + * The input text is an instance of the C character iteration API UCharIterator.
   1.324 + * It may wrap around a simple string, a CharacterIterator, a Replaceable, or any
   1.325 + * other kind of text object.
   1.326 + *
   1.327 + * If a buffer overflow occurs, then the caller needs to reset the iterator to the
   1.328 + * old index and call the function again with a larger buffer - if the caller cares
   1.329 + * for the actual output.
   1.330 + * Regardless of the output buffer, the iterator will always be moved to the next
   1.331 + * normalization boundary.
   1.332 + *
   1.333 + * This function (like unorm_previous) serves two purposes:
   1.334 + *
   1.335 + * 1) To find the next boundary so that the normalization of the part of the text
   1.336 + * from the current position to that boundary does not affect and is not affected
   1.337 + * by the part of the text beyond that boundary.
   1.338 + *
   1.339 + * 2) To normalize the text up to the boundary.
   1.340 + *
   1.341 + * The second step is optional, per the doNormalize parameter.
   1.342 + * It is omitted for operations like string concatenation, where the two adjacent
   1.343 + * string ends need to be normalized together.
   1.344 + * In such a case, the output buffer will just contain a copy of the text up to the
   1.345 + * boundary.
   1.346 + *
   1.347 + * pNeededToNormalize is an output-only parameter. Its output value is only defined
   1.348 + * if normalization was requested (doNormalize) and successful (especially, no
   1.349 + * buffer overflow).
   1.350 + * It is useful for operations like a normalizing transliterator, where one would
   1.351 + * not want to replace a piece of text if it is not modified.
   1.352 + *
   1.353 + * If doNormalize==TRUE and pNeededToNormalize!=NULL then *pNeededToNormalize is
   1.354 + * set to TRUE if the normalization was necessary.
   1.355 + *
   1.356 + * If doNormalize==FALSE then *pNeededToNormalize will be set to FALSE.
   1.357 + *
   1.358 + * If the buffer overflows, then *pNeededToNormalize will be undefined;
   1.359 + * essentially, whenever U_FAILURE is true (like in buffer overflows), this result
   1.360 + * will be undefined.
   1.361 + *
   1.362 + * @param src The input text in the form of a C character iterator.
   1.363 + * @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
   1.364 + * @param destCapacity The number of UChars that fit into dest.
   1.365 + * @param mode The normalization mode.
   1.366 + * @param options The normalization options, ORed together (0 for no options).
   1.367 + * @param doNormalize Indicates if the source text up to the next boundary
   1.368 + *                    is to be normalized (TRUE) or just copied (FALSE).
   1.369 + * @param pNeededToNormalize Output flag indicating if the normalization resulted in
   1.370 + *                           different text from the input.
   1.371 + *                           Not defined if an error occurs including buffer overflow.
   1.372 + *                           Always FALSE if !doNormalize.
   1.373 + * @param pErrorCode ICU error code in/out parameter.
   1.374 + *                   Must fulfill U_SUCCESS before the function call.
   1.375 + * @return Length of the output (number of UChars) on success or on buffer overflow.
   1.376 + *
   1.377 + * @see unorm_previous
   1.378 + * @see unorm_normalize
   1.379 + *
   1.380 + * @stable ICU 2.1
   1.381 + */
   1.382 +U_STABLE int32_t U_EXPORT2
   1.383 +unorm_next(UCharIterator *src,
   1.384 +           UChar *dest, int32_t destCapacity,
   1.385 +           UNormalizationMode mode, int32_t options,
   1.386 +           UBool doNormalize, UBool *pNeededToNormalize,
   1.387 +           UErrorCode *pErrorCode);
   1.388 +
   1.389 +/**
   1.390 + * Iterative normalization backward.
   1.391 + * This function (together with unorm_next) is somewhat
   1.392 + * similar to the C++ Normalizer class (see its non-static functions).
   1.393 + * For all details see unorm_next().
   1.394 + *
   1.395 + * @param src The input text in the form of a C character iterator.
   1.396 + * @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
   1.397 + * @param destCapacity The number of UChars that fit into dest.
   1.398 + * @param mode The normalization mode.
   1.399 + * @param options The normalization options, ORed together (0 for no options).
   1.400 + * @param doNormalize Indicates if the source text up to the next boundary
   1.401 + *                    is to be normalized (TRUE) or just copied (FALSE).
   1.402 + * @param pNeededToNormalize Output flag indicating if the normalization resulted in
   1.403 + *                           different text from the input.
   1.404 + *                           Not defined if an error occurs including buffer overflow.
   1.405 + *                           Always FALSE if !doNormalize.
   1.406 + * @param pErrorCode ICU error code in/out parameter.
   1.407 + *                   Must fulfill U_SUCCESS before the function call.
   1.408 + * @return Length of the output (number of UChars) on success or on buffer overflow.
   1.409 + *
   1.410 + * @see unorm_next
   1.411 + * @see unorm_normalize
   1.412 + *
   1.413 + * @stable ICU 2.1
   1.414 + */
   1.415 +U_STABLE int32_t U_EXPORT2
   1.416 +unorm_previous(UCharIterator *src,
   1.417 +               UChar *dest, int32_t destCapacity,
   1.418 +               UNormalizationMode mode, int32_t options,
   1.419 +               UBool doNormalize, UBool *pNeededToNormalize,
   1.420 +               UErrorCode *pErrorCode);
   1.421 +
   1.422 +/**
   1.423 + * Concatenate normalized strings, making sure that the result is normalized as well.
   1.424 + *
   1.425 + * If both the left and the right strings are in
   1.426 + * the normalization form according to "mode/options",
   1.427 + * then the result will be
   1.428 + *
   1.429 + * \code
   1.430 + *     dest=normalize(left+right, mode, options)
   1.431 + * \endcode
   1.432 + *
   1.433 + * With the input strings already being normalized,
   1.434 + * this function will use unorm_next() and unorm_previous()
   1.435 + * to find the adjacent end pieces of the input strings.
   1.436 + * Only the concatenation of these end pieces will be normalized and
   1.437 + * then concatenated with the remaining parts of the input strings.
   1.438 + *
   1.439 + * It is allowed to have dest==left to avoid copying the entire left string.
   1.440 + *
   1.441 + * @param left Left source string, may be same as dest.
   1.442 + * @param leftLength Length of left source string, or -1 if NUL-terminated.
   1.443 + * @param right Right source string. Must not be the same as dest, nor overlap.
   1.444 + * @param rightLength Length of right source string, or -1 if NUL-terminated.
   1.445 + * @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
   1.446 + * @param destCapacity The number of UChars that fit into dest.
   1.447 + * @param mode The normalization mode.
   1.448 + * @param options The normalization options, ORed together (0 for no options).
   1.449 + * @param pErrorCode ICU error code in/out parameter.
   1.450 + *                   Must fulfill U_SUCCESS before the function call.
   1.451 + * @return Length of the output (number of UChars) on success or on buffer overflow.
   1.452 + *
   1.453 + * @see unorm_normalize
   1.454 + * @see unorm_next
   1.455 + * @see unorm_previous
   1.456 + *
   1.457 + * @stable ICU 2.1
   1.458 + */
   1.459 +U_STABLE int32_t U_EXPORT2
   1.460 +unorm_concatenate(const UChar *left, int32_t leftLength,
   1.461 +                  const UChar *right, int32_t rightLength,
   1.462 +                  UChar *dest, int32_t destCapacity,
   1.463 +                  UNormalizationMode mode, int32_t options,
   1.464 +                  UErrorCode *pErrorCode);
   1.465 +
   1.466 +/**
   1.467 + * Option bit for unorm_compare:
   1.468 + * Both input strings are assumed to fulfill the FCD conditions.
   1.469 + * @stable ICU 2.2
   1.470 + */
   1.471 +#define UNORM_INPUT_IS_FCD          0x20000
   1.472 +
   1.473 +/**
   1.474 + * Option bit for unorm_compare:
   1.475 + * Perform case-insensitive comparison (using case folding).
   1.476 + * @stable ICU 2.2
   1.477 + */
   1.478 +#define U_COMPARE_IGNORE_CASE       0x10000
   1.479 +
   1.480 +#ifndef U_COMPARE_CODE_POINT_ORDER
   1.481 +/* see also unistr.h and ustring.h */
   1.482 +/**
   1.483 + * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc:
   1.484 + * Compare strings in code point order instead of code unit order.
   1.485 + * @stable ICU 2.2
   1.486 + */
   1.487 +#define U_COMPARE_CODE_POINT_ORDER  0x8000
   1.488 +#endif
   1.489 +
   1.490 +/**
   1.491 + * Compare two strings for canonical equivalence.
   1.492 + * Further options include case-insensitive comparison and
   1.493 + * code point order (as opposed to code unit order).
   1.494 + *
   1.495 + * Canonical equivalence between two strings is defined as their normalized
   1.496 + * forms (NFD or NFC) being identical.
   1.497 + * This function compares strings incrementally instead of normalizing
   1.498 + * (and optionally case-folding) both strings entirely,
   1.499 + * improving performance significantly.
   1.500 + *
   1.501 + * Bulk normalization is only necessary if the strings do not fulfill the FCD
   1.502 + * conditions. Only in this case, and only if the strings are relatively long,
   1.503 + * is memory allocated temporarily.
   1.504 + * For FCD strings and short non-FCD strings there is no memory allocation.
   1.505 + *
   1.506 + * Semantically, this is equivalent to
   1.507 + *   strcmp[CodePointOrder](NFD(foldCase(NFD(s1))), NFD(foldCase(NFD(s2))))
   1.508 + * where code point order and foldCase are both optional.
   1.509 + *
   1.510 + * UAX #21 (section 2.5, Caseless Matching) specifies that for a canonical caseless match
   1.511 + * the case folding must be performed first, then the normalization.
   1.512 + *
   1.513 + * @param s1 First source string.
   1.514 + * @param length1 Length of first source string, or -1 if NUL-terminated.
   1.515 + *
   1.516 + * @param s2 Second source string.
   1.517 + * @param length2 Length of second source string, or -1 if NUL-terminated.
   1.518 + *
   1.519 + * @param options A bit set of options:
   1.520 + *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
   1.521 + *     Case-sensitive comparison in code unit order, and the input strings
   1.522 + *     are quick-checked for FCD.
   1.523 + *
   1.524 + *   - UNORM_INPUT_IS_FCD
   1.525 + *     Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
   1.526 + *     If not set, the function will quickCheck for FCD
   1.527 + *     and normalize if necessary.
   1.528 + *
   1.529 + *   - U_COMPARE_CODE_POINT_ORDER
   1.530 + *     Set to choose code point order instead of code unit order
   1.531 + *     (see u_strCompare for details).
   1.532 + *
   1.533 + *   - U_COMPARE_IGNORE_CASE
   1.534 + *     Set to compare strings case-insensitively using case folding,
   1.535 + *     instead of case-sensitively.
   1.536 + *     If set, then the following case folding options are used.
   1.537 + *
   1.538 + *   - Options as used with case-insensitive comparisons, currently:
   1.539 + *
   1.540 + *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
   1.541 + *    (see u_strCaseCompare for details)
   1.542 + *
   1.543 + *   - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
   1.544 + *
   1.545 + * @param pErrorCode ICU error code in/out parameter.
   1.546 + *                   Must fulfill U_SUCCESS before the function call.
   1.547 + * @return <0 or 0 or >0 as usual for string comparisons
   1.548 + *
   1.549 + * @see unorm_normalize
   1.550 + * @see UNORM_FCD
   1.551 + * @see u_strCompare
   1.552 + * @see u_strCaseCompare
   1.553 + *
   1.554 + * @stable ICU 2.2
   1.555 + */
   1.556 +U_STABLE int32_t U_EXPORT2
   1.557 +unorm_compare(const UChar *s1, int32_t length1,
   1.558 +              const UChar *s2, int32_t length2,
   1.559 +              uint32_t options,
   1.560 +              UErrorCode *pErrorCode);
   1.561 +
   1.562 +#endif /* #if !UCONFIG_NO_NORMALIZATION */
   1.563 +
   1.564 +#endif

mercurial