intl/icu/source/common/unicode/normalizer2.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2009-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: normalizer2.h
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2009nov22
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17 #ifndef __NORMALIZER2_H__
michael@0 18 #define __NORMALIZER2_H__
michael@0 19
michael@0 20 /**
michael@0 21 * \file
michael@0 22 * \brief C++ API: New API for Unicode Normalization.
michael@0 23 */
michael@0 24
michael@0 25 #include "unicode/utypes.h"
michael@0 26
michael@0 27 #if !UCONFIG_NO_NORMALIZATION
michael@0 28
michael@0 29 #include "unicode/uniset.h"
michael@0 30 #include "unicode/unistr.h"
michael@0 31 #include "unicode/unorm2.h"
michael@0 32
michael@0 33 U_NAMESPACE_BEGIN
michael@0 34
michael@0 35 /**
michael@0 36 * Unicode normalization functionality for standard Unicode normalization or
michael@0 37 * for using custom mapping tables.
michael@0 38 * All instances of this class are unmodifiable/immutable.
michael@0 39 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
michael@0 40 * The Normalizer2 class is not intended for public subclassing.
michael@0 41 *
michael@0 42 * The primary functions are to produce a normalized string and to detect whether
michael@0 43 * a string is already normalized.
michael@0 44 * The most commonly used normalization forms are those defined in
michael@0 45 * http://www.unicode.org/unicode/reports/tr15/
michael@0 46 * However, this API supports additional normalization forms for specialized purposes.
michael@0 47 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
michael@0 48 * and can be used in implementations of UTS #46.
michael@0 49 *
michael@0 50 * Not only are the standard compose and decompose modes supplied,
michael@0 51 * but additional modes are provided as documented in the UNormalization2Mode enum.
michael@0 52 *
michael@0 53 * Some of the functions in this class identify normalization boundaries.
michael@0 54 * At a normalization boundary, the portions of the string
michael@0 55 * before it and starting from it do not interact and can be handled independently.
michael@0 56 *
michael@0 57 * The spanQuickCheckYes() stops at a normalization boundary.
michael@0 58 * When the goal is a normalized string, then the text before the boundary
michael@0 59 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
michael@0 60 *
michael@0 61 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
michael@0 62 * a character is guaranteed to be at a normalization boundary,
michael@0 63 * regardless of context.
michael@0 64 * This is used for moving from one normalization boundary to the next
michael@0 65 * or preceding boundary, and for performing iterative normalization.
michael@0 66 *
michael@0 67 * Iterative normalization is useful when only a small portion of a
michael@0 68 * longer string needs to be processed.
michael@0 69 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
michael@0 70 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
michael@0 71 * (to process only the substring for which sort key bytes are computed).
michael@0 72 *
michael@0 73 * The set of normalization boundaries returned by these functions may not be
michael@0 74 * complete: There may be more boundaries that could be returned.
michael@0 75 * Different functions may return different boundaries.
michael@0 76 * @stable ICU 4.4
michael@0 77 */
michael@0 78 class U_COMMON_API Normalizer2 : public UObject {
michael@0 79 public:
michael@0 80 /**
michael@0 81 * Destructor. Instances returned by the static factory functions below are singletons and must not be deleted by the caller.
michael@0 82 * @stable ICU 4.4
michael@0 83 */
michael@0 84 ~Normalizer2();
michael@0 85
michael@0 86 /**
michael@0 87 * Returns a Normalizer2 instance for Unicode NFC normalization.
michael@0 88 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
michael@0 89 * Returns an unmodifiable singleton instance. Do not delete it.
michael@0 90 * @param errorCode Standard ICU error code. Its input value must
michael@0 91 * pass the U_SUCCESS() test, or else the function returns
michael@0 92 * immediately. Check for U_FAILURE() on output or use with
michael@0 93 * function chaining. (See User Guide for details.)
michael@0 94 * @return the requested Normalizer2, if successful
michael@0 95 * @stable ICU 49
michael@0 96 */
michael@0 97 static const Normalizer2 *
michael@0 98 getNFCInstance(UErrorCode &errorCode);
michael@0 99
michael@0 100 /**
michael@0 101 * Returns a Normalizer2 instance for Unicode NFD normalization.
michael@0 102 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); the name "nfc" is intentional, NFD uses the "nfc" data with the decompose mode.
michael@0 103 * Returns an unmodifiable singleton instance. Do not delete it.
michael@0 104 * @param errorCode Standard ICU error code. Its input value must
michael@0 105 * pass the U_SUCCESS() test, or else the function returns
michael@0 106 * immediately. Check for U_FAILURE() on output or use with
michael@0 107 * function chaining. (See User Guide for details.)
michael@0 108 * @return the requested Normalizer2, if successful
michael@0 109 * @stable ICU 49
michael@0 110 */
michael@0 111 static const Normalizer2 *
michael@0 112 getNFDInstance(UErrorCode &errorCode);
michael@0 113
michael@0 114 /**
michael@0 115 * Returns a Normalizer2 instance for Unicode NFKC normalization.
michael@0 116 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
michael@0 117 * Returns an unmodifiable singleton instance. Do not delete it.
michael@0 118 * @param errorCode Standard ICU error code. Its input value must
michael@0 119 * pass the U_SUCCESS() test, or else the function returns
michael@0 120 * immediately. Check for U_FAILURE() on output or use with
michael@0 121 * function chaining. (See User Guide for details.)
michael@0 122 * @return the requested Normalizer2, if successful
michael@0 123 * @stable ICU 49
michael@0 124 */
michael@0 125 static const Normalizer2 *
michael@0 126 getNFKCInstance(UErrorCode &errorCode);
michael@0 127
michael@0 128 /**
michael@0 129 * Returns a Normalizer2 instance for Unicode NFKD normalization.
michael@0 130 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode); the name "nfkc" is intentional, NFKD uses the "nfkc" data with the decompose mode.
michael@0 131 * Returns an unmodifiable singleton instance. Do not delete it.
michael@0 132 * @param errorCode Standard ICU error code. Its input value must
michael@0 133 * pass the U_SUCCESS() test, or else the function returns
michael@0 134 * immediately. Check for U_FAILURE() on output or use with
michael@0 135 * function chaining. (See User Guide for details.)
michael@0 136 * @return the requested Normalizer2, if successful
michael@0 137 * @stable ICU 49
michael@0 138 */
michael@0 139 static const Normalizer2 *
michael@0 140 getNFKDInstance(UErrorCode &errorCode);
michael@0 141
michael@0 142 /**
michael@0 143 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
michael@0 144 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
michael@0 145 * Returns an unmodifiable singleton instance. Do not delete it.
michael@0 146 * @param errorCode Standard ICU error code. Its input value must
michael@0 147 * pass the U_SUCCESS() test, or else the function returns
michael@0 148 * immediately. Check for U_FAILURE() on output or use with
michael@0 149 * function chaining. (See User Guide for details.)
michael@0 150 * @return the requested Normalizer2, if successful
michael@0 151 * @stable ICU 49
michael@0 152 */
michael@0 153 static const Normalizer2 *
michael@0 154 getNFKCCasefoldInstance(UErrorCode &errorCode);
michael@0 155
michael@0 156 /**
michael@0 157 * Returns a Normalizer2 instance which uses the specified data file
michael@0 158 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
michael@0 159 * and which composes or decomposes text according to the specified mode.
michael@0 160 * Returns an unmodifiable singleton instance. Do not delete it.
michael@0 161 *
michael@0 162 * Use packageName=NULL for data files that are part of ICU's own data.
michael@0 163 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
michael@0 164 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
michael@0 165 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
michael@0 166 *
michael@0 167 * @param packageName NULL for ICU built-in data, otherwise application data package name
michael@0 168 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
michael@0 169 * @param mode normalization mode (compose or decompose etc.)
michael@0 170 * @param errorCode Standard ICU error code. Its input value must
michael@0 171 * pass the U_SUCCESS() test, or else the function returns
michael@0 172 * immediately. Check for U_FAILURE() on output or use with
michael@0 173 * function chaining. (See User Guide for details.)
michael@0 174 * @return the requested Normalizer2, if successful
michael@0 175 * @stable ICU 4.4
michael@0 176 */
michael@0 177 static const Normalizer2 *
michael@0 178 getInstance(const char *packageName,
michael@0 179 const char *name,
michael@0 180 UNormalization2Mode mode,
michael@0 181 UErrorCode &errorCode);
michael@0 182
michael@0 183 /**
michael@0 184 * Returns the normalized form of the source string. Convenience wrapper: normalizes into a local UnicodeString via the three-argument normalize() and returns that string by value.
michael@0 185 * @param src source string
michael@0 186 * @param errorCode Standard ICU error code. Its input value must
michael@0 187 * pass the U_SUCCESS() test, or else the function returns
michael@0 188 * immediately. Check for U_FAILURE() on output or use with
michael@0 189 * function chaining. (See User Guide for details.)
michael@0 190 * @return normalized src
michael@0 191 * @stable ICU 4.4
michael@0 192 */
michael@0 193 UnicodeString
michael@0 194 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
michael@0 195 UnicodeString result;
michael@0 196 normalize(src, result, errorCode); // delegate to the pure virtual three-argument overload
michael@0 197 return result;
michael@0 198 }
michael@0 199 /**
michael@0 200 * Writes the normalized form of the source string to the destination string
michael@0 201 * (replacing its contents) and returns the destination string.
michael@0 202 * The source and destination strings must be different objects.
michael@0 203 * @param src source string
michael@0 204 * @param dest destination string; its contents are replaced with normalized src
michael@0 205 * @param errorCode Standard ICU error code. Its input value must
michael@0 206 * pass the U_SUCCESS() test, or else the function returns
michael@0 207 * immediately. Check for U_FAILURE() on output or use with
michael@0 208 * function chaining. (See User Guide for details.)
michael@0 209 * @return dest
michael@0 210 * @stable ICU 4.4
michael@0 211 */
michael@0 212 virtual UnicodeString &
michael@0 213 normalize(const UnicodeString &src,
michael@0 214 UnicodeString &dest,
michael@0 215 UErrorCode &errorCode) const = 0;
michael@0 216 /**
michael@0 217 * Appends the normalized form of the second string to the first string
michael@0 218 * (merging them at the boundary) and returns the first string.
michael@0 219 * The result is normalized if the first string was normalized.
michael@0 220 * The first and second strings must be different objects.
michael@0 221 * @param first string, should be normalized
michael@0 222 * @param second string, will be normalized
michael@0 223 * @param errorCode Standard ICU error code. Its input value must
michael@0 224 * pass the U_SUCCESS() test, or else the function returns
michael@0 225 * immediately. Check for U_FAILURE() on output or use with
michael@0 226 * function chaining. (See User Guide for details.)
michael@0 227 * @return first
michael@0 228 * @stable ICU 4.4
michael@0 229 */
michael@0 230 virtual UnicodeString &
michael@0 231 normalizeSecondAndAppend(UnicodeString &first,
michael@0 232 const UnicodeString &second,
michael@0 233 UErrorCode &errorCode) const = 0;
michael@0 234 /**
michael@0 235 * Appends the second string to the first string
michael@0 236 * (merging them at the boundary) and returns the first string.
michael@0 237 * The result is normalized if both the strings were normalized.
michael@0 238 * The first and second strings must be different objects.
michael@0 239 * @param first string, should be normalized
michael@0 240 * @param second string, should be normalized
michael@0 241 * @param errorCode Standard ICU error code. Its input value must
michael@0 242 * pass the U_SUCCESS() test, or else the function returns
michael@0 243 * immediately. Check for U_FAILURE() on output or use with
michael@0 244 * function chaining. (See User Guide for details.)
michael@0 245 * @return first
michael@0 246 * @stable ICU 4.4
michael@0 247 */
michael@0 248 virtual UnicodeString &
michael@0 249 append(UnicodeString &first,
michael@0 250 const UnicodeString &second,
michael@0 251 UErrorCode &errorCode) const = 0;
michael@0 252
michael@0 253 /**
michael@0 254 * Gets the decomposition mapping of c.
michael@0 255 * Roughly equivalent to normalizing the String form of c
michael@0 256 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike normalization, this function
michael@0 257 * returns FALSE and does not write a string
michael@0 258 * if c does not have a decomposition mapping in this instance's data.
michael@0 259 * This function is independent of the mode of the Normalizer2.
michael@0 260 * @param c code point
michael@0 261 * @param decomposition String object which will be set to c's
michael@0 262 * decomposition mapping, if there is one.
michael@0 263 * @return TRUE if c has a decomposition, otherwise FALSE
michael@0 264 * @stable ICU 4.6
michael@0 265 */
michael@0 266 virtual UBool
michael@0 267 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
michael@0 268
michael@0 269 /**
michael@0 270 * Gets the raw decomposition mapping of c.
michael@0 271 *
michael@0 272 * This is similar to the getDecomposition() method but returns the
michael@0 273 * raw decomposition mapping as specified in UnicodeData.txt or
michael@0 274 * (for custom data) in the mapping files processed by the gennorm2 tool.
michael@0 275 * By contrast, getDecomposition() returns the processed,
michael@0 276 * recursively-decomposed version of this mapping.
michael@0 277 *
michael@0 278 * When used on a standard NFKC Normalizer2 instance,
michael@0 279 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
michael@0 280 *
michael@0 281 * When used on a standard NFC Normalizer2 instance,
michael@0 282 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
michael@0 283 * in this case, the result contains either one or two code points (=1..4 UChars).
michael@0 284 *
michael@0 285 * This function is independent of the mode of the Normalizer2.
michael@0 286 * The default implementation returns FALSE.
michael@0 287 * @param c code point
michael@0 288 * @param decomposition String object which will be set to c's
michael@0 289 * raw decomposition mapping, if there is one.
michael@0 290 * @return TRUE if c has a decomposition, otherwise FALSE
michael@0 291 * @stable ICU 49
michael@0 292 */
michael@0 293 virtual UBool
michael@0 294 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
michael@0 295
michael@0 296 /**
michael@0 297 * Performs pairwise composition of a & b and returns the composite if there is one.
michael@0 298 *
michael@0 299 * Returns a composite code point c only if c has a two-way mapping to a+b.
michael@0 300 * In standard Unicode normalization, this means that
michael@0 301 * c has a canonical decomposition to a+b
michael@0 302 * and c does not have the Full_Composition_Exclusion property.
michael@0 303 *
michael@0 304 * This function is independent of the mode of the Normalizer2.
michael@0 305 * The default implementation returns a negative value.
michael@0 306 * @param a A (normalization starter) code point.
michael@0 307 * @param b Another code point.
michael@0 308 * @return The non-negative composite code point if there is one; otherwise a negative value.
michael@0 309 * @stable ICU 49
michael@0 310 */
michael@0 311 virtual UChar32
michael@0 312 composePair(UChar32 a, UChar32 b) const;
michael@0 313
michael@0 314 /**
michael@0 315 * Gets the combining class of c.
michael@0 316 * The default implementation returns 0
michael@0 317 * but all standard implementations return the Unicode Canonical_Combining_Class value.
michael@0 318 * @param c code point
michael@0 319 * @return c's combining class
michael@0 320 * @stable ICU 49
michael@0 321 */
michael@0 322 virtual uint8_t
michael@0 323 getCombiningClass(UChar32 c) const;
michael@0 324
michael@0 325 /**
michael@0 326 * Tests if the string is normalized.
michael@0 327 * Internally, in cases where the quickCheck() method would return "maybe"
michael@0 328 * (which is only possible for the two COMPOSE modes) this method
michael@0 329 * resolves to "yes" or "no" to provide a definitive result,
michael@0 330 * at the cost of doing more work in those cases.
michael@0 331 * @param s input string
michael@0 332 * @param errorCode Standard ICU error code. Its input value must
michael@0 333 * pass the U_SUCCESS() test, or else the function returns
michael@0 334 * immediately. Check for U_FAILURE() on output or use with
michael@0 335 * function chaining. (See User Guide for details.)
michael@0 336 * @return TRUE if s is normalized
michael@0 337 * @stable ICU 4.4
michael@0 338 */
michael@0 339 virtual UBool
michael@0 340 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
michael@0 341
michael@0 342 /**
michael@0 343 * Tests if the string is normalized.
michael@0 344 * For the two COMPOSE modes, the result could be "maybe" in cases that
michael@0 345 * would take a little more work to resolve definitively.
michael@0 346 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
michael@0 347 * combination of quick check + normalization, to avoid
michael@0 348 * re-checking the "yes" prefix.
michael@0 349 * @param s input string
michael@0 350 * @param errorCode Standard ICU error code. Its input value must
michael@0 351 * pass the U_SUCCESS() test, or else the function returns
michael@0 352 * immediately. Check for U_FAILURE() on output or use with
michael@0 353 * function chaining. (See User Guide for details.)
michael@0 354 * @return UNormalizationCheckResult
michael@0 355 * @stable ICU 4.4
michael@0 356 */
michael@0 357 virtual UNormalizationCheckResult
michael@0 358 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
michael@0 359
michael@0 360 /**
michael@0 361 * Returns the end of the normalized substring of the input string.
michael@0 362 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
michael@0 363 * the substring <code>UnicodeString(s, 0, end)</code>
michael@0 364 * will pass the quick check with a "yes" result.
michael@0 365 *
michael@0 366 * The returned end index is usually one or more characters before the
michael@0 367 * "no" or "maybe" character: The end index is at a normalization boundary.
michael@0 368 * (See the class documentation for more about normalization boundaries.)
michael@0 369 *
michael@0 370 * When the goal is a normalized string and most input strings are expected
michael@0 371 * to be normalized already, then call this method,
michael@0 372 * and if it returns a prefix shorter than the input string,
michael@0 373 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
michael@0 374 * @param s input string
michael@0 375 * @param errorCode Standard ICU error code. Its input value must
michael@0 376 * pass the U_SUCCESS() test, or else the function returns
michael@0 377 * immediately. Check for U_FAILURE() on output or use with
michael@0 378 * function chaining. (See User Guide for details.)
michael@0 379 * @return "yes" span end index
michael@0 380 * @stable ICU 4.4
michael@0 381 */
michael@0 382 virtual int32_t
michael@0 383 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
michael@0 384
michael@0 385 /**
michael@0 386 * Tests if the character always has a normalization boundary before it,
michael@0 387 * regardless of context.
michael@0 388 * If true, then the character does not normalization-interact with
michael@0 389 * preceding characters.
michael@0 390 * In other words, a string containing this character can be normalized
michael@0 391 * by processing portions before this character and starting from this
michael@0 392 * character independently.
michael@0 393 * This is used for iterative normalization. See the class documentation for details.
michael@0 394 * @param c character to test
michael@0 395 * @return TRUE if c has a normalization boundary before it
michael@0 396 * @stable ICU 4.4
michael@0 397 */
michael@0 398 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
michael@0 399
michael@0 400 /**
michael@0 401 * Tests if the character always has a normalization boundary after it,
michael@0 402 * regardless of context.
michael@0 403 * If true, then the character does not normalization-interact with
michael@0 404 * following characters.
michael@0 405 * In other words, a string containing this character can be normalized
michael@0 406 * by processing portions up to this character and after this
michael@0 407 * character independently.
michael@0 408 * This is used for iterative normalization. See the class documentation for details.
michael@0 409 * Note that this operation may be significantly slower than hasBoundaryBefore().
michael@0 410 * @param c character to test
michael@0 411 * @return TRUE if c has a normalization boundary after it
michael@0 412 * @stable ICU 4.4
michael@0 413 */
michael@0 414 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
michael@0 415
michael@0 416 /**
michael@0 417 * Tests if the character is normalization-inert.
michael@0 418 * If true, then the character does not change, nor normalization-interact with
michael@0 419 * preceding or following characters.
michael@0 420 * In other words, a string containing this character can be normalized
michael@0 421 * by processing portions before this character and after this
michael@0 422 * character independently.
michael@0 423 * This is used for iterative normalization. See the class documentation for details.
michael@0 424 * Note that this operation may be significantly slower than hasBoundaryBefore().
michael@0 425 * @param c character to test
michael@0 426 * @return TRUE if c is normalization-inert
michael@0 427 * @stable ICU 4.4
michael@0 428 */
michael@0 429 virtual UBool isInert(UChar32 c) const = 0;
michael@0 430 };
michael@0 431
michael@0 432 /**
michael@0 433 * Normalization filtered by a UnicodeSet.
michael@0 434 * Normalizes portions of the text contained in the filter set and leaves
michael@0 435 * portions not contained in the filter set unchanged.
michael@0 436 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
michael@0 437 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
michael@0 438 * This class implements all of (and only) the Normalizer2 API.
michael@0 439 * An instance of this class is unmodifiable/immutable but is constructed and
michael@0 440 * must be destructed by the owner.
michael@0 441 * @stable ICU 4.4
michael@0 442 */
michael@0 443 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
michael@0 444 public:
michael@0 445 /**
michael@0 446 * Constructs a filtered normalizer wrapping any Normalizer2 instance
michael@0 447 * and a filter set.
michael@0 448 * Both are aliased and must not be modified or deleted while this object
michael@0 449 * is used.
michael@0 450 * The filter set should be frozen; otherwise the performance will suffer greatly.
michael@0 451 * @param n2 wrapped Normalizer2 instance
michael@0 452 * @param filterSet UnicodeSet which determines the characters to be normalized
michael@0 453 * @stable ICU 4.4
michael@0 454 */
michael@0 455 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
michael@0 456 norm2(n2), set(filterSet) {} // binds the reference members only; neither argument is copied
michael@0 457
michael@0 458 /**
michael@0 459 * Destructor.
michael@0 460 * @stable ICU 4.4
michael@0 461 */
michael@0 462 ~FilteredNormalizer2();
michael@0 463
michael@0 464 /**
michael@0 465 * Writes the normalized form of the source string to the destination string
michael@0 466 * (replacing its contents) and returns the destination string.
michael@0 467 * The source and destination strings must be different objects.
michael@0 468 * @param src source string
michael@0 469 * @param dest destination string; its contents are replaced with normalized src
michael@0 470 * @param errorCode Standard ICU error code. Its input value must
michael@0 471 * pass the U_SUCCESS() test, or else the function returns
michael@0 472 * immediately. Check for U_FAILURE() on output or use with
michael@0 473 * function chaining. (See User Guide for details.)
michael@0 474 * @return dest
michael@0 475 * @stable ICU 4.4
michael@0 476 */
michael@0 477 virtual UnicodeString &
michael@0 478 normalize(const UnicodeString &src,
michael@0 479 UnicodeString &dest,
michael@0 480 UErrorCode &errorCode) const;
michael@0 481 /**
michael@0 482 * Appends the normalized form of the second string to the first string
michael@0 483 * (merging them at the boundary) and returns the first string.
michael@0 484 * The result is normalized if the first string was normalized.
michael@0 485 * The first and second strings must be different objects.
michael@0 486 * @param first string, should be normalized
michael@0 487 * @param second string, will be normalized
michael@0 488 * @param errorCode Standard ICU error code. Its input value must
michael@0 489 * pass the U_SUCCESS() test, or else the function returns
michael@0 490 * immediately. Check for U_FAILURE() on output or use with
michael@0 491 * function chaining. (See User Guide for details.)
michael@0 492 * @return first
michael@0 493 * @stable ICU 4.4
michael@0 494 */
michael@0 495 virtual UnicodeString &
michael@0 496 normalizeSecondAndAppend(UnicodeString &first,
michael@0 497 const UnicodeString &second,
michael@0 498 UErrorCode &errorCode) const;
michael@0 499 /**
michael@0 500 * Appends the second string to the first string
michael@0 501 * (merging them at the boundary) and returns the first string.
michael@0 502 * The result is normalized if both the strings were normalized.
michael@0 503 * The first and second strings must be different objects.
michael@0 504 * @param first string, should be normalized
michael@0 505 * @param second string, should be normalized
michael@0 506 * @param errorCode Standard ICU error code. Its input value must
michael@0 507 * pass the U_SUCCESS() test, or else the function returns
michael@0 508 * immediately. Check for U_FAILURE() on output or use with
michael@0 509 * function chaining. (See User Guide for details.)
michael@0 510 * @return first
michael@0 511 * @stable ICU 4.4
michael@0 512 */
michael@0 513 virtual UnicodeString &
michael@0 514 append(UnicodeString &first,
michael@0 515 const UnicodeString &second,
michael@0 516 UErrorCode &errorCode) const;
michael@0 517
michael@0 518 /**
michael@0 519 * Gets the decomposition mapping of c.
michael@0 520 * For details see the base class documentation.
michael@0 521 *
michael@0 522 * This function is independent of the mode of the Normalizer2.
michael@0 523 * @param c code point
michael@0 524 * @param decomposition String object which will be set to c's
michael@0 525 * decomposition mapping, if there is one.
michael@0 526 * @return TRUE if c has a decomposition, otherwise FALSE
michael@0 527 * @stable ICU 4.6
michael@0 528 */
michael@0 529 virtual UBool
michael@0 530 getDecomposition(UChar32 c, UnicodeString &decomposition) const;
michael@0 531
michael@0 532 /**
michael@0 533 * Gets the raw decomposition mapping of c.
michael@0 534 * For details see the base class documentation.
michael@0 535 *
michael@0 536 * This function is independent of the mode of the Normalizer2.
michael@0 537 * @param c code point
michael@0 538 * @param decomposition String object which will be set to c's
michael@0 539 * raw decomposition mapping, if there is one.
michael@0 540 * @return TRUE if c has a decomposition, otherwise FALSE
michael@0 541 * @stable ICU 49
michael@0 542 */
michael@0 543 virtual UBool
michael@0 544 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
michael@0 545
michael@0 546 /**
michael@0 547 * Performs pairwise composition of a & b and returns the composite if there is one.
michael@0 548 * For details see the base class documentation.
michael@0 549 *
michael@0 550 * This function is independent of the mode of the Normalizer2.
michael@0 551 * @param a A (normalization starter) code point.
michael@0 552 * @param b Another code point.
michael@0 553 * @return The non-negative composite code point if there is one; otherwise a negative value.
michael@0 554 * @stable ICU 49
michael@0 555 */
michael@0 556 virtual UChar32
michael@0 557 composePair(UChar32 a, UChar32 b) const;
michael@0 558
michael@0 559 /**
michael@0 560 * Gets the combining class of c.
michael@0 561 * For details see the base class documentation.
michael@0 562 * NOTE(review): this override is implemented outside this header; presumably it forwards to the wrapped Normalizer2 -- confirm.
michael@0 563 * @param c code point
michael@0 564 * @return c's combining class
michael@0 565 * @stable ICU 49
michael@0 566 */
michael@0 567 virtual uint8_t
michael@0 568 getCombiningClass(UChar32 c) const;
michael@0 569
michael@0 570 /**
michael@0 571 * Tests if the string is normalized.
michael@0 572 * For details see the Normalizer2 base class documentation.
michael@0 573 * @param s input string
michael@0 574 * @param errorCode Standard ICU error code. Its input value must
michael@0 575 * pass the U_SUCCESS() test, or else the function returns
michael@0 576 * immediately. Check for U_FAILURE() on output or use with
michael@0 577 * function chaining. (See User Guide for details.)
michael@0 578 * @return TRUE if s is normalized
michael@0 579 * @stable ICU 4.4
michael@0 580 */
michael@0 581 virtual UBool
michael@0 582 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
michael@0 583 /**
michael@0 584 * Tests if the string is normalized.
michael@0 585 * For details see the Normalizer2 base class documentation.
michael@0 586 * @param s input string
michael@0 587 * @param errorCode Standard ICU error code. Its input value must
michael@0 588 * pass the U_SUCCESS() test, or else the function returns
michael@0 589 * immediately. Check for U_FAILURE() on output or use with
michael@0 590 * function chaining. (See User Guide for details.)
michael@0 591 * @return UNormalizationCheckResult
michael@0 592 * @stable ICU 4.4
michael@0 593 */
michael@0 594 virtual UNormalizationCheckResult
michael@0 595 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
michael@0 596 /**
michael@0 597 * Returns the end of the normalized substring of the input string.
michael@0 598 * For details see the Normalizer2 base class documentation.
michael@0 599 * @param s input string
michael@0 600 * @param errorCode Standard ICU error code. Its input value must
michael@0 601 * pass the U_SUCCESS() test, or else the function returns
michael@0 602 * immediately. Check for U_FAILURE() on output or use with
michael@0 603 * function chaining. (See User Guide for details.)
michael@0 604 * @return "yes" span end index
michael@0 605 * @stable ICU 4.4
michael@0 606 */
michael@0 607 virtual int32_t
michael@0 608 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
michael@0 609
michael@0 610 /**
michael@0 611 * Tests if the character always has a normalization boundary before it,
michael@0 612 * regardless of context.
michael@0 613 * For details see the Normalizer2 base class documentation.
michael@0 614 * @param c character to test
michael@0 615 * @return TRUE if c has a normalization boundary before it
michael@0 616 * @stable ICU 4.4
michael@0 617 */
michael@0 618 virtual UBool hasBoundaryBefore(UChar32 c) const;
michael@0 619
michael@0 620 /**
michael@0 621 * Tests if the character always has a normalization boundary after it,
michael@0 622 * regardless of context.
michael@0 623 * For details see the Normalizer2 base class documentation.
michael@0 624 * @param c character to test
michael@0 625 * @return TRUE if c has a normalization boundary after it
michael@0 626 * @stable ICU 4.4
michael@0 627 */
michael@0 628 virtual UBool hasBoundaryAfter(UChar32 c) const;
michael@0 629
michael@0 630 /**
michael@0 631 * Tests if the character is normalization-inert.
michael@0 632 * For details see the Normalizer2 base class documentation.
michael@0 633 * @param c character to test
michael@0 634 * @return TRUE if c is normalization-inert
michael@0 635 * @stable ICU 4.4
michael@0 636 */
michael@0 637 virtual UBool isInert(UChar32 c) const;
michael@0 638 private: // internal helper overloads and aliased state; not part of the public API
michael@0 639 UnicodeString &
michael@0 640 normalize(const UnicodeString &src,
michael@0 641 UnicodeString &dest,
michael@0 642 USetSpanCondition spanCondition,
michael@0 643 UErrorCode &errorCode) const; // NOTE(review): presumably the span-condition-driven worker behind the public normalize() -- confirm in the implementation file
michael@0 644
michael@0 645 UnicodeString &
michael@0 646 normalizeSecondAndAppend(UnicodeString &first,
michael@0 647 const UnicodeString &second,
michael@0 648 UBool doNormalize,
michael@0 649 UErrorCode &errorCode) const; // NOTE(review): presumably shared by the public normalizeSecondAndAppend() and append(), selected by doNormalize -- confirm
michael@0 650
michael@0 651 const Normalizer2 &norm2; // wrapped normalizer; aliased reference bound by the constructor, not owned
michael@0 652 const UnicodeSet &set; // filter set; aliased reference bound by the constructor, not owned
michael@0 653 };
michael@0 654
michael@0 655 U_NAMESPACE_END
michael@0 656
michael@0 657 #endif // !UCONFIG_NO_NORMALIZATION
michael@0 658 #endif // __NORMALIZER2_H__

mercurial