michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2009-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: normalizer2.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2009nov22 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #ifndef __NORMALIZER2_H__ michael@0: #define __NORMALIZER2_H__ michael@0: michael@0: /** michael@0: * \file michael@0: * \brief C++ API: New API for Unicode Normalization. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/unorm2.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /** michael@0: * Unicode normalization functionality for standard Unicode normalization or michael@0: * for using custom mapping tables. michael@0: * All instances of this class are unmodifiable/immutable. michael@0: * Instances returned by getInstance() are singletons that must not be deleted by the caller. michael@0: * The Normalizer2 class is not intended for public subclassing. michael@0: * michael@0: * The primary functions are to produce a normalized string and to detect whether michael@0: * a string is already normalized. michael@0: * The most commonly used normalization forms are those defined in michael@0: * http://www.unicode.org/unicode/reports/tr15/ michael@0: * However, this API supports additional normalization forms for specialized purposes. michael@0: * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) michael@0: * and can be used in implementations of UTS #46. michael@0: * michael@0: * Not only are the standard compose and decompose modes supplied, michael@0: * but additional modes are provided as documented in the Mode enum. michael@0: * michael@0: * Some of the functions in this class identify normalization boundaries. michael@0: * At a normalization boundary, the portions of the string michael@0: * before it and starting from it do not interact and can be handled independently. michael@0: * michael@0: * The spanQuickCheckYes() stops at a normalization boundary. michael@0: * When the goal is a normalized string, then the text before the boundary michael@0: * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). michael@0: * michael@0: * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether michael@0: * a character is guaranteed to be at a normalization boundary, michael@0: * regardless of context. michael@0: * This is used for moving from one normalization boundary to the next michael@0: * or preceding boundary, and for performing iterative normalization. michael@0: * michael@0: * Iterative normalization is useful when only a small portion of a michael@0: * longer string needs to be processed. michael@0: * For example, in ICU, iterative normalization is used by the NormalizationTransliterator michael@0: * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() michael@0: * (to process only the substring for which sort key bytes are computed). michael@0: * michael@0: * The set of normalization boundaries returned by these functions may not be michael@0: * complete: There may be more boundaries that could be returned. michael@0: * Different functions may return different boundaries. michael@0: * @stable ICU 4.4 michael@0: */ michael@0: class U_COMMON_API Normalizer2 : public UObject { michael@0: public: michael@0: /** michael@0: * Destructor. michael@0: * @stable ICU 4.4 michael@0: */ michael@0: ~Normalizer2(); michael@0: michael@0: /** michael@0: * Returns a Normalizer2 instance for Unicode NFC normalization. michael@0: * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). michael@0: * Returns an unmodifiable singleton instance. Do not delete it. michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return the requested Normalizer2, if successful michael@0: * @stable ICU 49 michael@0: */ michael@0: static const Normalizer2 * michael@0: getNFCInstance(UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Returns a Normalizer2 instance for Unicode NFD normalization. michael@0: * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode). michael@0: * Returns an unmodifiable singleton instance. Do not delete it. michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return the requested Normalizer2, if successful michael@0: * @stable ICU 49 michael@0: */ michael@0: static const Normalizer2 * michael@0: getNFDInstance(UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Returns a Normalizer2 instance for Unicode NFKC normalization. michael@0: * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). michael@0: * Returns an unmodifiable singleton instance. Do not delete it. michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return the requested Normalizer2, if successful michael@0: * @stable ICU 49 michael@0: */ michael@0: static const Normalizer2 * michael@0: getNFKCInstance(UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Returns a Normalizer2 instance for Unicode NFKD normalization. michael@0: * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode). michael@0: * Returns an unmodifiable singleton instance. Do not delete it. michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return the requested Normalizer2, if successful michael@0: * @stable ICU 49 michael@0: */ michael@0: static const Normalizer2 * michael@0: getNFKDInstance(UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. michael@0: * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). michael@0: * Returns an unmodifiable singleton instance. Do not delete it. michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return the requested Normalizer2, if successful michael@0: * @stable ICU 49 michael@0: */ michael@0: static const Normalizer2 * michael@0: getNFKCCasefoldInstance(UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Returns a Normalizer2 instance which uses the specified data file michael@0: * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) michael@0: * and which composes or decomposes text according to the specified mode. michael@0: * Returns an unmodifiable singleton instance. Do not delete it. michael@0: * michael@0: * Use packageName=NULL for data files that are part of ICU's own data. michael@0: * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. michael@0: * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. michael@0: * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. michael@0: * michael@0: * @param packageName NULL for ICU built-in data, otherwise application data package name michael@0: * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file michael@0: * @param mode normalization mode (compose or decompose etc.) michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return the requested Normalizer2, if successful michael@0: * @stable ICU 4.4 michael@0: */ michael@0: static const Normalizer2 * michael@0: getInstance(const char *packageName, michael@0: const char *name, michael@0: UNormalization2Mode mode, michael@0: UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Returns the normalized form of the source string. michael@0: * @param src source string michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return normalized src michael@0: * @stable ICU 4.4 michael@0: */ michael@0: UnicodeString michael@0: normalize(const UnicodeString &src, UErrorCode &errorCode) const { michael@0: UnicodeString result; michael@0: normalize(src, result, errorCode); michael@0: return result; michael@0: } michael@0: /** michael@0: * Writes the normalized form of the source string to the destination string michael@0: * (replacing its contents) and returns the destination string. michael@0: * The source and destination strings must be different objects. michael@0: * @param src source string michael@0: * @param dest destination string; its contents is replaced with normalized src michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return dest michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UnicodeString & michael@0: normalize(const UnicodeString &src, michael@0: UnicodeString &dest, michael@0: UErrorCode &errorCode) const = 0; michael@0: /** michael@0: * Appends the normalized form of the second string to the first string michael@0: * (merging them at the boundary) and returns the first string. michael@0: * The result is normalized if the first string was normalized. michael@0: * The first and second strings must be different objects. michael@0: * @param first string, should be normalized michael@0: * @param second string, will be normalized michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return first michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UnicodeString & michael@0: normalizeSecondAndAppend(UnicodeString &first, michael@0: const UnicodeString &second, michael@0: UErrorCode &errorCode) const = 0; michael@0: /** michael@0: * Appends the second string to the first string michael@0: * (merging them at the boundary) and returns the first string. michael@0: * The result is normalized if both the strings were normalized. michael@0: * The first and second strings must be different objects. michael@0: * @param first string, should be normalized michael@0: * @param second string, should be normalized michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return first michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UnicodeString & michael@0: append(UnicodeString &first, michael@0: const UnicodeString &second, michael@0: UErrorCode &errorCode) const = 0; michael@0: michael@0: /** michael@0: * Gets the decomposition mapping of c. michael@0: * Roughly equivalent to normalizing the String form of c michael@0: * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function michael@0: * returns FALSE and does not write a string michael@0: * if c does not have a decomposition mapping in this instance's data. michael@0: * This function is independent of the mode of the Normalizer2. michael@0: * @param c code point michael@0: * @param decomposition String object which will be set to c's michael@0: * decomposition mapping, if there is one. michael@0: * @return TRUE if c has a decomposition, otherwise FALSE michael@0: * @stable ICU 4.6 michael@0: */ michael@0: virtual UBool michael@0: getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; michael@0: michael@0: /** michael@0: * Gets the raw decomposition mapping of c. michael@0: * michael@0: * This is similar to the getDecomposition() method but returns the michael@0: * raw decomposition mapping as specified in UnicodeData.txt or michael@0: * (for custom data) in the mapping files processed by the gennorm2 tool. michael@0: * By contrast, getDecomposition() returns the processed, michael@0: * recursively-decomposed version of this mapping. michael@0: * michael@0: * When used on a standard NFKC Normalizer2 instance, michael@0: * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. michael@0: * michael@0: * When used on a standard NFC Normalizer2 instance, michael@0: * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); michael@0: * in this case, the result contains either one or two code points (=1..4 UChars). michael@0: * michael@0: * This function is independent of the mode of the Normalizer2. michael@0: * The default implementation returns FALSE. michael@0: * @param c code point michael@0: * @param decomposition String object which will be set to c's michael@0: * raw decomposition mapping, if there is one. michael@0: * @return TRUE if c has a decomposition, otherwise FALSE michael@0: * @stable ICU 49 michael@0: */ michael@0: virtual UBool michael@0: getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; michael@0: michael@0: /** michael@0: * Performs pairwise composition of a & b and returns the composite if there is one. michael@0: * michael@0: * Returns a composite code point c only if c has a two-way mapping to a+b. michael@0: * In standard Unicode normalization, this means that michael@0: * c has a canonical decomposition to a+b michael@0: * and c does not have the Full_Composition_Exclusion property. michael@0: * michael@0: * This function is independent of the mode of the Normalizer2. michael@0: * The default implementation returns a negative value. michael@0: * @param a A (normalization starter) code point. michael@0: * @param b Another code point. michael@0: * @return The non-negative composite code point if there is one; otherwise a negative value. michael@0: * @stable ICU 49 michael@0: */ michael@0: virtual UChar32 michael@0: composePair(UChar32 a, UChar32 b) const; michael@0: michael@0: /** michael@0: * Gets the combining class of c. michael@0: * The default implementation returns 0 michael@0: * but all standard implementations return the Unicode Canonical_Combining_Class value. michael@0: * @param c code point michael@0: * @return c's combining class michael@0: * @stable ICU 49 michael@0: */ michael@0: virtual uint8_t michael@0: getCombiningClass(UChar32 c) const; michael@0: michael@0: /** michael@0: * Tests if the string is normalized. michael@0: * Internally, in cases where the quickCheck() method would return "maybe" michael@0: * (which is only possible for the two COMPOSE modes) this method michael@0: * resolves to "yes" or "no" to provide a definitive result, michael@0: * at the cost of doing more work in those cases. michael@0: * @param s input string michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return TRUE if s is normalized michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool michael@0: isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; michael@0: michael@0: /** michael@0: * Tests if the string is normalized. michael@0: * For the two COMPOSE modes, the result could be "maybe" in cases that michael@0: * would take a little more work to resolve definitively. michael@0: * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster michael@0: * combination of quick check + normalization, to avoid michael@0: * re-checking the "yes" prefix. michael@0: * @param s input string michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return UNormalizationCheckResult michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UNormalizationCheckResult michael@0: quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; michael@0: michael@0: /** michael@0: * Returns the end of the normalized substring of the input string. michael@0: * In other words, with end=spanQuickCheckYes(s, ec); michael@0: * the substring UnicodeString(s, 0, end) michael@0: * will pass the quick check with a "yes" result. michael@0: * michael@0: * The returned end index is usually one or more characters before the michael@0: * "no" or "maybe" character: The end index is at a normalization boundary. michael@0: * (See the class documentation for more about normalization boundaries.) michael@0: * michael@0: * When the goal is a normalized string and most input strings are expected michael@0: * to be normalized already, then call this method, michael@0: * and if it returns a prefix shorter than the input string, michael@0: * copy that prefix and use normalizeSecondAndAppend() for the remainder. michael@0: * @param s input string michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return "yes" span end index michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual int32_t michael@0: spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; michael@0: michael@0: /** michael@0: * Tests if the character always has a normalization boundary before it, michael@0: * regardless of context. michael@0: * If true, then the character does not normalization-interact with michael@0: * preceding characters. michael@0: * In other words, a string containing this character can be normalized michael@0: * by processing portions before this character and starting from this michael@0: * character independently. michael@0: * This is used for iterative normalization. See the class documentation for details. michael@0: * @param c character to test michael@0: * @return TRUE if c has a normalization boundary before it michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool hasBoundaryBefore(UChar32 c) const = 0; michael@0: michael@0: /** michael@0: * Tests if the character always has a normalization boundary after it, michael@0: * regardless of context. michael@0: * If true, then the character does not normalization-interact with michael@0: * following characters. michael@0: * In other words, a string containing this character can be normalized michael@0: * by processing portions up to this character and after this michael@0: * character independently. michael@0: * This is used for iterative normalization. See the class documentation for details. michael@0: * Note that this operation may be significantly slower than hasBoundaryBefore(). michael@0: * @param c character to test michael@0: * @return TRUE if c has a normalization boundary after it michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool hasBoundaryAfter(UChar32 c) const = 0; michael@0: michael@0: /** michael@0: * Tests if the character is normalization-inert. michael@0: * If true, then the character does not change, nor normalization-interact with michael@0: * preceding or following characters. michael@0: * In other words, a string containing this character can be normalized michael@0: * by processing portions before this character and after this michael@0: * character independently. michael@0: * This is used for iterative normalization. See the class documentation for details. michael@0: * Note that this operation may be significantly slower than hasBoundaryBefore(). michael@0: * @param c character to test michael@0: * @return TRUE if c is normalization-inert michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool isInert(UChar32 c) const = 0; michael@0: }; michael@0: michael@0: /** michael@0: * Normalization filtered by a UnicodeSet. michael@0: * Normalizes portions of the text contained in the filter set and leaves michael@0: * portions not contained in the filter set unchanged. michael@0: * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). michael@0: * Not-in-the-filter text is treated as "is normalized" and "quick check yes". michael@0: * This class implements all of (and only) the Normalizer2 API. michael@0: * An instance of this class is unmodifiable/immutable but is constructed and michael@0: * must be destructed by the owner. michael@0: * @stable ICU 4.4 michael@0: */ michael@0: class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { michael@0: public: michael@0: /** michael@0: * Constructs a filtered normalizer wrapping any Normalizer2 instance michael@0: * and a filter set. michael@0: * Both are aliased and must not be modified or deleted while this object michael@0: * is used. michael@0: * The filter set should be frozen; otherwise the performance will suffer greatly. michael@0: * @param n2 wrapped Normalizer2 instance michael@0: * @param filterSet UnicodeSet which determines the characters to be normalized michael@0: * @stable ICU 4.4 michael@0: */ michael@0: FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : michael@0: norm2(n2), set(filterSet) {} michael@0: michael@0: /** michael@0: * Destructor. michael@0: * @stable ICU 4.4 michael@0: */ michael@0: ~FilteredNormalizer2(); michael@0: michael@0: /** michael@0: * Writes the normalized form of the source string to the destination string michael@0: * (replacing its contents) and returns the destination string. michael@0: * The source and destination strings must be different objects. michael@0: * @param src source string michael@0: * @param dest destination string; its contents is replaced with normalized src michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return dest michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UnicodeString & michael@0: normalize(const UnicodeString &src, michael@0: UnicodeString &dest, michael@0: UErrorCode &errorCode) const; michael@0: /** michael@0: * Appends the normalized form of the second string to the first string michael@0: * (merging them at the boundary) and returns the first string. michael@0: * The result is normalized if the first string was normalized. michael@0: * The first and second strings must be different objects. michael@0: * @param first string, should be normalized michael@0: * @param second string, will be normalized michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return first michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UnicodeString & michael@0: normalizeSecondAndAppend(UnicodeString &first, michael@0: const UnicodeString &second, michael@0: UErrorCode &errorCode) const; michael@0: /** michael@0: * Appends the second string to the first string michael@0: * (merging them at the boundary) and returns the first string. michael@0: * The result is normalized if both the strings were normalized. michael@0: * The first and second strings must be different objects. michael@0: * @param first string, should be normalized michael@0: * @param second string, should be normalized michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return first michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UnicodeString & michael@0: append(UnicodeString &first, michael@0: const UnicodeString &second, michael@0: UErrorCode &errorCode) const; michael@0: michael@0: /** michael@0: * Gets the decomposition mapping of c. michael@0: * For details see the base class documentation. michael@0: * michael@0: * This function is independent of the mode of the Normalizer2. michael@0: * @param c code point michael@0: * @param decomposition String object which will be set to c's michael@0: * decomposition mapping, if there is one. michael@0: * @return TRUE if c has a decomposition, otherwise FALSE michael@0: * @stable ICU 4.6 michael@0: */ michael@0: virtual UBool michael@0: getDecomposition(UChar32 c, UnicodeString &decomposition) const; michael@0: michael@0: /** michael@0: * Gets the raw decomposition mapping of c. michael@0: * For details see the base class documentation. michael@0: * michael@0: * This function is independent of the mode of the Normalizer2. michael@0: * @param c code point michael@0: * @param decomposition String object which will be set to c's michael@0: * raw decomposition mapping, if there is one. michael@0: * @return TRUE if c has a decomposition, otherwise FALSE michael@0: * @stable ICU 49 michael@0: */ michael@0: virtual UBool michael@0: getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; michael@0: michael@0: /** michael@0: * Performs pairwise composition of a & b and returns the composite if there is one. michael@0: * For details see the base class documentation. michael@0: * michael@0: * This function is independent of the mode of the Normalizer2. michael@0: * @param a A (normalization starter) code point. michael@0: * @param b Another code point. michael@0: * @return The non-negative composite code point if there is one; otherwise a negative value. michael@0: * @stable ICU 49 michael@0: */ michael@0: virtual UChar32 michael@0: composePair(UChar32 a, UChar32 b) const; michael@0: michael@0: /** michael@0: * Gets the combining class of c. michael@0: * The default implementation returns 0 michael@0: * but all standard implementations return the Unicode Canonical_Combining_Class value. michael@0: * @param c code point michael@0: * @return c's combining class michael@0: * @stable ICU 49 michael@0: */ michael@0: virtual uint8_t michael@0: getCombiningClass(UChar32 c) const; michael@0: michael@0: /** michael@0: * Tests if the string is normalized. michael@0: * For details see the Normalizer2 base class documentation. michael@0: * @param s input string michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return TRUE if s is normalized michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool michael@0: isNormalized(const UnicodeString &s, UErrorCode &errorCode) const; michael@0: /** michael@0: * Tests if the string is normalized. michael@0: * For details see the Normalizer2 base class documentation. michael@0: * @param s input string michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return UNormalizationCheckResult michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UNormalizationCheckResult michael@0: quickCheck(const UnicodeString &s, UErrorCode &errorCode) const; michael@0: /** michael@0: * Returns the end of the normalized substring of the input string. michael@0: * For details see the Normalizer2 base class documentation. michael@0: * @param s input string michael@0: * @param errorCode Standard ICU error code. Its input value must michael@0: * pass the U_SUCCESS() test, or else the function returns michael@0: * immediately. Check for U_FAILURE() on output or use with michael@0: * function chaining. (See User Guide for details.) michael@0: * @return "yes" span end index michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual int32_t michael@0: spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const; michael@0: michael@0: /** michael@0: * Tests if the character always has a normalization boundary before it, michael@0: * regardless of context. michael@0: * For details see the Normalizer2 base class documentation. michael@0: * @param c character to test michael@0: * @return TRUE if c has a normalization boundary before it michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool hasBoundaryBefore(UChar32 c) const; michael@0: michael@0: /** michael@0: * Tests if the character always has a normalization boundary after it, michael@0: * regardless of context. michael@0: * For details see the Normalizer2 base class documentation. michael@0: * @param c character to test michael@0: * @return TRUE if c has a normalization boundary after it michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool hasBoundaryAfter(UChar32 c) const; michael@0: michael@0: /** michael@0: * Tests if the character is normalization-inert. michael@0: * For details see the Normalizer2 base class documentation. michael@0: * @param c character to test michael@0: * @return TRUE if c is normalization-inert michael@0: * @stable ICU 4.4 michael@0: */ michael@0: virtual UBool isInert(UChar32 c) const; michael@0: private: michael@0: UnicodeString & michael@0: normalize(const UnicodeString &src, michael@0: UnicodeString &dest, michael@0: USetSpanCondition spanCondition, michael@0: UErrorCode &errorCode) const; michael@0: michael@0: UnicodeString & michael@0: normalizeSecondAndAppend(UnicodeString &first, michael@0: const UnicodeString &second, michael@0: UBool doNormalize, michael@0: UErrorCode &errorCode) const; michael@0: michael@0: const Normalizer2 &norm2; michael@0: const UnicodeSet &set; michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif // !UCONFIG_NO_NORMALIZATION michael@0: #endif // __NORMALIZER2_H__