michael@0: /* michael@0: ******************************************************************** michael@0: * COPYRIGHT: michael@0: * Copyright (c) 1996-2011, International Business Machines Corporation and michael@0: * others. All Rights Reserved. michael@0: ******************************************************************** michael@0: */ michael@0: michael@0: #ifndef NORMLZR_H michael@0: #define NORMLZR_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: /** michael@0: * \file michael@0: * \brief C++ API: Unicode Normalization michael@0: */ michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/chariter.h" michael@0: #include "unicode/normalizer2.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/unorm.h" michael@0: #include "unicode/uobject.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: /** michael@0: * The Normalizer class supports the standard normalization forms described in michael@0: * michael@0: * Unicode Standard Annex #15: Unicode Normalization Forms. michael@0: * michael@0: * Note: This API has been replaced by the Normalizer2 class and is only available michael@0: * for backward compatibility. This class simply delegates to the Normalizer2 class. michael@0: * There is one exception: The new API does not provide a replacement for Normalizer::compare(). michael@0: * michael@0: * The Normalizer class consists of two parts: michael@0: * - static functions that normalize strings or test if strings are normalized michael@0: * - a Normalizer object is an iterator that takes any kind of text and michael@0: * provides iteration over its normalized form michael@0: * michael@0: * The Normalizer class is not suitable for subclassing. michael@0: * michael@0: * For basic information about normalization forms and details about the C API michael@0: * please see the documentation in unorm.h. michael@0: * michael@0: * The iterator API with the Normalizer constructors and the non-static functions michael@0: * use a CharacterIterator as input. It is possible to pass a string which michael@0: * is then internally wrapped in a CharacterIterator. michael@0: * The input text is not normalized all at once, but incrementally where needed michael@0: * (providing efficient random access). michael@0: * This allows to pass in a large text but spend only a small amount of time michael@0: * normalizing a small part of that text. michael@0: * However, if the entire text is normalized, then the iterator will be michael@0: * slower than normalizing the entire text at once and iterating over the result. michael@0: * A possible use of the Normalizer iterator is also to report an index into the michael@0: * original text that is close to where the normalized characters come from. michael@0: * michael@0: * Important: The iterator API was cleaned up significantly for ICU 2.0. michael@0: * The earlier implementation reported the getIndex() inconsistently, michael@0: * and previous() could not be used after setIndex(), next(), first(), and current(). michael@0: * michael@0: * Normalizer allows to start normalizing from anywhere in the input text by michael@0: * calling setIndexOnly(), first(), or last(). michael@0: * Without calling any of these, the iterator will start at the beginning of the text. michael@0: * michael@0: * At any time, next() returns the next normalized code point (UChar32), michael@0: * with post-increment semantics (like CharacterIterator::next32PostInc()). michael@0: * previous() returns the previous normalized code point (UChar32), michael@0: * with pre-decrement semantics (like CharacterIterator::previous32()). michael@0: * michael@0: * current() returns the current code point michael@0: * (respectively the one at the newly set index) without moving michael@0: * the getIndex(). Note that if the text at the current position michael@0: * needs to be normalized, then these functions will do that. michael@0: * (This is why current() is not const.) michael@0: * It is more efficient to call setIndexOnly() instead, which does not michael@0: * normalize. michael@0: * michael@0: * getIndex() always refers to the position in the input text where the normalized michael@0: * code points are returned from. It does not always change with each returned michael@0: * code point. michael@0: * The code point that is returned from any of the functions michael@0: * corresponds to text at or after getIndex(), according to the michael@0: * function's iteration semantics (post-increment or pre-decrement). michael@0: * michael@0: * next() returns a code point from at or after the getIndex() michael@0: * from before the next() call. After the next() call, the getIndex() michael@0: * might have moved to where the next code point will be returned from michael@0: * (from a next() or current() call). michael@0: * This is semantically equivalent to array access with array[index++] michael@0: * (post-increment semantics). michael@0: * michael@0: * previous() returns a code point from at or after the getIndex() michael@0: * from after the previous() call. michael@0: * This is semantically equivalent to array access with array[--index] michael@0: * (pre-decrement semantics). michael@0: * michael@0: * Internally, the Normalizer iterator normalizes a small piece of text michael@0: * starting at the getIndex() and ending at a following "safe" index. michael@0: * The normalized results is stored in an internal string buffer, and michael@0: * the code points are iterated from there. michael@0: * With multiple iteration calls, this is repeated until the next piece michael@0: * of text needs to be normalized, and the getIndex() needs to be moved. michael@0: * michael@0: * The following "safe" index, the internal buffer, and the secondary michael@0: * iteration index into that buffer are not exposed on the API. michael@0: * This also means that it is currently not practical to return to michael@0: * a particular, arbitrary position in the text because one would need to michael@0: * know, and be able to set, in addition to the getIndex(), at least also the michael@0: * current index into the internal buffer. michael@0: * It is currently only possible to observe when getIndex() changes michael@0: * (with careful consideration of the iteration semantics), michael@0: * at which time the internal index will be 0. michael@0: * For example, if getIndex() is different after next() than before it, michael@0: * then the internal index is 0 and one can return to this getIndex() michael@0: * later with setIndexOnly(). michael@0: * michael@0: * Note: While the setIndex() and getIndex() refer to indices in the michael@0: * underlying Unicode input text, the next() and previous() methods michael@0: * iterate through characters in the normalized output. michael@0: * This means that there is not necessarily a one-to-one correspondence michael@0: * between characters returned by next() and previous() and the indices michael@0: * passed to and returned from setIndex() and getIndex(). michael@0: * It is for this reason that Normalizer does not implement the CharacterIterator interface. michael@0: * michael@0: * @author Laura Werner, Mark Davis, Markus Scherer michael@0: * @stable ICU 2.0 michael@0: */ michael@0: class U_COMMON_API Normalizer : public UObject { michael@0: public: michael@0: /** michael@0: * If DONE is returned from an iteration function that returns a code point, michael@0: * then there are no more normalization results available. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: enum { michael@0: DONE=0xffff michael@0: }; michael@0: michael@0: // Constructors michael@0: michael@0: /** michael@0: * Creates a new Normalizer object for iterating over the michael@0: * normalized form of a given string. michael@0: *

michael@0: * @param str The string to be normalized. The normalization michael@0: * will start at the beginning of the string. michael@0: * michael@0: * @param mode The normalization mode. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: Normalizer(const UnicodeString& str, UNormalizationMode mode); michael@0: michael@0: /** michael@0: * Creates a new Normalizer object for iterating over the michael@0: * normalized form of a given string. michael@0: *

michael@0: * @param str The string to be normalized. The normalization michael@0: * will start at the beginning of the string. michael@0: * michael@0: * @param length Length of the string, or -1 if NUL-terminated. michael@0: * @param mode The normalization mode. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: Normalizer(const UChar* str, int32_t length, UNormalizationMode mode); michael@0: michael@0: /** michael@0: * Creates a new Normalizer object for iterating over the michael@0: * normalized form of the given text. michael@0: *

michael@0: * @param iter The input text to be normalized. The normalization michael@0: * will start at the beginning of the string. michael@0: * michael@0: * @param mode The normalization mode. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: Normalizer(const CharacterIterator& iter, UNormalizationMode mode); michael@0: michael@0: /** michael@0: * Copy constructor. michael@0: * @param copy The object to be copied. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: Normalizer(const Normalizer& copy); michael@0: michael@0: /** michael@0: * Destructor michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual ~Normalizer(); michael@0: michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Static utility methods michael@0: //------------------------------------------------------------------------- michael@0: michael@0: /** michael@0: * Normalizes a UnicodeString according to the specified normalization mode. michael@0: * This is a wrapper for unorm_normalize(), using UnicodeString's. michael@0: * michael@0: * The options parameter specifies which optional michael@0: * Normalizer features are to be enabled for this operation. michael@0: * michael@0: * @param source the input string to be normalized. michael@0: * @param mode the normalization mode michael@0: * @param options the optional features to be enabled (0 for no options) michael@0: * @param result The normalized string (on output). michael@0: * @param status The error code. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: static void U_EXPORT2 normalize(const UnicodeString& source, michael@0: UNormalizationMode mode, int32_t options, michael@0: UnicodeString& result, michael@0: UErrorCode &status); michael@0: michael@0: /** michael@0: * Compose a UnicodeString. michael@0: * This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC. michael@0: * This is a wrapper for unorm_normalize(), using UnicodeString's. michael@0: * michael@0: * The options parameter specifies which optional michael@0: * Normalizer features are to be enabled for this operation. michael@0: * michael@0: * @param source the string to be composed. michael@0: * @param compat Perform compatibility decomposition before composition. michael@0: * If this argument is FALSE, only canonical michael@0: * decomposition will be performed. michael@0: * @param options the optional features to be enabled (0 for no options) michael@0: * @param result The composed string (on output). michael@0: * @param status The error code. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: static void U_EXPORT2 compose(const UnicodeString& source, michael@0: UBool compat, int32_t options, michael@0: UnicodeString& result, michael@0: UErrorCode &status); michael@0: michael@0: /** michael@0: * Static method to decompose a UnicodeString. michael@0: * This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD. michael@0: * This is a wrapper for unorm_normalize(), using UnicodeString's. michael@0: * michael@0: * The options parameter specifies which optional michael@0: * Normalizer features are to be enabled for this operation. michael@0: * michael@0: * @param source the string to be decomposed. michael@0: * @param compat Perform compatibility decomposition. michael@0: * If this argument is FALSE, only canonical michael@0: * decomposition will be performed. michael@0: * @param options the optional features to be enabled (0 for no options) michael@0: * @param result The decomposed string (on output). michael@0: * @param status The error code. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: static void U_EXPORT2 decompose(const UnicodeString& source, michael@0: UBool compat, int32_t options, michael@0: UnicodeString& result, michael@0: UErrorCode &status); michael@0: michael@0: /** michael@0: * Performing quick check on a string, to quickly determine if the string is michael@0: * in a particular normalization format. michael@0: * This is a wrapper for unorm_quickCheck(), using a UnicodeString. michael@0: * michael@0: * Three types of result can be returned UNORM_YES, UNORM_NO or michael@0: * UNORM_MAYBE. Result UNORM_YES indicates that the argument michael@0: * string is in the desired normalized format, UNORM_NO determines that michael@0: * argument string is not in the desired normalized format. A michael@0: * UNORM_MAYBE result indicates that a more thorough check is required, michael@0: * the user may have to put the string in its normalized form and compare the michael@0: * results. michael@0: * @param source string for determining if it is in a normalized format michael@0: * @param mode normalization format michael@0: * @param status A reference to a UErrorCode to receive any errors michael@0: * @return UNORM_YES, UNORM_NO or UNORM_MAYBE michael@0: * michael@0: * @see isNormalized michael@0: * @stable ICU 2.0 michael@0: */ michael@0: static inline UNormalizationCheckResult michael@0: quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status); michael@0: michael@0: /** michael@0: * Performing quick check on a string; same as the other version of quickCheck michael@0: * but takes an extra options parameter like most normalization functions. michael@0: * michael@0: * @param source string for determining if it is in a normalized format michael@0: * @param mode normalization format michael@0: * @param options the optional features to be enabled (0 for no options) michael@0: * @param status A reference to a UErrorCode to receive any errors michael@0: * @return UNORM_YES, UNORM_NO or UNORM_MAYBE michael@0: * michael@0: * @see isNormalized michael@0: * @stable ICU 2.6 michael@0: */ michael@0: static UNormalizationCheckResult michael@0: quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status); michael@0: michael@0: /** michael@0: * Test if a string is in a given normalization form. michael@0: * This is semantically equivalent to source.equals(normalize(source, mode)) . michael@0: * michael@0: * Unlike unorm_quickCheck(), this function returns a definitive result, michael@0: * never a "maybe". michael@0: * For NFD, NFKD, and FCD, both functions work exactly the same. michael@0: * For NFC and NFKC where quickCheck may return "maybe", this function will michael@0: * perform further tests to arrive at a TRUE/FALSE result. michael@0: * michael@0: * @param src String that is to be tested if it is in a normalization format. michael@0: * @param mode Which normalization form to test for. michael@0: * @param errorCode ICU error code in/out parameter. michael@0: * Must fulfill U_SUCCESS before the function call. michael@0: * @return Boolean value indicating whether the source string is in the michael@0: * "mode" normalization form. michael@0: * michael@0: * @see quickCheck michael@0: * @stable ICU 2.2 michael@0: */ michael@0: static inline UBool michael@0: isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Test if a string is in a given normalization form; same as the other version of isNormalized michael@0: * but takes an extra options parameter like most normalization functions. michael@0: * michael@0: * @param src String that is to be tested if it is in a normalization format. michael@0: * @param mode Which normalization form to test for. michael@0: * @param options the optional features to be enabled (0 for no options) michael@0: * @param errorCode ICU error code in/out parameter. michael@0: * Must fulfill U_SUCCESS before the function call. michael@0: * @return Boolean value indicating whether the source string is in the michael@0: * "mode" normalization form. michael@0: * michael@0: * @see quickCheck michael@0: * @stable ICU 2.6 michael@0: */ michael@0: static UBool michael@0: isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Concatenate normalized strings, making sure that the result is normalized as well. michael@0: * michael@0: * If both the left and the right strings are in michael@0: * the normalization form according to "mode/options", michael@0: * then the result will be michael@0: * michael@0: * \code michael@0: * dest=normalize(left+right, mode, options) michael@0: * \endcode michael@0: * michael@0: * For details see unorm_concatenate in unorm.h. michael@0: * michael@0: * @param left Left source string. michael@0: * @param right Right source string. michael@0: * @param result The output string. michael@0: * @param mode The normalization mode. michael@0: * @param options A bit set of normalization options. michael@0: * @param errorCode ICU error code in/out parameter. michael@0: * Must fulfill U_SUCCESS before the function call. michael@0: * @return result michael@0: * michael@0: * @see unorm_concatenate michael@0: * @see normalize michael@0: * @see unorm_next michael@0: * @see unorm_previous michael@0: * michael@0: * @stable ICU 2.1 michael@0: */ michael@0: static UnicodeString & michael@0: U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right, michael@0: UnicodeString &result, michael@0: UNormalizationMode mode, int32_t options, michael@0: UErrorCode &errorCode); michael@0: michael@0: /** michael@0: * Compare two strings for canonical equivalence. michael@0: * Further options include case-insensitive comparison and michael@0: * code point order (as opposed to code unit order). michael@0: * michael@0: * Canonical equivalence between two strings is defined as their normalized michael@0: * forms (NFD or NFC) being identical. michael@0: * This function compares strings incrementally instead of normalizing michael@0: * (and optionally case-folding) both strings entirely, michael@0: * improving performance significantly. michael@0: * michael@0: * Bulk normalization is only necessary if the strings do not fulfill the FCD michael@0: * conditions. Only in this case, and only if the strings are relatively long, michael@0: * is memory allocated temporarily. michael@0: * For FCD strings and short non-FCD strings there is no memory allocation. michael@0: * michael@0: * Semantically, this is equivalent to michael@0: * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) michael@0: * where code point order and foldCase are all optional. michael@0: * michael@0: * UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match michael@0: * the case folding must be performed first, then the normalization. michael@0: * michael@0: * @param s1 First source string. michael@0: * @param s2 Second source string. michael@0: * michael@0: * @param options A bit set of options: michael@0: * - U_FOLD_CASE_DEFAULT or 0 is used for default options: michael@0: * Case-sensitive comparison in code unit order, and the input strings michael@0: * are quick-checked for FCD. michael@0: * michael@0: * - UNORM_INPUT_IS_FCD michael@0: * Set if the caller knows that both s1 and s2 fulfill the FCD conditions. michael@0: * If not set, the function will quickCheck for FCD michael@0: * and normalize if necessary. michael@0: * michael@0: * - U_COMPARE_CODE_POINT_ORDER michael@0: * Set to choose code point order instead of code unit order michael@0: * (see u_strCompare for details). michael@0: * michael@0: * - U_COMPARE_IGNORE_CASE michael@0: * Set to compare strings case-insensitively using case folding, michael@0: * instead of case-sensitively. michael@0: * If set, then the following case folding options are used. michael@0: * michael@0: * - Options as used with case-insensitive comparisons, currently: michael@0: * michael@0: * - U_FOLD_CASE_EXCLUDE_SPECIAL_I michael@0: * (see u_strCaseCompare for details) michael@0: * michael@0: * - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT michael@0: * michael@0: * @param errorCode ICU error code in/out parameter. michael@0: * Must fulfill U_SUCCESS before the function call. michael@0: * @return <0 or 0 or >0 as usual for string comparisons michael@0: * michael@0: * @see unorm_compare michael@0: * @see normalize michael@0: * @see UNORM_FCD michael@0: * @see u_strCompare michael@0: * @see u_strCaseCompare michael@0: * michael@0: * @stable ICU 2.2 michael@0: */ michael@0: static inline int32_t michael@0: compare(const UnicodeString &s1, const UnicodeString &s2, michael@0: uint32_t options, michael@0: UErrorCode &errorCode); michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Iteration API michael@0: //------------------------------------------------------------------------- michael@0: michael@0: /** michael@0: * Return the current character in the normalized text. michael@0: * current() may need to normalize some text at getIndex(). michael@0: * The getIndex() is not changed. michael@0: * michael@0: * @return the current normalized code point michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UChar32 current(void); michael@0: michael@0: /** michael@0: * Return the first character in the normalized text. michael@0: * This is equivalent to setIndexOnly(startIndex()) followed by next(). michael@0: * (Post-increment semantics.) michael@0: * michael@0: * @return the first normalized code point michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UChar32 first(void); michael@0: michael@0: /** michael@0: * Return the last character in the normalized text. michael@0: * This is equivalent to setIndexOnly(endIndex()) followed by previous(). michael@0: * (Pre-decrement semantics.) michael@0: * michael@0: * @return the last normalized code point michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UChar32 last(void); michael@0: michael@0: /** michael@0: * Return the next character in the normalized text. michael@0: * (Post-increment semantics.) michael@0: * If the end of the text has already been reached, DONE is returned. michael@0: * The DONE value could be confused with a U+FFFF non-character code point michael@0: * in the text. If this is possible, you can test getIndex()startIndex() || first()!=DONE). (Calling first() will change michael@0: * the iterator state!) michael@0: * michael@0: * The C API unorm_previous() is more efficient and does not have this ambiguity. michael@0: * michael@0: * @return the previous normalized code point michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UChar32 previous(void); michael@0: michael@0: /** michael@0: * Set the iteration position in the input text that is being normalized, michael@0: * without any immediate normalization. michael@0: * After setIndexOnly(), getIndex() will return the same index that is michael@0: * specified here. michael@0: * michael@0: * @param index the desired index in the input text. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setIndexOnly(int32_t index); michael@0: michael@0: /** michael@0: * Reset the index to the beginning of the text. michael@0: * This is equivalent to setIndexOnly(startIndex)). michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void reset(void); michael@0: michael@0: /** michael@0: * Retrieve the current iteration position in the input text that is michael@0: * being normalized. michael@0: * michael@0: * A following call to next() will return a normalized code point from michael@0: * the input text at or after this index. michael@0: * michael@0: * After a call to previous(), getIndex() will point at or before the michael@0: * position in the input text where the normalized code point michael@0: * was returned from with previous(). michael@0: * michael@0: * @return the current index in the input text michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t getIndex(void) const; michael@0: michael@0: /** michael@0: * Retrieve the index of the start of the input text. This is the begin index michael@0: * of the CharacterIterator or the start (i.e. index 0) of the string michael@0: * over which this Normalizer is iterating. michael@0: * michael@0: * @return the smallest index in the input text where the Normalizer operates michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t startIndex(void) const; michael@0: michael@0: /** michael@0: * Retrieve the index of the end of the input text. This is the end index michael@0: * of the CharacterIterator or the length of the string michael@0: * over which this Normalizer is iterating. michael@0: * This end index is exclusive, i.e., the Normalizer operates only on characters michael@0: * before this index. michael@0: * michael@0: * @return the first index in the input text where the Normalizer does not operate michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t endIndex(void) const; michael@0: michael@0: /** michael@0: * Returns TRUE when both iterators refer to the same character in the same michael@0: * input text. michael@0: * michael@0: * @param that a Normalizer object to compare this one to michael@0: * @return comparison result michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UBool operator==(const Normalizer& that) const; michael@0: michael@0: /** michael@0: * Returns FALSE when both iterators refer to the same character in the same michael@0: * input text. michael@0: * michael@0: * @param that a Normalizer object to compare this one to michael@0: * @return comparison result michael@0: * @stable ICU 2.0 michael@0: */ michael@0: inline UBool operator!=(const Normalizer& that) const; michael@0: michael@0: /** michael@0: * Returns a pointer to a new Normalizer that is a clone of this one. michael@0: * The caller is responsible for deleting the new clone. michael@0: * @return a pointer to a new Normalizer michael@0: * @stable ICU 2.0 michael@0: */ michael@0: Normalizer* clone(void) const; michael@0: michael@0: /** michael@0: * Generates a hash code for this iterator. michael@0: * michael@0: * @return the hash code michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t hashCode(void) const; michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Property access methods michael@0: //------------------------------------------------------------------------- michael@0: michael@0: /** michael@0: * Set the normalization mode for this object. michael@0: *

michael@0: * Note:If the normalization mode is changed while iterating michael@0: * over a string, calls to {@link #next() } and {@link #previous() } may michael@0: * return previously buffers characters in the old normalization mode michael@0: * until the iteration is able to re-sync at the next base character. michael@0: * It is safest to call {@link #setIndexOnly }, {@link #reset() }, michael@0: * {@link #setText }, {@link #first() }, michael@0: * {@link #last() }, etc. after calling setMode. michael@0: *

michael@0: * @param newMode the new mode for this Normalizer. michael@0: * @see #getUMode michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setMode(UNormalizationMode newMode); michael@0: michael@0: /** michael@0: * Return the normalization mode for this object. michael@0: * michael@0: * This is an unusual name because there used to be a getMode() that michael@0: * returned a different type. michael@0: * michael@0: * @return the mode for this Normalizer michael@0: * @see #setMode michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UNormalizationMode getUMode(void) const; michael@0: michael@0: /** michael@0: * Set options that affect this Normalizer's operation. michael@0: * Options do not change the basic composition or decomposition operation michael@0: * that is being performed, but they control whether michael@0: * certain optional portions of the operation are done. michael@0: * Currently the only available option is obsolete. michael@0: * michael@0: * It is possible to specify multiple options that are all turned on or off. michael@0: * michael@0: * @param option the option(s) whose value is/are to be set. michael@0: * @param value the new setting for the option. Use TRUE to michael@0: * turn the option(s) on and FALSE to turn it/them off. michael@0: * michael@0: * @see #getOption michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setOption(int32_t option, michael@0: UBool value); michael@0: michael@0: /** michael@0: * Determine whether an option is turned on or off. michael@0: * If multiple options are specified, then the result is TRUE if any michael@0: * of them are set. michael@0: *

michael@0: * @param option the option(s) that are to be checked michael@0: * @return TRUE if any of the option(s) are set michael@0: * @see #setOption michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UBool getOption(int32_t option) const; michael@0: michael@0: /** michael@0: * Set the input text over which this Normalizer will iterate. michael@0: * The iteration position is set to the beginning. michael@0: * michael@0: * @param newText a string that replaces the current input text michael@0: * @param status a UErrorCode michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setText(const UnicodeString& newText, michael@0: UErrorCode &status); michael@0: michael@0: /** michael@0: * Set the input text over which this Normalizer will iterate. michael@0: * The iteration position is set to the beginning. michael@0: * michael@0: * @param newText a CharacterIterator object that replaces the current input text michael@0: * @param status a UErrorCode michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setText(const CharacterIterator& newText, michael@0: UErrorCode &status); michael@0: michael@0: /** michael@0: * Set the input text over which this Normalizer will iterate. michael@0: * The iteration position is set to the beginning. michael@0: * michael@0: * @param newText a string that replaces the current input text michael@0: * @param length the length of the string, or -1 if NUL-terminated michael@0: * @param status a UErrorCode michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setText(const UChar* newText, michael@0: int32_t length, michael@0: UErrorCode &status); michael@0: /** michael@0: * Copies the input text into the UnicodeString argument. michael@0: * michael@0: * @param result Receives a copy of the text under iteration. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void getText(UnicodeString& result); michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for this class. michael@0: * @returns a UClassID for this class. michael@0: * @stable ICU 2.2 michael@0: */ michael@0: static UClassID U_EXPORT2 getStaticClassID(); michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for the actual class. michael@0: * @return a UClassID for the actual class. michael@0: * @stable ICU 2.2 michael@0: */ michael@0: virtual UClassID getDynamicClassID() const; michael@0: michael@0: private: michael@0: //------------------------------------------------------------------------- michael@0: // Private functions michael@0: //------------------------------------------------------------------------- michael@0: michael@0: Normalizer(); // default constructor not implemented michael@0: Normalizer &operator=(const Normalizer &that); // assignment operator not implemented michael@0: michael@0: // Private utility methods for iteration michael@0: // For documentation, see the source code michael@0: UBool nextNormalize(); michael@0: UBool previousNormalize(); michael@0: michael@0: void init(); michael@0: void clearBuffer(void); michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Private data michael@0: //------------------------------------------------------------------------- michael@0: michael@0: FilteredNormalizer2*fFilteredNorm2; // owned if not NULL michael@0: const Normalizer2 *fNorm2; // not owned; may be equal to fFilteredNorm2 michael@0: UNormalizationMode fUMode; michael@0: int32_t fOptions; michael@0: michael@0: // The input text and our position in it michael@0: CharacterIterator *text; michael@0: michael@0: // The normalization buffer is the result of normalization michael@0: // of the source in [currentIndex..nextIndex[ . michael@0: int32_t currentIndex, nextIndex; michael@0: michael@0: // A buffer for holding intermediate results michael@0: UnicodeString buffer; michael@0: int32_t bufferPos; michael@0: }; michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Inline implementations michael@0: //------------------------------------------------------------------------- michael@0: michael@0: inline UBool michael@0: Normalizer::operator!= (const Normalizer& other) const michael@0: { return ! operator==(other); } michael@0: michael@0: inline UNormalizationCheckResult michael@0: Normalizer::quickCheck(const UnicodeString& source, michael@0: UNormalizationMode mode, michael@0: UErrorCode &status) { michael@0: return quickCheck(source, mode, 0, status); michael@0: } michael@0: michael@0: inline UBool michael@0: Normalizer::isNormalized(const UnicodeString& source, michael@0: UNormalizationMode mode, michael@0: UErrorCode &status) { michael@0: return isNormalized(source, mode, 0, status); michael@0: } michael@0: michael@0: inline int32_t michael@0: Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2, michael@0: uint32_t options, michael@0: UErrorCode &errorCode) { michael@0: // all argument checking is done in unorm_compare michael@0: return unorm_compare(s1.getBuffer(), s1.length(), michael@0: s2.getBuffer(), s2.length(), michael@0: options, michael@0: &errorCode); michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_NORMALIZATION */ michael@0: michael@0: #endif // NORMLZR_H