michael@0: /* michael@0: ************************************************************************* michael@0: * COPYRIGHT: michael@0: * Copyright (c) 1996-2012, International Business Machines Corporation and michael@0: * others. All Rights Reserved. michael@0: ************************************************************************* michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_NORMALIZATION michael@0: michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/chariter.h" michael@0: #include "unicode/schriter.h" michael@0: #include "unicode/uchriter.h" michael@0: #include "unicode/normlzr.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "normalizer2impl.h" michael@0: #include "uprops.h" // for uniset_getUnicode32Instance() michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Constructors and other boilerplate michael@0: //------------------------------------------------------------------------- michael@0: michael@0: Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : michael@0: UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), michael@0: text(new StringCharacterIterator(str)), michael@0: currentIndex(0), nextIndex(0), michael@0: buffer(), bufferPos(0) michael@0: { michael@0: init(); michael@0: } michael@0: michael@0: Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : michael@0: UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), michael@0: text(new UCharCharacterIterator(str, length)), michael@0: currentIndex(0), nextIndex(0), michael@0: buffer(), bufferPos(0) michael@0: { michael@0: init(); michael@0: } michael@0: michael@0: Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : michael@0: UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), michael@0: text(iter.clone()), michael@0: currentIndex(0), nextIndex(0), michael@0: buffer(), bufferPos(0) michael@0: { michael@0: init(); michael@0: } michael@0: michael@0: Normalizer::Normalizer(const Normalizer ©) : michael@0: UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), michael@0: text(copy.text->clone()), michael@0: currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), michael@0: buffer(copy.buffer), bufferPos(copy.bufferPos) michael@0: { michael@0: init(); michael@0: } michael@0: michael@0: void michael@0: Normalizer::init() { michael@0: UErrorCode errorCode=U_ZERO_ERROR; michael@0: fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); michael@0: if(fOptions&UNORM_UNICODE_3_2) { michael@0: delete fFilteredNorm2; michael@0: fNorm2=fFilteredNorm2= michael@0: new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); michael@0: } michael@0: if(U_FAILURE(errorCode)) { michael@0: errorCode=U_ZERO_ERROR; michael@0: fNorm2=Normalizer2Factory::getNoopInstance(errorCode); michael@0: } michael@0: } michael@0: michael@0: Normalizer::~Normalizer() michael@0: { michael@0: delete fFilteredNorm2; michael@0: delete text; michael@0: } michael@0: michael@0: Normalizer* michael@0: Normalizer::clone() const michael@0: { michael@0: return new Normalizer(*this); michael@0: } michael@0: michael@0: /** michael@0: * Generates a hash code for this iterator. michael@0: */ michael@0: int32_t Normalizer::hashCode() const michael@0: { michael@0: return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; michael@0: } michael@0: michael@0: UBool Normalizer::operator==(const Normalizer& that) const michael@0: { michael@0: return michael@0: this==&that || michael@0: (fUMode==that.fUMode && michael@0: fOptions==that.fOptions && michael@0: *text==*that.text && michael@0: buffer==that.buffer && michael@0: bufferPos==that.bufferPos && michael@0: nextIndex==that.nextIndex); michael@0: } michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Static utility methods michael@0: //------------------------------------------------------------------------- michael@0: michael@0: void U_EXPORT2 michael@0: Normalizer::normalize(const UnicodeString& source, michael@0: UNormalizationMode mode, int32_t options, michael@0: UnicodeString& result, michael@0: UErrorCode &status) { michael@0: if(source.isBogus() || U_FAILURE(status)) { michael@0: result.setToBogus(); michael@0: if(U_SUCCESS(status)) { michael@0: status=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: } else { michael@0: UnicodeString localDest; michael@0: UnicodeString *dest; michael@0: michael@0: if(&source!=&result) { michael@0: dest=&result; michael@0: } else { michael@0: // the source and result strings are the same object, use a temporary one michael@0: dest=&localDest; michael@0: } michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); michael@0: if(U_SUCCESS(status)) { michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). michael@0: normalize(source, *dest, status); michael@0: } else { michael@0: n2->normalize(source, *dest, status); michael@0: } michael@0: } michael@0: if(dest==&localDest && U_SUCCESS(status)) { michael@0: result=*dest; michael@0: } michael@0: } michael@0: } michael@0: michael@0: void U_EXPORT2 michael@0: Normalizer::compose(const UnicodeString& source, michael@0: UBool compat, int32_t options, michael@0: UnicodeString& result, michael@0: UErrorCode &status) { michael@0: normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); michael@0: } michael@0: michael@0: void U_EXPORT2 michael@0: Normalizer::decompose(const UnicodeString& source, michael@0: UBool compat, int32_t options, michael@0: UnicodeString& result, michael@0: UErrorCode &status) { michael@0: normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); michael@0: } michael@0: michael@0: UNormalizationCheckResult michael@0: Normalizer::quickCheck(const UnicodeString& source, michael@0: UNormalizationMode mode, int32_t options, michael@0: UErrorCode &status) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); michael@0: if(U_SUCCESS(status)) { michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). michael@0: quickCheck(source, status); michael@0: } else { michael@0: return n2->quickCheck(source, status); michael@0: } michael@0: } else { michael@0: return UNORM_MAYBE; michael@0: } michael@0: } michael@0: michael@0: UBool michael@0: Normalizer::isNormalized(const UnicodeString& source, michael@0: UNormalizationMode mode, int32_t options, michael@0: UErrorCode &status) { michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); michael@0: if(U_SUCCESS(status)) { michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). michael@0: isNormalized(source, status); michael@0: } else { michael@0: return n2->isNormalized(source, status); michael@0: } michael@0: } else { michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: UnicodeString & U_EXPORT2 michael@0: Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, michael@0: UnicodeString &result, michael@0: UNormalizationMode mode, int32_t options, michael@0: UErrorCode &errorCode) { michael@0: if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { michael@0: result.setToBogus(); michael@0: if(U_SUCCESS(errorCode)) { michael@0: errorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: } else { michael@0: UnicodeString localDest; michael@0: UnicodeString *dest; michael@0: michael@0: if(&right!=&result) { michael@0: dest=&result; michael@0: } else { michael@0: // the right and result strings are the same object, use a temporary one michael@0: dest=&localDest; michael@0: } michael@0: *dest=left; michael@0: const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); michael@0: if(U_SUCCESS(errorCode)) { michael@0: if(options&UNORM_UNICODE_3_2) { michael@0: FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). michael@0: append(*dest, right, errorCode); michael@0: } else { michael@0: n2->append(*dest, right, errorCode); michael@0: } michael@0: } michael@0: if(dest==&localDest && U_SUCCESS(errorCode)) { michael@0: result=*dest; michael@0: } michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Iteration API michael@0: //------------------------------------------------------------------------- michael@0: michael@0: /** michael@0: * Return the current character in the normalized text. michael@0: */ michael@0: UChar32 Normalizer::current() { michael@0: if(bufferPos0 || previousNormalize()) { michael@0: UChar32 c=buffer.char32At(bufferPos-1); michael@0: bufferPos-=U16_LENGTH(c); michael@0: return c; michael@0: } else { michael@0: return DONE; michael@0: } michael@0: } michael@0: michael@0: void Normalizer::reset() { michael@0: currentIndex=nextIndex=text->setToStart(); michael@0: clearBuffer(); michael@0: } michael@0: michael@0: void michael@0: Normalizer::setIndexOnly(int32_t index) { michael@0: text->setIndex(index); // pins index michael@0: currentIndex=nextIndex=text->getIndex(); michael@0: clearBuffer(); michael@0: } michael@0: michael@0: /** michael@0: * Return the first character in the normalized text. This resets michael@0: * the Normalizer's position to the beginning of the text. michael@0: */ michael@0: UChar32 Normalizer::first() { michael@0: reset(); michael@0: return next(); michael@0: } michael@0: michael@0: /** michael@0: * Return the last character in the normalized text. This resets michael@0: * the Normalizer's position to be just before the michael@0: * the input text corresponding to that normalized character. michael@0: */ michael@0: UChar32 Normalizer::last() { michael@0: currentIndex=nextIndex=text->setToEnd(); michael@0: clearBuffer(); michael@0: return previous(); michael@0: } michael@0: michael@0: /** michael@0: * Retrieve the current iteration position in the input text that is michael@0: * being normalized. This method is useful in applications such as michael@0: * searching, where you need to be able to determine the position in michael@0: * the input text that corresponds to a given normalized output character. michael@0: *

michael@0: * Note: This method sets the position in the input, while michael@0: * {@link #next} and {@link #previous} iterate through characters in the michael@0: * output. This means that there is not necessarily a one-to-one michael@0: * correspondence between characters returned by next and michael@0: * previous and the indices passed to and returned from michael@0: * setIndex and {@link #getIndex}. michael@0: * michael@0: */ michael@0: int32_t Normalizer::getIndex() const { michael@0: if(bufferPosCharacterIterator or the start (i.e. 0) of the String michael@0: * over which this Normalizer is iterating michael@0: */ michael@0: int32_t Normalizer::startIndex() const { michael@0: return text->startIndex(); michael@0: } michael@0: michael@0: /** michael@0: * Retrieve the index of the end of the input text. This is the end index michael@0: * of the CharacterIterator or the length of the String michael@0: * over which this Normalizer is iterating michael@0: */ michael@0: int32_t Normalizer::endIndex() const { michael@0: return text->endIndex(); michael@0: } michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Property access methods michael@0: //------------------------------------------------------------------------- michael@0: michael@0: void michael@0: Normalizer::setMode(UNormalizationMode newMode) michael@0: { michael@0: fUMode = newMode; michael@0: init(); michael@0: } michael@0: michael@0: UNormalizationMode michael@0: Normalizer::getUMode() const michael@0: { michael@0: return fUMode; michael@0: } michael@0: michael@0: void michael@0: Normalizer::setOption(int32_t option, michael@0: UBool value) michael@0: { michael@0: if (value) { michael@0: fOptions |= option; michael@0: } else { michael@0: fOptions &= (~option); michael@0: } michael@0: init(); michael@0: } michael@0: michael@0: UBool michael@0: Normalizer::getOption(int32_t option) const michael@0: { michael@0: return (fOptions & option) != 0; michael@0: } michael@0: michael@0: /** michael@0: * Set the input text over which this Normalizer will iterate. michael@0: * The iteration position is set to the beginning of the input text. michael@0: */ michael@0: void michael@0: Normalizer::setText(const UnicodeString& newText, michael@0: UErrorCode &status) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: CharacterIterator *newIter = new StringCharacterIterator(newText); michael@0: if (newIter == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: delete text; michael@0: text = newIter; michael@0: reset(); michael@0: } michael@0: michael@0: /** michael@0: * Set the input text over which this Normalizer will iterate. michael@0: * The iteration position is set to the beginning of the string. michael@0: */ michael@0: void michael@0: Normalizer::setText(const CharacterIterator& newText, michael@0: UErrorCode &status) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: CharacterIterator *newIter = newText.clone(); michael@0: if (newIter == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: delete text; michael@0: text = newIter; michael@0: reset(); michael@0: } michael@0: michael@0: void michael@0: Normalizer::setText(const UChar* newText, michael@0: int32_t length, michael@0: UErrorCode &status) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: CharacterIterator *newIter = new UCharCharacterIterator(newText, length); michael@0: if (newIter == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: delete text; michael@0: text = newIter; michael@0: reset(); michael@0: } michael@0: michael@0: /** michael@0: * Copies the text under iteration into the UnicodeString referred to by "result". michael@0: * @param result Receives a copy of the text under iteration. michael@0: */ michael@0: void michael@0: Normalizer::getText(UnicodeString& result) michael@0: { michael@0: text->getText(result); michael@0: } michael@0: michael@0: //------------------------------------------------------------------------- michael@0: // Private utility methods michael@0: //------------------------------------------------------------------------- michael@0: michael@0: void Normalizer::clearBuffer() { michael@0: buffer.remove(); michael@0: bufferPos=0; michael@0: } michael@0: michael@0: UBool michael@0: Normalizer::nextNormalize() { michael@0: clearBuffer(); michael@0: currentIndex=nextIndex; michael@0: text->setIndex(nextIndex); michael@0: if(!text->hasNext()) { michael@0: return FALSE; michael@0: } michael@0: // Skip at least one character so we make progress. michael@0: UnicodeString segment(text->next32PostInc()); michael@0: while(text->hasNext()) { michael@0: UChar32 c; michael@0: if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { michael@0: text->move32(-1, CharacterIterator::kCurrent); michael@0: break; michael@0: } michael@0: segment.append(c); michael@0: } michael@0: nextIndex=text->getIndex(); michael@0: UErrorCode errorCode=U_ZERO_ERROR; michael@0: fNorm2->normalize(segment, buffer, errorCode); michael@0: return U_SUCCESS(errorCode) && !buffer.isEmpty(); michael@0: } michael@0: michael@0: UBool michael@0: Normalizer::previousNormalize() { michael@0: clearBuffer(); michael@0: nextIndex=currentIndex; michael@0: text->setIndex(currentIndex); michael@0: if(!text->hasPrevious()) { michael@0: return FALSE; michael@0: } michael@0: UnicodeString segment; michael@0: while(text->hasPrevious()) { michael@0: UChar32 c=text->previous32(); michael@0: segment.insert(0, c); michael@0: if(fNorm2->hasBoundaryBefore(c)) { michael@0: break; michael@0: } michael@0: } michael@0: currentIndex=text->getIndex(); michael@0: UErrorCode errorCode=U_ZERO_ERROR; michael@0: fNorm2->normalize(segment, buffer, errorCode); michael@0: bufferPos=buffer.length(); michael@0: return U_SUCCESS(errorCode) && !buffer.isEmpty(); michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_NORMALIZATION */