michael@0: /*
michael@0:  *************************************************************************
michael@0:  * COPYRIGHT: 
michael@0:  * Copyright (c) 1996-2012, International Business Machines Corporation and
michael@0:  * others. All Rights Reserved.
michael@0:  *************************************************************************
michael@0:  */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if !UCONFIG_NO_NORMALIZATION
michael@0: 
michael@0: #include "unicode/uniset.h"
michael@0: #include "unicode/unistr.h"
michael@0: #include "unicode/chariter.h"
michael@0: #include "unicode/schriter.h"
michael@0: #include "unicode/uchriter.h"
michael@0: #include "unicode/normlzr.h"
michael@0: #include "unicode/utf16.h"
michael@0: #include "cmemory.h"
michael@0: #include "normalizer2impl.h"
michael@0: #include "uprops.h"  // for uniset_getUnicode32Instance()
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)  // RTTI boilerplate for Normalizer; the macro itself is defined outside this file
michael@0: 
michael@0: //-------------------------------------------------------------------------
michael@0: // Constructors and other boilerplate
michael@0: //-------------------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Creates a Normalizer that reads its input through a newly allocated
michael@0:  * StringCharacterIterator over str.
michael@0:  * No options are set, both input indexes and the output buffer position
michael@0:  * start at zero, and init() picks the Normalizer2 implementation for mode.
michael@0:  *
michael@0:  * @param str  the text to iterate over
michael@0:  * @param mode the normalization mode to apply
michael@0:  */
michael@0: Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode)
michael@0:     : UObject(),
michael@0:       fFilteredNorm2(NULL),
michael@0:       fNorm2(NULL),
michael@0:       fUMode(mode),
michael@0:       fOptions(0),
michael@0:       text(new StringCharacterIterator(str)),
michael@0:       currentIndex(0),
michael@0:       nextIndex(0),
michael@0:       buffer(),
michael@0:       bufferPos(0)
michael@0: {
michael@0:     // fNorm2 is still NULL at this point; init() fills it in.
michael@0:     init();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Creates a Normalizer that reads its input through a newly allocated
michael@0:  * UCharCharacterIterator constructed from str and length.
michael@0:  * No options are set, both input indexes and the output buffer position
michael@0:  * start at zero, and init() picks the Normalizer2 implementation for mode.
michael@0:  *
michael@0:  * @param str    pointer to the UChar text to iterate over
michael@0:  * @param length length value passed on to the UCharCharacterIterator
michael@0:  * @param mode   the normalization mode to apply
michael@0:  */
michael@0: Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode)
michael@0:     : UObject(),
michael@0:       fFilteredNorm2(NULL),
michael@0:       fNorm2(NULL),
michael@0:       fUMode(mode),
michael@0:       fOptions(0),
michael@0:       text(new UCharCharacterIterator(str, length)),
michael@0:       currentIndex(0),
michael@0:       nextIndex(0),
michael@0:       buffer(),
michael@0:       bufferPos(0)
michael@0: {
michael@0:     // fNorm2 is still NULL at this point; init() fills it in.
michael@0:     init();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Creates a Normalizer that reads its input through a clone of iter; the
michael@0:  * caller's iterator itself is not stored.
michael@0:  * No options are set, both input indexes and the output buffer position
michael@0:  * start at zero, and init() picks the Normalizer2 implementation for mode.
michael@0:  *
michael@0:  * @param iter the iterator whose clone supplies the input text
michael@0:  * @param mode the normalization mode to apply
michael@0:  */
michael@0: Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode)
michael@0:     : UObject(),
michael@0:       fFilteredNorm2(NULL),
michael@0:       fNorm2(NULL),
michael@0:       fUMode(mode),
michael@0:       fOptions(0),
michael@0:       text(iter.clone()),
michael@0:       currentIndex(0),
michael@0:       nextIndex(0),
michael@0:       buffer(),
michael@0:       bufferPos(0)
michael@0: {
michael@0:     // fNorm2 is still NULL at this point; init() fills it in.
michael@0:     init();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Copy constructor.
michael@0:  * Mode, options, indexes, buffer and buffer position are copied from the
michael@0:  * source object and its text iterator is cloned.  The Normalizer2 pointers
michael@0:  * are not copied: they start out NULL and are set up again by init().
michael@0:  *
michael@0:  * @param copy the Normalizer to duplicate
michael@0:  */
michael@0: Normalizer::Normalizer(const Normalizer &copy)
michael@0:     : UObject(copy),
michael@0:       fFilteredNorm2(NULL),
michael@0:       fNorm2(NULL),
michael@0:       fUMode(copy.fUMode),
michael@0:       fOptions(copy.fOptions),
michael@0:       text(copy.text->clone()),
michael@0:       currentIndex(copy.currentIndex),
michael@0:       nextIndex(copy.nextIndex),
michael@0:       buffer(copy.buffer),
michael@0:       bufferPos(copy.bufferPos)
michael@0: {
michael@0:     // Build this object's own Normalizer2 setup from the copied mode/options.
michael@0:     init();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * (Re)selects the Normalizer2 implementation that matches fUMode and fOptions.
michael@0:  *
michael@0:  * fNorm2 is set to the Normalizer2Factory instance for fUMode.  If the
michael@0:  * UNORM_UNICODE_3_2 option is set, that instance is wrapped in a newly
michael@0:  * allocated FilteredNormalizer2 (owned through fFilteredNorm2) that uses the
michael@0:  * set returned by uniset_getUnicode32Instance().
michael@0:  *
michael@0:  * If any step fails - a factory call reports an error, returns NULL, or the
michael@0:  * wrapper cannot be allocated - fNorm2 falls back to the instance from
michael@0:  * Normalizer2Factory::getNoopInstance() and fFilteredNorm2 stays NULL.
michael@0:  * No pointer is dereferenced before the call that produced it has been
michael@0:  * checked for success.
michael@0:  */
michael@0: void
michael@0: Normalizer::init() {
michael@0:     UErrorCode errorCode=U_ZERO_ERROR;
michael@0:     // Release a wrapper left over from an earlier configuration up front, so
michael@0:     // that fFilteredNorm2 never dangles and is not kept alive after the
michael@0:     // Unicode 3.2 option has been switched off.
michael@0:     delete fFilteredNorm2;
michael@0:     fFilteredNorm2=NULL;
michael@0:     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
michael@0:     if(U_SUCCESS(errorCode) && (fOptions&UNORM_UNICODE_3_2)!=0) {
michael@0:         const UnicodeSet *unicode32=uniset_getUnicode32Instance(errorCode);
michael@0:         if(U_SUCCESS(errorCode) && fNorm2!=NULL && unicode32!=NULL) {
michael@0:             fFilteredNorm2=new FilteredNormalizer2(*fNorm2, *unicode32);
michael@0:         }
michael@0:         // Still NULL here means setup or allocation failed; the check
michael@0:         // below then installs the no-op normalizer.
michael@0:         fNorm2=fFilteredNorm2;
michael@0:     }
michael@0:     if(U_FAILURE(errorCode) || fNorm2==NULL) {
michael@0:         errorCode=U_ZERO_ERROR;
michael@0:         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Destructor.  Frees the text iterator and the FilteredNormalizer2 wrapper,
michael@0:  * if one was created.
michael@0:  */
michael@0: Normalizer::~Normalizer()
michael@0: {
michael@0:     // fNorm2 is not deleted here: init() sets it either to fFilteredNorm2
michael@0:     // or to an instance obtained from Normalizer2Factory.
michael@0:     delete text;
michael@0:     delete fFilteredNorm2;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns a heap-allocated copy of this object made with the copy
michael@0:  * constructor.
michael@0:  *
michael@0:  * @return pointer to the new Normalizer
michael@0:  */
michael@0: Normalizer*
michael@0: Normalizer::clone() const
michael@0: {
michael@0:     Normalizer *duplicate=new Normalizer(*this);
michael@0:     return duplicate;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Computes a hash code for this iterator as the sum of the text iterator's
michael@0:  * hash code, the mode, the options, the buffer's hash code, the buffer
michael@0:  * position and both input indexes.
michael@0:  *
michael@0:  * @return the hash code
michael@0:  */
michael@0: int32_t Normalizer::hashCode() const
michael@0: {
michael@0:     int32_t hash=text->hashCode();
michael@0:     hash+=fUMode;
michael@0:     hash+=fOptions;
michael@0:     hash+=buffer.hashCode();
michael@0:     hash+=bufferPos;
michael@0:     hash+=currentIndex;
michael@0:     hash+=nextIndex;
michael@0:     return hash;
michael@0: }
michael@0:     
michael@0: /**
michael@0:  * Compares two Normalizer objects.
michael@0:  * They are equal if they are the same object, or if mode, options, text
michael@0:  * iterator, buffer, buffer position and nextIndex all compare equal.
michael@0:  *
michael@0:  * @param that the Normalizer to compare with
michael@0:  * @return non-zero if the two objects are equal
michael@0:  */
michael@0: UBool Normalizer::operator==(const Normalizer& that) const
michael@0: {
michael@0:     // Identity short-cut: no member comparison needed.
michael@0:     if(this==&that) {
michael@0:         return TRUE;
michael@0:     }
michael@0:     return
michael@0:         fUMode==that.fUMode &&
michael@0:         fOptions==that.fOptions &&
michael@0:         *text==*that.text &&
michael@0:         buffer==that.buffer &&
michael@0:         bufferPos==that.bufferPos &&
michael@0:         nextIndex==that.nextIndex;
michael@0: }
michael@0: 
michael@0: //-------------------------------------------------------------------------
michael@0: // Static utility methods
michael@0: //-------------------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Normalizes source into result with the Normalizer2 instance for mode.
michael@0:  *
michael@0:  * If source is bogus or status already indicates a failure, result is set
michael@0:  * to bogus, and status is set to U_ILLEGAL_ARGUMENT_ERROR if it was still a
michael@0:  * success code.  When UNORM_UNICODE_3_2 is set in options, the work is done
michael@0:  * by a FilteredNormalizer2 built with the set from
michael@0:  * uniset_getUnicode32Instance().
michael@0:  *
michael@0:  * source and result may be the same object: the output is then produced in
michael@0:  * a temporary string and assigned to result only on success.
michael@0:  *
michael@0:  * @param source  the string to normalize
michael@0:  * @param mode    the normalization mode
michael@0:  * @param options option bits; only UNORM_UNICODE_3_2 is examined here
michael@0:  * @param result  receives the normalized text
michael@0:  * @param status  in/out error code
michael@0:  */
michael@0: void U_EXPORT2
michael@0: Normalizer::normalize(const UnicodeString& source, 
michael@0:                       UNormalizationMode mode, int32_t options,
michael@0:                       UnicodeString& result, 
michael@0:                       UErrorCode &status) {
michael@0:     if(source.isBogus() || U_FAILURE(status)) {
michael@0:         result.setToBogus();
michael@0:         if(U_SUCCESS(status)) {
michael@0:             status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         }
michael@0:         return;
michael@0:     }
michael@0:     // Write into a scratch string when input and output are the same object.
michael@0:     UnicodeString scratch;
michael@0:     const UBool aliased=(&source==&result);
michael@0:     UnicodeString &dest=aliased ? scratch : result;
michael@0:     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
michael@0:     if(U_SUCCESS(status)) {
michael@0:         if((options&UNORM_UNICODE_3_2)!=0) {
michael@0:             FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
michael@0:             filtered.normalize(source, dest, status);
michael@0:         } else {
michael@0:             n2->normalize(source, dest, status);
michael@0:         }
michael@0:     }
michael@0:     if(aliased && U_SUCCESS(status)) {
michael@0:         result=scratch;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Convenience wrapper around normalize(): uses UNORM_NFKC when compat is
michael@0:  * set and UNORM_NFC otherwise.
michael@0:  *
michael@0:  * @param source  the string to normalize
michael@0:  * @param compat  selects UNORM_NFKC (non-zero) or UNORM_NFC (zero)
michael@0:  * @param options option bits passed through to normalize()
michael@0:  * @param result  receives the normalized text
michael@0:  * @param status  in/out error code
michael@0:  */
michael@0: void U_EXPORT2
michael@0: Normalizer::compose(const UnicodeString& source, 
michael@0:                     UBool compat, int32_t options,
michael@0:                     UnicodeString& result, 
michael@0:                     UErrorCode &status) {
michael@0:     const UNormalizationMode composeMode=compat ? UNORM_NFKC : UNORM_NFC;
michael@0:     normalize(source, composeMode, options, result, status);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Convenience wrapper around normalize(): uses UNORM_NFKD when compat is
michael@0:  * set and UNORM_NFD otherwise.
michael@0:  *
michael@0:  * @param source  the string to normalize
michael@0:  * @param compat  selects UNORM_NFKD (non-zero) or UNORM_NFD (zero)
michael@0:  * @param options option bits passed through to normalize()
michael@0:  * @param result  receives the normalized text
michael@0:  * @param status  in/out error code
michael@0:  */
michael@0: void U_EXPORT2
michael@0: Normalizer::decompose(const UnicodeString& source, 
michael@0:                       UBool compat, int32_t options,
michael@0:                       UnicodeString& result, 
michael@0:                       UErrorCode &status) {
michael@0:     const UNormalizationMode decomposeMode=compat ? UNORM_NFKD : UNORM_NFD;
michael@0:     normalize(source, decomposeMode, options, result, status);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Runs quickCheck() on source with the Normalizer2 instance for mode, or
michael@0:  * with a FilteredNormalizer2 around it when UNORM_UNICODE_3_2 is set in
michael@0:  * options.
michael@0:  *
michael@0:  * @param source  the string to check
michael@0:  * @param mode    the normalization mode
michael@0:  * @param options option bits; only UNORM_UNICODE_3_2 is examined here
michael@0:  * @param status  in/out error code
michael@0:  * @return the quickCheck() result, or UNORM_MAYBE if the Normalizer2
michael@0:  *         instance could not be obtained
michael@0:  */
michael@0: UNormalizationCheckResult
michael@0: Normalizer::quickCheck(const UnicodeString& source,
michael@0:                        UNormalizationMode mode, int32_t options,
michael@0:                        UErrorCode &status) {
michael@0:     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
michael@0:     if(U_FAILURE(status)) {
michael@0:         return UNORM_MAYBE;
michael@0:     }
michael@0:     if((options&UNORM_UNICODE_3_2)==0) {
michael@0:         return n2->quickCheck(source, status);
michael@0:     }
michael@0:     FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
michael@0:     return filtered.quickCheck(source, status);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Runs isNormalized() on source with the Normalizer2 instance for mode, or
michael@0:  * with a FilteredNormalizer2 around it when UNORM_UNICODE_3_2 is set in
michael@0:  * options.
michael@0:  *
michael@0:  * @param source  the string to test
michael@0:  * @param mode    the normalization mode
michael@0:  * @param options option bits; only UNORM_UNICODE_3_2 is examined here
michael@0:  * @param status  in/out error code
michael@0:  * @return the isNormalized() result, or FALSE if the Normalizer2 instance
michael@0:  *         could not be obtained
michael@0:  */
michael@0: UBool
michael@0: Normalizer::isNormalized(const UnicodeString& source,
michael@0:                          UNormalizationMode mode, int32_t options,
michael@0:                          UErrorCode &status) {
michael@0:     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
michael@0:     if(U_FAILURE(status)) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     if((options&UNORM_UNICODE_3_2)==0) {
michael@0:         return n2->isNormalized(source, status);
michael@0:     }
michael@0:     FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(status));
michael@0:     return filtered.isNormalized(source, status);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Builds result from left followed by right: left is copied into the
michael@0:  * destination and right is then added with the append() method of the
michael@0:  * Normalizer2 instance for mode (wrapped in a FilteredNormalizer2 when
michael@0:  * UNORM_UNICODE_3_2 is set in options).
michael@0:  *
michael@0:  * If left or right is bogus, or errorCode already indicates a failure,
michael@0:  * result is set to bogus, and errorCode is set to U_ILLEGAL_ARGUMENT_ERROR
michael@0:  * if it was still a success code.
michael@0:  *
michael@0:  * right and result may be the same object: the output is then produced in a
michael@0:  * temporary string and assigned to result only on success.
michael@0:  *
michael@0:  * @param left      the first string
michael@0:  * @param right     the second string
michael@0:  * @param result    receives the concatenation
michael@0:  * @param mode      the normalization mode
michael@0:  * @param options   option bits; only UNORM_UNICODE_3_2 is examined here
michael@0:  * @param errorCode in/out error code
michael@0:  * @return result
michael@0:  */
michael@0: UnicodeString & U_EXPORT2
michael@0: Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
michael@0:                         UnicodeString &result,
michael@0:                         UNormalizationMode mode, int32_t options,
michael@0:                         UErrorCode &errorCode) {
michael@0:     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
michael@0:         result.setToBogus();
michael@0:         if(U_SUCCESS(errorCode)) {
michael@0:             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         }
michael@0:         return result;
michael@0:     }
michael@0:     // Write into a scratch string when right and result are the same object,
michael@0:     // because right must stay intact until it has been appended.
michael@0:     UnicodeString scratch;
michael@0:     const UBool aliased=(&right==&result);
michael@0:     UnicodeString &dest=aliased ? scratch : result;
michael@0:     dest=left;
michael@0:     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
michael@0:     if(U_SUCCESS(errorCode)) {
michael@0:         if((options&UNORM_UNICODE_3_2)!=0) {
michael@0:             FilteredNormalizer2 filtered(*n2, *uniset_getUnicode32Instance(errorCode));
michael@0:             filtered.append(dest, right, errorCode);
michael@0:         } else {
michael@0:             n2->append(dest, right, errorCode);
michael@0:         }
michael@0:     }
michael@0:     if(aliased && U_SUCCESS(errorCode)) {
michael@0:         result=scratch;
michael@0:     }
michael@0:     return result;
michael@0: }
michael@0: 
michael@0: //-------------------------------------------------------------------------
michael@0: // Iteration API
michael@0: //-------------------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Returns the code point at the current position of the normalized output
michael@0:  * without advancing.  If the buffer has been consumed, the next segment is
michael@0:  * normalized first; DONE is returned when that yields nothing.
michael@0:  */
michael@0: UChar32 Normalizer::current() {
michael@0:     if(bufferPos>=buffer.length() && !nextNormalize()) {
michael@0:         return DONE;
michael@0:     }
michael@0:     return buffer.char32At(bufferPos);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the code point at the current position of the normalized output
michael@0:  * and moves the buffer position past it.  If the buffer has been consumed,
michael@0:  * the next segment is normalized first; DONE is returned when that yields
michael@0:  * nothing.
michael@0:  */
michael@0: UChar32 Normalizer::next() {
michael@0:     if(bufferPos>=buffer.length() && !nextNormalize()) {
michael@0:         return DONE;
michael@0:     }
michael@0:     const UChar32 codePoint=buffer.char32At(bufferPos);
michael@0:     // Advance by one or two code units, depending on the code point.
michael@0:     bufferPos+=U16_LENGTH(codePoint);
michael@0:     return codePoint;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the code point just before the current position of the normalized
michael@0:  * output and moves the buffer position back over it.  If the position is at
michael@0:  * the start of the buffer, the preceding segment is normalized first; DONE
michael@0:  * is returned when that yields nothing.
michael@0:  */
michael@0: UChar32 Normalizer::previous() {
michael@0:     if(bufferPos<=0 && !previousNormalize()) {
michael@0:         return DONE;
michael@0:     }
michael@0:     const UChar32 codePoint=buffer.char32At(bufferPos-1);
michael@0:     // Step back by one or two code units, depending on the code point.
michael@0:     bufferPos-=U16_LENGTH(codePoint);
michael@0:     return codePoint;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Moves the text iterator to its start, sets both input indexes to the
michael@0:  * position it reports, and empties the output buffer.
michael@0:  */
michael@0: void Normalizer::reset() {
michael@0:     const int32_t start=text->setToStart();
michael@0:     nextIndex=start;
michael@0:     currentIndex=start;
michael@0:     clearBuffer();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Positions the text iterator at index, sets both input indexes to the
michael@0:  * position the iterator reports afterwards, and empties the output buffer.
michael@0:  * Nothing is normalized here.
michael@0:  *
michael@0:  * @param index the requested position in the input text
michael@0:  */
michael@0: void
michael@0: Normalizer::setIndexOnly(int32_t index) {
michael@0:     text->setIndex(index);  // pins index
michael@0:     // Read the position back instead of trusting the caller's value.
michael@0:     const int32_t pinned=text->getIndex();
michael@0:     nextIndex=pinned;
michael@0:     currentIndex=pinned;
michael@0:     clearBuffer();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Resets the <tt>Normalizer</tt> to the beginning of the text and returns
michael@0:  * the result of next() from there.
michael@0:  */
michael@0: UChar32 Normalizer::first() {
michael@0:     reset();
michael@0:     const UChar32 firstChar=next();
michael@0:     return firstChar;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Moves the text iterator to its end, sets both input indexes to the
michael@0:  * position it reports, empties the output buffer and returns the result of
michael@0:  * previous() from there.
michael@0:  */
michael@0: UChar32 Normalizer::last() {
michael@0:     const int32_t end=text->setToEnd();
michael@0:     nextIndex=end;
michael@0:     currentIndex=end;
michael@0:     clearBuffer();
michael@0:     const UChar32 lastChar=previous();
michael@0:     return lastChar;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Retrieves the current iteration position in the <em>input</em> text that
michael@0:  * is being normalized.
michael@0:  * <p>
michael@0:  * While there are still unread characters in the output buffer this is
michael@0:  * currentIndex; once the buffer has been consumed it is nextIndex.
michael@0:  * <p>
michael@0:  * <b>Note:</b> This method reports a position in the <em>input</em>, while
michael@0:  * {@link #next} and {@link #previous} iterate through characters in the
michael@0:  * <em>output</em>.  There is therefore not necessarily a one-to-one
michael@0:  * correspondence between the characters returned by <tt>next</tt> and
michael@0:  * <tt>previous</tt> and the indices returned from this method.
michael@0:  */
michael@0: int32_t Normalizer::getIndex() const {
michael@0:     return bufferPos<buffer.length() ? currentIndex : nextIndex;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the start index of the input text, as reported by the underlying
michael@0:  * text iterator.
michael@0:  */
michael@0: int32_t Normalizer::startIndex() const {
michael@0:     const int32_t begin=text->startIndex();
michael@0:     return begin;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the end index of the input text, as reported by the underlying
michael@0:  * text iterator.
michael@0:  */
michael@0: int32_t Normalizer::endIndex() const {
michael@0:     const int32_t limit=text->endIndex();
michael@0:     return limit;
michael@0: }
michael@0: 
michael@0: //-------------------------------------------------------------------------
michael@0: // Property access methods
michael@0: //-------------------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Stores a new normalization mode and calls init() so that the Normalizer2
michael@0:  * implementation matches it.  The output buffer and the input indexes are
michael@0:  * not touched here.
michael@0:  *
michael@0:  * @param newMode the normalization mode to use from now on
michael@0:  */
michael@0: void
michael@0: Normalizer::setMode(UNormalizationMode newMode) {
michael@0:     fUMode=newMode;
michael@0:     // Re-select fNorm2 for the new mode.
michael@0:     init();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the normalization mode currently stored in this object.
michael@0:  */
michael@0: UNormalizationMode
michael@0: Normalizer::getUMode() const {
michael@0:     const UNormalizationMode currentMode=fUMode;
michael@0:     return currentMode;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Sets or clears the given option bits in fOptions and calls init() so that
michael@0:  * the Normalizer2 implementation matches the new options.
michael@0:  *
michael@0:  * @param option the option bit(s) to change
michael@0:  * @param value  non-zero to set the bits, zero to clear them
michael@0:  */
michael@0: void
michael@0: Normalizer::setOption(int32_t option, 
michael@0:                       UBool value) 
michael@0: {
michael@0:     fOptions = value ? (fOptions | option) : (fOptions & ~option);
michael@0:     // Re-select fNorm2 for the new option set.
michael@0:     init();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Tests whether any of the given option bits is set in fOptions.
michael@0:  *
michael@0:  * @param option the option bit(s) to test
michael@0:  * @return non-zero if at least one of the bits is set
michael@0:  */
michael@0: UBool
michael@0: Normalizer::getOption(int32_t option) const
michael@0: {
michael@0:     const int32_t masked=fOptions & option;
michael@0:     return masked != 0;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Sets the input text over which this <tt>Normalizer</tt> will iterate,
michael@0:  * using a new StringCharacterIterator over newText, and resets the
michael@0:  * iteration to the beginning of the input.
michael@0:  * Does nothing if status already indicates a failure.  If the iterator
michael@0:  * cannot be allocated, status is set to U_MEMORY_ALLOCATION_ERROR and the
michael@0:  * previous text is kept.
michael@0:  *
michael@0:  * @param newText the new input text
michael@0:  * @param status  in/out error code
michael@0:  */
michael@0: void
michael@0: Normalizer::setText(const UnicodeString& newText, 
michael@0:                     UErrorCode &status)
michael@0: {
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     CharacterIterator *replacement = new StringCharacterIterator(newText);
michael@0:     if (replacement != NULL) {
michael@0:         // Swap in the new iterator only once it is known to exist.
michael@0:         delete text;
michael@0:         text = replacement;
michael@0:         reset();
michael@0:     } else {
michael@0:         status = U_MEMORY_ALLOCATION_ERROR;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Sets the input text over which this <tt>Normalizer</tt> will iterate,
michael@0:  * using a clone of newText, and resets the iteration to the beginning of
michael@0:  * the input.
michael@0:  * Does nothing if status already indicates a failure.  If clone() returns
michael@0:  * NULL, status is set to U_MEMORY_ALLOCATION_ERROR and the previous text is
michael@0:  * kept.
michael@0:  *
michael@0:  * @param newText the iterator whose clone supplies the new input text
michael@0:  * @param status  in/out error code
michael@0:  */
michael@0: void
michael@0: Normalizer::setText(const CharacterIterator& newText, 
michael@0:                     UErrorCode &status) 
michael@0: {
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     CharacterIterator *replacement = newText.clone();
michael@0:     if (replacement != NULL) {
michael@0:         // Swap in the new iterator only once it is known to exist.
michael@0:         delete text;
michael@0:         text = replacement;
michael@0:         reset();
michael@0:     } else {
michael@0:         status = U_MEMORY_ALLOCATION_ERROR;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Sets the input text over which this <tt>Normalizer</tt> will iterate,
michael@0:  * using a new UCharCharacterIterator constructed from newText and length,
michael@0:  * and resets the iteration to the beginning of the input.
michael@0:  * Does nothing if status already indicates a failure.  If the iterator
michael@0:  * cannot be allocated, status is set to U_MEMORY_ALLOCATION_ERROR and the
michael@0:  * previous text is kept.
michael@0:  *
michael@0:  * @param newText pointer to the new UChar input text
michael@0:  * @param length  length value passed on to the UCharCharacterIterator
michael@0:  * @param status  in/out error code
michael@0:  */
michael@0: void
michael@0: Normalizer::setText(const UChar* newText,
michael@0:                     int32_t length,
michael@0:                     UErrorCode &status)
michael@0: {
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     CharacterIterator *replacement = new UCharCharacterIterator(newText, length);
michael@0:     if (replacement != NULL) {
michael@0:         // Swap in the new iterator only once it is known to exist.
michael@0:         delete text;
michael@0:         text = replacement;
michael@0:         reset();
michael@0:     } else {
michael@0:         status = U_MEMORY_ALLOCATION_ERROR;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Hands result to the underlying text iterator's getText(), which fills it
michael@0:  * with the text under iteration.
michael@0:  *
michael@0:  * @param result receives the text under iteration
michael@0:  */
michael@0: void
michael@0: Normalizer::getText(UnicodeString& result) {
michael@0:     text->getText(result);
michael@0: }
michael@0: 
michael@0: //-------------------------------------------------------------------------
michael@0: // Private utility methods
michael@0: //-------------------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Empties the output buffer and puts the buffer position back to zero.
michael@0:  */
michael@0: void Normalizer::clearBuffer() {
michael@0:     bufferPos=0;
michael@0:     buffer.remove();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Normalizes the input segment that starts at nextIndex into the buffer.
michael@0:  *
michael@0:  * The segment consists of the code point at nextIndex plus all following
michael@0:  * code points up to, but not including, the first one for which
michael@0:  * fNorm2->hasBoundaryBefore() is true.  Afterwards currentIndex is the start
michael@0:  * of the segment, nextIndex is the input position just after it, and
michael@0:  * bufferPos is zero.
michael@0:  *
michael@0:  * @return FALSE if there is no more input, if normalization reported an
michael@0:  *         error, or if the normalized segment is empty; non-zero otherwise
michael@0:  */
michael@0: UBool
michael@0: Normalizer::nextNormalize() {
michael@0:     clearBuffer();
michael@0:     currentIndex=nextIndex;
michael@0:     text->setIndex(nextIndex);
michael@0:     if(!text->hasNext()) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     // Always take the first code point so that the iteration makes progress.
michael@0:     UnicodeString segment(text->next32PostInc());
michael@0:     while(text->hasNext()) {
michael@0:         const UChar32 candidate=text->next32PostInc();
michael@0:         if(fNorm2->hasBoundaryBefore(candidate)) {
michael@0:             // The candidate starts the next segment: un-read it and stop.
michael@0:             text->move32(-1, CharacterIterator::kCurrent);
michael@0:             break;
michael@0:         }
michael@0:         segment.append(candidate);
michael@0:     }
michael@0:     nextIndex=text->getIndex();
michael@0:     UErrorCode errorCode=U_ZERO_ERROR;
michael@0:     fNorm2->normalize(segment, buffer, errorCode);
michael@0:     if(U_FAILURE(errorCode)) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     return !buffer.isEmpty();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Normalizes the input segment that ends at currentIndex into the buffer.
michael@0:  *
michael@0:  * Code points are collected backwards from currentIndex until one is found
michael@0:  * for which fNorm2->hasBoundaryBefore() is true (that code point is included)
michael@0:  * or the start of the input is reached.  Afterwards nextIndex is the old
michael@0:  * currentIndex, currentIndex is the start of the segment, and bufferPos is
michael@0:  * at the end of the buffer so that previous() can read backwards from there.
michael@0:  *
michael@0:  * @return FALSE if there is no preceding input, if normalization reported
michael@0:  *         an error, or if the normalized segment is empty; non-zero otherwise
michael@0:  */
michael@0: UBool
michael@0: Normalizer::previousNormalize() {
michael@0:     clearBuffer();
michael@0:     nextIndex=currentIndex;
michael@0:     text->setIndex(currentIndex);
michael@0:     if(!text->hasPrevious()) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     UnicodeString segment;
michael@0:     UBool atBoundary=FALSE;
michael@0:     while(!atBoundary && text->hasPrevious()) {
michael@0:         const UChar32 codePoint=text->previous32();
michael@0:         // Prepend, because the input is being read back to front.
michael@0:         segment.insert(0, codePoint);
michael@0:         atBoundary=fNorm2->hasBoundaryBefore(codePoint);
michael@0:     }
michael@0:     currentIndex=text->getIndex();
michael@0:     UErrorCode errorCode=U_ZERO_ERROR;
michael@0:     fNorm2->normalize(segment, buffer, errorCode);
michael@0:     bufferPos=buffer.length();
michael@0:     if(U_FAILURE(errorCode)) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     return !buffer.isEmpty();
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: #endif /* #if !UCONFIG_NO_NORMALIZATION */