michael@0: /*
michael@0: **********************************************************************
michael@0: *   Copyright (C) 2001-2011, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: **********************************************************************
michael@0: *   Date        Name        Description
michael@0: *   07/03/01    aliu        Creation.
michael@0: **********************************************************************
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if !UCONFIG_NO_TRANSLITERATION
michael@0: 
michael@0: #include "unicode/normalizer2.h"
michael@0: #include "unicode/utf16.h"
michael@0: #include "cstring.h"
michael@0: #include "nortrans.h"
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
michael@0: 
michael@0: /**
michael@0:  * Wraps a C string in a Transliterator::Token (as its pointer value)
michael@0:  * so that it can be passed along as a factory context.
michael@0:  */
michael@0: static inline Transliterator::Token cstrToken(const char *s) {
michael@0:     // pointerToken() is given a non-const void *, so drop the constness here.
michael@0:     void *opaque = const_cast<char *>(s);
michael@0:     return Transliterator::pointerToken(opaque);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * System registration hook.
michael@0:  * Registers one factory per "Any-XXX" normalization ID and then the
michael@0:  * special inverse relations between the short IDs.
michael@0:  */
michael@0: void NormalizationTransliterator::registerIDs() {
michael@0:     // Every context token points at a literal of the form "<name>\0<mode>":
michael@0:     // the name ends at the first NUL, and the byte after that NUL is the
michael@0:     // UNormalization2Mode which _create() reads back.
michael@0:     const Token tokenNFC  = cstrToken("nfc\0\0");
michael@0:     const Token tokenNFKC = cstrToken("nfkc\0\0");
michael@0:     const Token tokenNFD  = cstrToken("nfc\0\1");
michael@0:     const Token tokenNFKD = cstrToken("nfkc\0\1");
michael@0:     const Token tokenFCD  = cstrToken("nfc\0\2");
michael@0:     const Token tokenFCC  = cstrToken("nfc\0\3");
michael@0: 
michael@0:     // Factories, all sharing _create() and differing only in the token.
michael@0:     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), _create, tokenNFC);
michael@0:     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), _create, tokenNFKC);
michael@0:     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), _create, tokenNFD);
michael@0:     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), _create, tokenNFKD);
michael@0:     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), _create, tokenFCD);
michael@0:     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), _create, tokenFCC);
michael@0: 
michael@0:     // Special inverses.
michael@0:     // NOTE(review): the last argument presumably selects whether the relation
michael@0:     // is registered in both directions -- confirm against the declaration of
michael@0:     // Transliterator::_registerSpecialInverse().
michael@0:     Transliterator::_registerSpecialInverse(
michael@0:         UNICODE_STRING_SIMPLE("NFC"), UNICODE_STRING_SIMPLE("NFD"), TRUE);
michael@0:     Transliterator::_registerSpecialInverse(
michael@0:         UNICODE_STRING_SIMPLE("NFKC"), UNICODE_STRING_SIMPLE("NFKD"), TRUE);
michael@0:     Transliterator::_registerSpecialInverse(
michael@0:         UNICODE_STRING_SIMPLE("FCC"), UNICODE_STRING_SIMPLE("NFD"), FALSE);
michael@0:     Transliterator::_registerSpecialInverse(
michael@0:         UNICODE_STRING_SIMPLE("FCD"), UNICODE_STRING_SIMPLE("FCD"), FALSE);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Factory function used for the IDs registered in registerIDs().
michael@0:  *
michael@0:  * @param ID      the ID given to the new transliterator
michael@0:  * @param context token whose pointer refers to "<name>\0<mode>": a
michael@0:  *                NUL-terminated name passed to Normalizer2::getInstance(),
michael@0:  *                directly followed by one byte holding the
michael@0:  *                UNormalization2Mode
michael@0:  * @return a new NormalizationTransliterator, or NULL if
michael@0:  *         Normalizer2::getInstance() reported a failure
michael@0:  */
michael@0: Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
michael@0:                                                      Token context) {
michael@0:     const char *dataName = static_cast<const char *>(context.pointer);
michael@0: 
michael@0:     // The mode byte sits immediately behind the name's terminating NUL.
michael@0:     const char *terminator = uprv_strchr(dataName, 0);
michael@0:     UNormalization2Mode mode = static_cast<UNormalization2Mode>(terminator[1]);
michael@0: 
michael@0:     UErrorCode errorCode = U_ZERO_ERROR;
michael@0:     const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, dataName, mode, errorCode);
michael@0:     if(U_FAILURE(errorCode)) {
michael@0:         return NULL;
michael@0:     }
michael@0:     return new NormalizationTransliterator(ID, *norm2);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Constructs a normalization transliterator.
michael@0:  *
michael@0:  * @param id    the ID passed on to the Transliterator base class
michael@0:  * @param norm2 the normalizer that handleTransliterate() applies
michael@0:  */
michael@0: NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
michael@0:                                                          const Normalizer2 &norm2)
michael@0:         : Transliterator(id, 0),
michael@0:           fNorm2(norm2) {
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Destructor. The body is intentionally empty.
michael@0:  */
michael@0: NormalizationTransliterator::~NormalizationTransliterator() {}
michael@0: 
michael@0: /**
michael@0:  * Copy constructor.
michael@0:  * Copies the Transliterator base part and initializes fNorm2 from
michael@0:  * the other object's fNorm2.
michael@0:  */
michael@0: NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o)
michael@0:         : Transliterator(o),
michael@0:           fNorm2(o.fNorm2) {
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Transliterator API.
michael@0:  * @return a newly allocated copy of this object, made with the
michael@0:  *         copy constructor
michael@0:  */
michael@0: Transliterator* NormalizationTransliterator::clone(void) const {
michael@0:     Transliterator *copy = new NormalizationTransliterator(*this);
michael@0:     return copy;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Implements {@link Transliterator#handleTransliterate}.
michael@0:  *
michael@0:  * Normalizes the text between offsets.start and offsets.limit one segment
michael@0:  * at a time, replacing only segments whose normalized form differs, and
michael@0:  * then updates offsets.start, offsets.limit and offsets.contextLimit.
michael@0:  *
michael@0:  * @param text          the text that is normalized in place
michael@0:  * @param offsets       the range to process; updated before returning
michael@0:  * @param isIncremental if true, a final segment that ends exactly at the
michael@0:  *                      limit and whose last character has no boundary
michael@0:  *                      after it is left unprocessed
michael@0:  */
michael@0: void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
michael@0:                                                       UBool isIncremental) const {
michael@0:     // An empty input range needs no work.
michael@0:     if(offsets.start >= offsets.limit) {
michael@0:         return;
michael@0:     }
michael@0:     int32_t pos = offsets.start;  // current position, advances segment by segment
michael@0:     int32_t end = offsets.limit;  // end of the input range, shifts when text is replaced
michael@0: 
michael@0:     /*
michael@0:      * Normalize in chunks that are as short as possible, even in
michael@0:      * bulk mode, so that styled text is disrupted as little as possible.
michael@0:      * In incremental mode, a chunk that ends at offsets.limit
michael@0:      * must not be normalized.
michael@0:      *
michael@0:      * If the input text were known to be unstyled, a bulk mode
michael@0:      * normalization could instead look like this:
michael@0: 
michael@0:     UnicodeString input, normalized;
michael@0:     int32_t length = limit - start;
michael@0:     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
michael@0:     input.releaseBuffer(length);
michael@0: 
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     fNorm2.normalize(input, normalized, status);
michael@0: 
michael@0:     text.handleReplaceBetween(start, limit, normalized);
michael@0: 
michael@0:     int32_t delta = normalized.length() - length;
michael@0:     offsets.contextLimit += delta;
michael@0:     offsets.limit += delta;
michael@0:     offsets.start = limit + delta;
michael@0: 
michael@0:      */
michael@0:     UErrorCode errorCode = U_ZERO_ERROR;
michael@0:     UnicodeString segment;
michael@0:     UnicodeString normalized;
michael@0:     // Invariant at the top of the outer loop: c is the code point at pos.
michael@0:     UChar32 c = text.char32At(pos);
michael@0:     for(;;) {
michael@0:         const int32_t segmentStart = pos;
michael@0:         segment.remove();
michael@0:         // Gather one segment. At least one code point is always consumed
michael@0:         // so that the outer loop makes progress; after that, keep going
michael@0:         // until the end of the range or a code point with a boundary before it.
michael@0:         for(;;) {
michael@0:             segment.append(c);
michael@0:             pos += U16_LENGTH(c);
michael@0:             if(pos >= end) {
michael@0:                 break;
michael@0:             }
michael@0:             c = text.char32At(pos);
michael@0:             if(fNorm2.hasBoundaryBefore(c)) {
michael@0:                 break;
michael@0:             }
michael@0:         }
michael@0:         if(pos == end && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
michael@0:             // Incremental mode at the input limit: characters that arrive
michael@0:             // later could still change how this segment normalizes, so
michael@0:             // rewind to its start and leave it for the next call.
michael@0:             pos = segmentStart;
michael@0:             break;
michael@0:         }
michael@0:         fNorm2.normalize(segment, normalized, errorCode);
michael@0:         if(U_FAILURE(errorCode)) {
michael@0:             break;
michael@0:         }
michael@0:         if(segment != normalized) {
michael@0:             // Swap the segment for its normalized form ...
michael@0:             text.handleReplaceBetween(segmentStart, pos, normalized);
michael@0: 
michael@0:             // ... and shift the indexes by the resulting change in length.
michael@0:             const int32_t delta = normalized.length() - (pos - segmentStart);
michael@0:             pos += delta;
michael@0:             end += delta;
michael@0:         }
michael@0:         if(pos >= end) {
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     // Publish the new position; contextLimit moves by the same amount as limit.
michael@0:     offsets.start = pos;
michael@0:     offsets.contextLimit += end - offsets.limit;
michael@0:     offsets.limit = end;
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */