intl/icu/source/i18n/nortrans.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2001-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 07/03/01 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "unicode/normalizer2.h"
michael@0 16 #include "unicode/utf16.h"
michael@0 17 #include "cstring.h"
michael@0 18 #include "nortrans.h"
michael@0 19
michael@0 20 U_NAMESPACE_BEGIN
michael@0 21
michael@0 22 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
michael@0 23
michael@0 24 static inline Transliterator::Token cstrToken(const char *s) {
michael@0 25 return Transliterator::pointerToken((void *)s);
michael@0 26 }
michael@0 27
michael@0 28 /**
michael@0 29 * System registration hook.
michael@0 30 */
michael@0 31 void NormalizationTransliterator::registerIDs() {
michael@0 32 // In the Token, the byte after the NUL is the UNormalization2Mode.
michael@0 33 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
michael@0 34 _create, cstrToken("nfc\0\0"));
michael@0 35 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
michael@0 36 _create, cstrToken("nfkc\0\0"));
michael@0 37 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
michael@0 38 _create, cstrToken("nfc\0\1"));
michael@0 39 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
michael@0 40 _create, cstrToken("nfkc\0\1"));
michael@0 41 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
michael@0 42 _create, cstrToken("nfc\0\2"));
michael@0 43 Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
michael@0 44 _create, cstrToken("nfc\0\3"));
michael@0 45 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
michael@0 46 UNICODE_STRING_SIMPLE("NFD"), TRUE);
michael@0 47 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
michael@0 48 UNICODE_STRING_SIMPLE("NFKD"), TRUE);
michael@0 49 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
michael@0 50 UNICODE_STRING_SIMPLE("NFD"), FALSE);
michael@0 51 Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
michael@0 52 UNICODE_STRING_SIMPLE("FCD"), FALSE);
michael@0 53 }
michael@0 54
michael@0 55 /**
michael@0 56 * Factory methods
michael@0 57 */
michael@0 58 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
michael@0 59 Token context) {
michael@0 60 const char *name = (const char *)context.pointer;
michael@0 61 UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
michael@0 62 UErrorCode errorCode = U_ZERO_ERROR;
michael@0 63 const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
michael@0 64 if(U_SUCCESS(errorCode)) {
michael@0 65 return new NormalizationTransliterator(ID, *norm2);
michael@0 66 } else {
michael@0 67 return NULL;
michael@0 68 }
michael@0 69 }
michael@0 70
michael@0 71 /**
michael@0 72 * Constructs a transliterator.
michael@0 73 */
michael@0 74 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
michael@0 75 const Normalizer2 &norm2) :
michael@0 76 Transliterator(id, 0), fNorm2(norm2) {}
michael@0 77
michael@0 78 /**
michael@0 79 * Destructor.
michael@0 80 */
michael@0 81 NormalizationTransliterator::~NormalizationTransliterator() {
michael@0 82 }
michael@0 83
michael@0 84 /**
michael@0 85 * Copy constructor.
michael@0 86 */
michael@0 87 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
michael@0 88 Transliterator(o), fNorm2(o.fNorm2) {}
michael@0 89
michael@0 90 /**
michael@0 91 * Transliterator API.
michael@0 92 */
michael@0 93 Transliterator* NormalizationTransliterator::clone(void) const {
michael@0 94 return new NormalizationTransliterator(*this);
michael@0 95 }
michael@0 96
michael@0 97 /**
michael@0 98 * Implements {@link Transliterator#handleTransliterate}.
michael@0 99 */
michael@0 100 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
michael@0 101 UBool isIncremental) const {
michael@0 102 // start and limit of the input range
michael@0 103 int32_t start = offsets.start;
michael@0 104 int32_t limit = offsets.limit;
michael@0 105 if(start >= limit) {
michael@0 106 return;
michael@0 107 }
michael@0 108
michael@0 109 /*
michael@0 110 * Normalize as short chunks at a time as possible even in
michael@0 111 * bulk mode, so that styled text is minimally disrupted.
michael@0 112 * In incremental mode, a chunk that ends with offsets.limit
michael@0 113 * must not be normalized.
michael@0 114 *
michael@0 115 * If it was known that the input text is not styled, then
michael@0 116 * a bulk mode normalization could look like this:
michael@0 117
michael@0 118 UnicodeString input, normalized;
michael@0 119 int32_t length = limit - start;
michael@0 120 _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
michael@0 121 input.releaseBuffer(length);
michael@0 122
michael@0 123 UErrorCode status = U_ZERO_ERROR;
michael@0 124 fNorm2.normalize(input, normalized, status);
michael@0 125
michael@0 126 text.handleReplaceBetween(start, limit, normalized);
michael@0 127
michael@0 128 int32_t delta = normalized.length() - length;
michael@0 129 offsets.contextLimit += delta;
michael@0 130 offsets.limit += delta;
michael@0 131 offsets.start = limit + delta;
michael@0 132
michael@0 133 */
michael@0 134 UErrorCode errorCode = U_ZERO_ERROR;
michael@0 135 UnicodeString segment;
michael@0 136 UnicodeString normalized;
michael@0 137 UChar32 c = text.char32At(start);
michael@0 138 do {
michael@0 139 int32_t prev = start;
michael@0 140 // Skip at least one character so we make progress.
michael@0 141 // c holds the character at start.
michael@0 142 segment.remove();
michael@0 143 do {
michael@0 144 segment.append(c);
michael@0 145 start += U16_LENGTH(c);
michael@0 146 } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
michael@0 147 if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
michael@0 148 // stop in incremental mode when we reach the input limit
michael@0 149 // in case there are additional characters that could change the
michael@0 150 // normalization result
michael@0 151 start=prev;
michael@0 152 break;
michael@0 153 }
michael@0 154 fNorm2.normalize(segment, normalized, errorCode);
michael@0 155 if(U_FAILURE(errorCode)) {
michael@0 156 break;
michael@0 157 }
michael@0 158 if(segment != normalized) {
michael@0 159 // replace the input chunk with its normalized form
michael@0 160 text.handleReplaceBetween(prev, start, normalized);
michael@0 161
michael@0 162 // update all necessary indexes accordingly
michael@0 163 int32_t delta = normalized.length() - (start - prev);
michael@0 164 start += delta;
michael@0 165 limit += delta;
michael@0 166 }
michael@0 167 } while(start < limit);
michael@0 168
michael@0 169 offsets.start = start;
michael@0 170 offsets.contextLimit += limit - offsets.limit;
michael@0 171 offsets.limit = limit;
michael@0 172 }
michael@0 173
michael@0 174 U_NAMESPACE_END
michael@0 175
michael@0 176 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

mercurial