1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/nortrans.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,176 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2001-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 07/03/01 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/normalizer2.h" 1.19 +#include "unicode/utf16.h" 1.20 +#include "cstring.h" 1.21 +#include "nortrans.h" 1.22 + 1.23 +U_NAMESPACE_BEGIN 1.24 + 1.25 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator) 1.26 + 1.27 +static inline Transliterator::Token cstrToken(const char *s) { 1.28 + return Transliterator::pointerToken((void *)s); 1.29 +} 1.30 + 1.31 +/** 1.32 + * System registration hook. 1.33 + */ 1.34 +void NormalizationTransliterator::registerIDs() { 1.35 + // In the Token, the byte after the NUL is the UNormalization2Mode. 1.36 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"), 1.37 + _create, cstrToken("nfc\0\0")); 1.38 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"), 1.39 + _create, cstrToken("nfkc\0\0")); 1.40 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"), 1.41 + _create, cstrToken("nfc\0\1")); 1.42 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"), 1.43 + _create, cstrToken("nfkc\0\1")); 1.44 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"), 1.45 + _create, cstrToken("nfc\0\2")); 1.46 + Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"), 1.47 + _create, cstrToken("nfc\0\3")); 1.48 + Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"), 1.49 + UNICODE_STRING_SIMPLE("NFD"), TRUE); 1.50 + Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"), 1.51 + UNICODE_STRING_SIMPLE("NFKD"), TRUE); 1.52 + Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"), 1.53 + UNICODE_STRING_SIMPLE("NFD"), FALSE); 1.54 + Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"), 1.55 + UNICODE_STRING_SIMPLE("FCD"), FALSE); 1.56 +} 1.57 + 1.58 +/** 1.59 + * Factory methods 1.60 + */ 1.61 +Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID, 1.62 + Token context) { 1.63 + const char *name = (const char *)context.pointer; 1.64 + UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1]; 1.65 + UErrorCode errorCode = U_ZERO_ERROR; 1.66 + const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode); 1.67 + if(U_SUCCESS(errorCode)) { 1.68 + return new NormalizationTransliterator(ID, *norm2); 1.69 + } else { 1.70 + return NULL; 1.71 + } 1.72 +} 1.73 + 1.74 +/** 1.75 + * Constructs a transliterator. 1.76 + */ 1.77 +NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id, 1.78 + const Normalizer2 &norm2) : 1.79 + Transliterator(id, 0), fNorm2(norm2) {} 1.80 + 1.81 +/** 1.82 + * Destructor. 1.83 + */ 1.84 +NormalizationTransliterator::~NormalizationTransliterator() { 1.85 +} 1.86 + 1.87 +/** 1.88 + * Copy constructor. 1.89 + */ 1.90 +NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) : 1.91 + Transliterator(o), fNorm2(o.fNorm2) {} 1.92 + 1.93 +/** 1.94 + * Transliterator API. 1.95 + */ 1.96 +Transliterator* NormalizationTransliterator::clone(void) const { 1.97 + return new NormalizationTransliterator(*this); 1.98 +} 1.99 + 1.100 +/** 1.101 + * Implements {@link Transliterator#handleTransliterate}. 1.102 + */ 1.103 +void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 1.104 + UBool isIncremental) const { 1.105 + // start and limit of the input range 1.106 + int32_t start = offsets.start; 1.107 + int32_t limit = offsets.limit; 1.108 + if(start >= limit) { 1.109 + return; 1.110 + } 1.111 + 1.112 + /* 1.113 + * Normalize as short chunks at a time as possible even in 1.114 + * bulk mode, so that styled text is minimally disrupted. 1.115 + * In incremental mode, a chunk that ends with offsets.limit 1.116 + * must not be normalized. 1.117 + * 1.118 + * If it was known that the input text is not styled, then 1.119 + * a bulk mode normalization could look like this: 1.120 + 1.121 + UnicodeString input, normalized; 1.122 + int32_t length = limit - start; 1.123 + _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); 1.124 + input.releaseBuffer(length); 1.125 + 1.126 + UErrorCode status = U_ZERO_ERROR; 1.127 + fNorm2.normalize(input, normalized, status); 1.128 + 1.129 + text.handleReplaceBetween(start, limit, normalized); 1.130 + 1.131 + int32_t delta = normalized.length() - length; 1.132 + offsets.contextLimit += delta; 1.133 + offsets.limit += delta; 1.134 + offsets.start = limit + delta; 1.135 + 1.136 + */ 1.137 + UErrorCode errorCode = U_ZERO_ERROR; 1.138 + UnicodeString segment; 1.139 + UnicodeString normalized; 1.140 + UChar32 c = text.char32At(start); 1.141 + do { 1.142 + int32_t prev = start; 1.143 + // Skip at least one character so we make progress. 1.144 + // c holds the character at start. 1.145 + segment.remove(); 1.146 + do { 1.147 + segment.append(c); 1.148 + start += U16_LENGTH(c); 1.149 + } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))); 1.150 + if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) { 1.151 + // stop in incremental mode when we reach the input limit 1.152 + // in case there are additional characters that could change the 1.153 + // normalization result 1.154 + start=prev; 1.155 + break; 1.156 + } 1.157 + fNorm2.normalize(segment, normalized, errorCode); 1.158 + if(U_FAILURE(errorCode)) { 1.159 + break; 1.160 + } 1.161 + if(segment != normalized) { 1.162 + // replace the input chunk with its normalized form 1.163 + text.handleReplaceBetween(prev, start, normalized); 1.164 + 1.165 + // update all necessary indexes accordingly 1.166 + int32_t delta = normalized.length() - (start - prev); 1.167 + start += delta; 1.168 + limit += delta; 1.169 + } 1.170 + } while(start < limit); 1.171 + 1.172 + offsets.start = start; 1.173 + offsets.contextLimit += limit - offsets.limit; 1.174 + offsets.limit = limit; 1.175 +} 1.176 + 1.177 +U_NAMESPACE_END 1.178 + 1.179 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */