1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/normlzr.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,521 @@ 1.4 +/* 1.5 + ************************************************************************* 1.6 + * COPYRIGHT: 1.7 + * Copyright (c) 1996-2012, International Business Machines Corporation and 1.8 + * others. All Rights Reserved. 1.9 + ************************************************************************* 1.10 + */ 1.11 + 1.12 +#include "unicode/utypes.h" 1.13 + 1.14 +#if !UCONFIG_NO_NORMALIZATION 1.15 + 1.16 +#include "unicode/uniset.h" 1.17 +#include "unicode/unistr.h" 1.18 +#include "unicode/chariter.h" 1.19 +#include "unicode/schriter.h" 1.20 +#include "unicode/uchriter.h" 1.21 +#include "unicode/normlzr.h" 1.22 +#include "unicode/utf16.h" 1.23 +#include "cmemory.h" 1.24 +#include "normalizer2impl.h" 1.25 +#include "uprops.h" // for uniset_getUnicode32Instance() 1.26 + 1.27 +U_NAMESPACE_BEGIN 1.28 + 1.29 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) 1.30 + 1.31 +//------------------------------------------------------------------------- 1.32 +// Constructors and other boilerplate 1.33 +//------------------------------------------------------------------------- 1.34 + 1.35 +Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : 1.36 + UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 1.37 + text(new StringCharacterIterator(str)), 1.38 + currentIndex(0), nextIndex(0), 1.39 + buffer(), bufferPos(0) 1.40 +{ 1.41 + init(); 1.42 +} 1.43 + 1.44 +Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) : 1.45 + UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 1.46 + text(new UCharCharacterIterator(str, length)), 1.47 + currentIndex(0), nextIndex(0), 1.48 + buffer(), bufferPos(0) 1.49 +{ 1.50 + init(); 1.51 +} 1.52 + 1.53 +Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : 1.54 + UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), 1.55 + text(iter.clone()), 1.56 + currentIndex(0), nextIndex(0), 1.57 + buffer(), bufferPos(0) 1.58 +{ 1.59 + init(); 1.60 +} 1.61 + 1.62 +Normalizer::Normalizer(const Normalizer ©) : 1.63 + UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), 1.64 + text(copy.text->clone()), 1.65 + currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), 1.66 + buffer(copy.buffer), bufferPos(copy.bufferPos) 1.67 +{ 1.68 + init(); 1.69 +} 1.70 + 1.71 +void 1.72 +Normalizer::init() { 1.73 + UErrorCode errorCode=U_ZERO_ERROR; 1.74 + fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); 1.75 + if(fOptions&UNORM_UNICODE_3_2) { 1.76 + delete fFilteredNorm2; 1.77 + fNorm2=fFilteredNorm2= 1.78 + new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); 1.79 + } 1.80 + if(U_FAILURE(errorCode)) { 1.81 + errorCode=U_ZERO_ERROR; 1.82 + fNorm2=Normalizer2Factory::getNoopInstance(errorCode); 1.83 + } 1.84 +} 1.85 + 1.86 +Normalizer::~Normalizer() 1.87 +{ 1.88 + delete fFilteredNorm2; 1.89 + delete text; 1.90 +} 1.91 + 1.92 +Normalizer* 1.93 +Normalizer::clone() const 1.94 +{ 1.95 + return new Normalizer(*this); 1.96 +} 1.97 + 1.98 +/** 1.99 + * Generates a hash code for this iterator. 1.100 + */ 1.101 +int32_t Normalizer::hashCode() const 1.102 +{ 1.103 + return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; 1.104 +} 1.105 + 1.106 +UBool Normalizer::operator==(const Normalizer& that) const 1.107 +{ 1.108 + return 1.109 + this==&that || 1.110 + (fUMode==that.fUMode && 1.111 + fOptions==that.fOptions && 1.112 + *text==*that.text && 1.113 + buffer==that.buffer && 1.114 + bufferPos==that.bufferPos && 1.115 + nextIndex==that.nextIndex); 1.116 +} 1.117 + 1.118 +//------------------------------------------------------------------------- 1.119 +// Static utility methods 1.120 +//------------------------------------------------------------------------- 1.121 + 1.122 +void U_EXPORT2 1.123 +Normalizer::normalize(const UnicodeString& source, 1.124 + UNormalizationMode mode, int32_t options, 1.125 + UnicodeString& result, 1.126 + UErrorCode &status) { 1.127 + if(source.isBogus() || U_FAILURE(status)) { 1.128 + result.setToBogus(); 1.129 + if(U_SUCCESS(status)) { 1.130 + status=U_ILLEGAL_ARGUMENT_ERROR; 1.131 + } 1.132 + } else { 1.133 + UnicodeString localDest; 1.134 + UnicodeString *dest; 1.135 + 1.136 + if(&source!=&result) { 1.137 + dest=&result; 1.138 + } else { 1.139 + // the source and result strings are the same object, use a temporary one 1.140 + dest=&localDest; 1.141 + } 1.142 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 1.143 + if(U_SUCCESS(status)) { 1.144 + if(options&UNORM_UNICODE_3_2) { 1.145 + FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 1.146 + normalize(source, *dest, status); 1.147 + } else { 1.148 + n2->normalize(source, *dest, status); 1.149 + } 1.150 + } 1.151 + if(dest==&localDest && U_SUCCESS(status)) { 1.152 + result=*dest; 1.153 + } 1.154 + } 1.155 +} 1.156 + 1.157 +void U_EXPORT2 1.158 +Normalizer::compose(const UnicodeString& source, 1.159 + UBool compat, int32_t options, 1.160 + UnicodeString& result, 1.161 + UErrorCode &status) { 1.162 + normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); 1.163 +} 1.164 + 1.165 +void U_EXPORT2 1.166 +Normalizer::decompose(const UnicodeString& source, 1.167 + UBool compat, int32_t options, 1.168 + UnicodeString& result, 1.169 + UErrorCode &status) { 1.170 + normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); 1.171 +} 1.172 + 1.173 +UNormalizationCheckResult 1.174 +Normalizer::quickCheck(const UnicodeString& source, 1.175 + UNormalizationMode mode, int32_t options, 1.176 + UErrorCode &status) { 1.177 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 1.178 + if(U_SUCCESS(status)) { 1.179 + if(options&UNORM_UNICODE_3_2) { 1.180 + return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 1.181 + quickCheck(source, status); 1.182 + } else { 1.183 + return n2->quickCheck(source, status); 1.184 + } 1.185 + } else { 1.186 + return UNORM_MAYBE; 1.187 + } 1.188 +} 1.189 + 1.190 +UBool 1.191 +Normalizer::isNormalized(const UnicodeString& source, 1.192 + UNormalizationMode mode, int32_t options, 1.193 + UErrorCode &status) { 1.194 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); 1.195 + if(U_SUCCESS(status)) { 1.196 + if(options&UNORM_UNICODE_3_2) { 1.197 + return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). 1.198 + isNormalized(source, status); 1.199 + } else { 1.200 + return n2->isNormalized(source, status); 1.201 + } 1.202 + } else { 1.203 + return FALSE; 1.204 + } 1.205 +} 1.206 + 1.207 +UnicodeString & U_EXPORT2 1.208 +Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, 1.209 + UnicodeString &result, 1.210 + UNormalizationMode mode, int32_t options, 1.211 + UErrorCode &errorCode) { 1.212 + if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { 1.213 + result.setToBogus(); 1.214 + if(U_SUCCESS(errorCode)) { 1.215 + errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.216 + } 1.217 + } else { 1.218 + UnicodeString localDest; 1.219 + UnicodeString *dest; 1.220 + 1.221 + if(&right!=&result) { 1.222 + dest=&result; 1.223 + } else { 1.224 + // the right and result strings are the same object, use a temporary one 1.225 + dest=&localDest; 1.226 + } 1.227 + *dest=left; 1.228 + const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); 1.229 + if(U_SUCCESS(errorCode)) { 1.230 + if(options&UNORM_UNICODE_3_2) { 1.231 + FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). 1.232 + append(*dest, right, errorCode); 1.233 + } else { 1.234 + n2->append(*dest, right, errorCode); 1.235 + } 1.236 + } 1.237 + if(dest==&localDest && U_SUCCESS(errorCode)) { 1.238 + result=*dest; 1.239 + } 1.240 + } 1.241 + return result; 1.242 +} 1.243 + 1.244 +//------------------------------------------------------------------------- 1.245 +// Iteration API 1.246 +//------------------------------------------------------------------------- 1.247 + 1.248 +/** 1.249 + * Return the current character in the normalized text. 1.250 + */ 1.251 +UChar32 Normalizer::current() { 1.252 + if(bufferPos<buffer.length() || nextNormalize()) { 1.253 + return buffer.char32At(bufferPos); 1.254 + } else { 1.255 + return DONE; 1.256 + } 1.257 +} 1.258 + 1.259 +/** 1.260 + * Return the next character in the normalized text and advance 1.261 + * the iteration position by one. If the end 1.262 + * of the text has already been reached, {@link #DONE} is returned. 1.263 + */ 1.264 +UChar32 Normalizer::next() { 1.265 + if(bufferPos<buffer.length() || nextNormalize()) { 1.266 + UChar32 c=buffer.char32At(bufferPos); 1.267 + bufferPos+=U16_LENGTH(c); 1.268 + return c; 1.269 + } else { 1.270 + return DONE; 1.271 + } 1.272 +} 1.273 + 1.274 +/** 1.275 + * Return the previous character in the normalized text and decrement 1.276 + * the iteration position by one. If the beginning 1.277 + * of the text has already been reached, {@link #DONE} is returned. 1.278 + */ 1.279 +UChar32 Normalizer::previous() { 1.280 + if(bufferPos>0 || previousNormalize()) { 1.281 + UChar32 c=buffer.char32At(bufferPos-1); 1.282 + bufferPos-=U16_LENGTH(c); 1.283 + return c; 1.284 + } else { 1.285 + return DONE; 1.286 + } 1.287 +} 1.288 + 1.289 +void Normalizer::reset() { 1.290 + currentIndex=nextIndex=text->setToStart(); 1.291 + clearBuffer(); 1.292 +} 1.293 + 1.294 +void 1.295 +Normalizer::setIndexOnly(int32_t index) { 1.296 + text->setIndex(index); // pins index 1.297 + currentIndex=nextIndex=text->getIndex(); 1.298 + clearBuffer(); 1.299 +} 1.300 + 1.301 +/** 1.302 + * Return the first character in the normalized text. This resets 1.303 + * the <tt>Normalizer's</tt> position to the beginning of the text. 1.304 + */ 1.305 +UChar32 Normalizer::first() { 1.306 + reset(); 1.307 + return next(); 1.308 +} 1.309 + 1.310 +/** 1.311 + * Return the last character in the normalized text. This resets 1.312 + * the <tt>Normalizer's</tt> position to be just before the 1.313 + * the input text corresponding to that normalized character. 1.314 + */ 1.315 +UChar32 Normalizer::last() { 1.316 + currentIndex=nextIndex=text->setToEnd(); 1.317 + clearBuffer(); 1.318 + return previous(); 1.319 +} 1.320 + 1.321 +/** 1.322 + * Retrieve the current iteration position in the input text that is 1.323 + * being normalized. This method is useful in applications such as 1.324 + * searching, where you need to be able to determine the position in 1.325 + * the input text that corresponds to a given normalized output character. 1.326 + * <p> 1.327 + * <b>Note:</b> This method sets the position in the <em>input</em>, while 1.328 + * {@link #next} and {@link #previous} iterate through characters in the 1.329 + * <em>output</em>. This means that there is not necessarily a one-to-one 1.330 + * correspondence between characters returned by <tt>next</tt> and 1.331 + * <tt>previous</tt> and the indices passed to and returned from 1.332 + * <tt>setIndex</tt> and {@link #getIndex}. 1.333 + * 1.334 + */ 1.335 +int32_t Normalizer::getIndex() const { 1.336 + if(bufferPos<buffer.length()) { 1.337 + return currentIndex; 1.338 + } else { 1.339 + return nextIndex; 1.340 + } 1.341 +} 1.342 + 1.343 +/** 1.344 + * Retrieve the index of the start of the input text. This is the begin index 1.345 + * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> 1.346 + * over which this <tt>Normalizer</tt> is iterating 1.347 + */ 1.348 +int32_t Normalizer::startIndex() const { 1.349 + return text->startIndex(); 1.350 +} 1.351 + 1.352 +/** 1.353 + * Retrieve the index of the end of the input text. This is the end index 1.354 + * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 1.355 + * over which this <tt>Normalizer</tt> is iterating 1.356 + */ 1.357 +int32_t Normalizer::endIndex() const { 1.358 + return text->endIndex(); 1.359 +} 1.360 + 1.361 +//------------------------------------------------------------------------- 1.362 +// Property access methods 1.363 +//------------------------------------------------------------------------- 1.364 + 1.365 +void 1.366 +Normalizer::setMode(UNormalizationMode newMode) 1.367 +{ 1.368 + fUMode = newMode; 1.369 + init(); 1.370 +} 1.371 + 1.372 +UNormalizationMode 1.373 +Normalizer::getUMode() const 1.374 +{ 1.375 + return fUMode; 1.376 +} 1.377 + 1.378 +void 1.379 +Normalizer::setOption(int32_t option, 1.380 + UBool value) 1.381 +{ 1.382 + if (value) { 1.383 + fOptions |= option; 1.384 + } else { 1.385 + fOptions &= (~option); 1.386 + } 1.387 + init(); 1.388 +} 1.389 + 1.390 +UBool 1.391 +Normalizer::getOption(int32_t option) const 1.392 +{ 1.393 + return (fOptions & option) != 0; 1.394 +} 1.395 + 1.396 +/** 1.397 + * Set the input text over which this <tt>Normalizer</tt> will iterate. 1.398 + * The iteration position is set to the beginning of the input text. 1.399 + */ 1.400 +void 1.401 +Normalizer::setText(const UnicodeString& newText, 1.402 + UErrorCode &status) 1.403 +{ 1.404 + if (U_FAILURE(status)) { 1.405 + return; 1.406 + } 1.407 + CharacterIterator *newIter = new StringCharacterIterator(newText); 1.408 + if (newIter == NULL) { 1.409 + status = U_MEMORY_ALLOCATION_ERROR; 1.410 + return; 1.411 + } 1.412 + delete text; 1.413 + text = newIter; 1.414 + reset(); 1.415 +} 1.416 + 1.417 +/** 1.418 + * Set the input text over which this <tt>Normalizer</tt> will iterate. 1.419 + * The iteration position is set to the beginning of the string. 1.420 + */ 1.421 +void 1.422 +Normalizer::setText(const CharacterIterator& newText, 1.423 + UErrorCode &status) 1.424 +{ 1.425 + if (U_FAILURE(status)) { 1.426 + return; 1.427 + } 1.428 + CharacterIterator *newIter = newText.clone(); 1.429 + if (newIter == NULL) { 1.430 + status = U_MEMORY_ALLOCATION_ERROR; 1.431 + return; 1.432 + } 1.433 + delete text; 1.434 + text = newIter; 1.435 + reset(); 1.436 +} 1.437 + 1.438 +void 1.439 +Normalizer::setText(const UChar* newText, 1.440 + int32_t length, 1.441 + UErrorCode &status) 1.442 +{ 1.443 + if (U_FAILURE(status)) { 1.444 + return; 1.445 + } 1.446 + CharacterIterator *newIter = new UCharCharacterIterator(newText, length); 1.447 + if (newIter == NULL) { 1.448 + status = U_MEMORY_ALLOCATION_ERROR; 1.449 + return; 1.450 + } 1.451 + delete text; 1.452 + text = newIter; 1.453 + reset(); 1.454 +} 1.455 + 1.456 +/** 1.457 + * Copies the text under iteration into the UnicodeString referred to by "result". 1.458 + * @param result Receives a copy of the text under iteration. 1.459 + */ 1.460 +void 1.461 +Normalizer::getText(UnicodeString& result) 1.462 +{ 1.463 + text->getText(result); 1.464 +} 1.465 + 1.466 +//------------------------------------------------------------------------- 1.467 +// Private utility methods 1.468 +//------------------------------------------------------------------------- 1.469 + 1.470 +void Normalizer::clearBuffer() { 1.471 + buffer.remove(); 1.472 + bufferPos=0; 1.473 +} 1.474 + 1.475 +UBool 1.476 +Normalizer::nextNormalize() { 1.477 + clearBuffer(); 1.478 + currentIndex=nextIndex; 1.479 + text->setIndex(nextIndex); 1.480 + if(!text->hasNext()) { 1.481 + return FALSE; 1.482 + } 1.483 + // Skip at least one character so we make progress. 1.484 + UnicodeString segment(text->next32PostInc()); 1.485 + while(text->hasNext()) { 1.486 + UChar32 c; 1.487 + if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { 1.488 + text->move32(-1, CharacterIterator::kCurrent); 1.489 + break; 1.490 + } 1.491 + segment.append(c); 1.492 + } 1.493 + nextIndex=text->getIndex(); 1.494 + UErrorCode errorCode=U_ZERO_ERROR; 1.495 + fNorm2->normalize(segment, buffer, errorCode); 1.496 + return U_SUCCESS(errorCode) && !buffer.isEmpty(); 1.497 +} 1.498 + 1.499 +UBool 1.500 +Normalizer::previousNormalize() { 1.501 + clearBuffer(); 1.502 + nextIndex=currentIndex; 1.503 + text->setIndex(currentIndex); 1.504 + if(!text->hasPrevious()) { 1.505 + return FALSE; 1.506 + } 1.507 + UnicodeString segment; 1.508 + while(text->hasPrevious()) { 1.509 + UChar32 c=text->previous32(); 1.510 + segment.insert(0, c); 1.511 + if(fNorm2->hasBoundaryBefore(c)) { 1.512 + break; 1.513 + } 1.514 + } 1.515 + currentIndex=text->getIndex(); 1.516 + UErrorCode errorCode=U_ZERO_ERROR; 1.517 + fNorm2->normalize(segment, buffer, errorCode); 1.518 + bufferPos=buffer.length(); 1.519 + return U_SUCCESS(errorCode) && !buffer.isEmpty(); 1.520 +} 1.521 + 1.522 +U_NAMESPACE_END 1.523 + 1.524 +#endif /* #if !UCONFIG_NO_NORMALIZATION */