intl/icu/source/common/normlzr.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/normlzr.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,521 @@
     1.4 +/*
     1.5 + *************************************************************************
     1.6 + * COPYRIGHT: 
     1.7 + * Copyright (c) 1996-2012, International Business Machines Corporation and
     1.8 + * others. All Rights Reserved.
     1.9 + *************************************************************************
    1.10 + */
    1.11 +
    1.12 +#include "unicode/utypes.h"
    1.13 +
    1.14 +#if !UCONFIG_NO_NORMALIZATION
    1.15 +
    1.16 +#include "unicode/uniset.h"
    1.17 +#include "unicode/unistr.h"
    1.18 +#include "unicode/chariter.h"
    1.19 +#include "unicode/schriter.h"
    1.20 +#include "unicode/uchriter.h"
    1.21 +#include "unicode/normlzr.h"
    1.22 +#include "unicode/utf16.h"
    1.23 +#include "cmemory.h"
    1.24 +#include "normalizer2impl.h"
    1.25 +#include "uprops.h"  // for uniset_getUnicode32Instance()
    1.26 +
    1.27 +U_NAMESPACE_BEGIN
    1.28 +
    1.29 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
    1.30 +
    1.31 +//-------------------------------------------------------------------------
    1.32 +// Constructors and other boilerplate
    1.33 +//-------------------------------------------------------------------------
    1.34 +
    1.35 +Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
    1.36 +    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
    1.37 +    text(new StringCharacterIterator(str)),
    1.38 +    currentIndex(0), nextIndex(0),
    1.39 +    buffer(), bufferPos(0)
    1.40 +{
    1.41 +    init();
    1.42 +}
    1.43 +
    1.44 +Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
    1.45 +    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
    1.46 +    text(new UCharCharacterIterator(str, length)),
    1.47 +    currentIndex(0), nextIndex(0),
    1.48 +    buffer(), bufferPos(0)
    1.49 +{
    1.50 +    init();
    1.51 +}
    1.52 +
    1.53 +Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
    1.54 +    UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
    1.55 +    text(iter.clone()),
    1.56 +    currentIndex(0), nextIndex(0),
    1.57 +    buffer(), bufferPos(0)
    1.58 +{
    1.59 +    init();
    1.60 +}
    1.61 +
    1.62 +Normalizer::Normalizer(const Normalizer &copy) :
    1.63 +    UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
    1.64 +    text(copy.text->clone()),
    1.65 +    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
    1.66 +    buffer(copy.buffer), bufferPos(copy.bufferPos)
    1.67 +{
    1.68 +    init();
    1.69 +}
    1.70 +
    1.71 +void
    1.72 +Normalizer::init() {
    1.73 +    UErrorCode errorCode=U_ZERO_ERROR;
    1.74 +    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
    1.75 +    if(fOptions&UNORM_UNICODE_3_2) {
    1.76 +        delete fFilteredNorm2;
    1.77 +        fNorm2=fFilteredNorm2=
    1.78 +            new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
    1.79 +    }
    1.80 +    if(U_FAILURE(errorCode)) {
    1.81 +        errorCode=U_ZERO_ERROR;
    1.82 +        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
    1.83 +    }
    1.84 +}
    1.85 +
    1.86 +Normalizer::~Normalizer()
    1.87 +{
    1.88 +    delete fFilteredNorm2;
    1.89 +    delete text;
    1.90 +}
    1.91 +
    1.92 +Normalizer* 
    1.93 +Normalizer::clone() const
    1.94 +{
    1.95 +    return new Normalizer(*this);
    1.96 +}
    1.97 +
    1.98 +/**
    1.99 + * Generates a hash code for this iterator.
   1.100 + */
   1.101 +int32_t Normalizer::hashCode() const
   1.102 +{
   1.103 +    return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
   1.104 +}
   1.105 +    
   1.106 +UBool Normalizer::operator==(const Normalizer& that) const
   1.107 +{
   1.108 +    return
   1.109 +        this==&that ||
   1.110 +        (fUMode==that.fUMode &&
   1.111 +        fOptions==that.fOptions &&
   1.112 +        *text==*that.text &&
   1.113 +        buffer==that.buffer &&
   1.114 +        bufferPos==that.bufferPos &&
   1.115 +        nextIndex==that.nextIndex);
   1.116 +}
   1.117 +
   1.118 +//-------------------------------------------------------------------------
   1.119 +// Static utility methods
   1.120 +//-------------------------------------------------------------------------
   1.121 +
   1.122 +void U_EXPORT2
   1.123 +Normalizer::normalize(const UnicodeString& source, 
   1.124 +                      UNormalizationMode mode, int32_t options,
   1.125 +                      UnicodeString& result, 
   1.126 +                      UErrorCode &status) {
   1.127 +    if(source.isBogus() || U_FAILURE(status)) {
   1.128 +        result.setToBogus();
   1.129 +        if(U_SUCCESS(status)) {
   1.130 +            status=U_ILLEGAL_ARGUMENT_ERROR;
   1.131 +        }
   1.132 +    } else {
   1.133 +        UnicodeString localDest;
   1.134 +        UnicodeString *dest;
   1.135 +
   1.136 +        if(&source!=&result) {
   1.137 +            dest=&result;
   1.138 +        } else {
   1.139 +            // the source and result strings are the same object, use a temporary one
   1.140 +            dest=&localDest;
   1.141 +        }
   1.142 +        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
   1.143 +        if(U_SUCCESS(status)) {
   1.144 +            if(options&UNORM_UNICODE_3_2) {
   1.145 +                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
   1.146 +                    normalize(source, *dest, status);
   1.147 +            } else {
   1.148 +                n2->normalize(source, *dest, status);
   1.149 +            }
   1.150 +        }
   1.151 +        if(dest==&localDest && U_SUCCESS(status)) {
   1.152 +            result=*dest;
   1.153 +        }
   1.154 +    }
   1.155 +}
   1.156 +
   1.157 +void U_EXPORT2
   1.158 +Normalizer::compose(const UnicodeString& source, 
   1.159 +                    UBool compat, int32_t options,
   1.160 +                    UnicodeString& result, 
   1.161 +                    UErrorCode &status) {
   1.162 +    normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
   1.163 +}
   1.164 +
   1.165 +void U_EXPORT2
   1.166 +Normalizer::decompose(const UnicodeString& source, 
   1.167 +                      UBool compat, int32_t options,
   1.168 +                      UnicodeString& result, 
   1.169 +                      UErrorCode &status) {
   1.170 +    normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
   1.171 +}
   1.172 +
   1.173 +UNormalizationCheckResult
   1.174 +Normalizer::quickCheck(const UnicodeString& source,
   1.175 +                       UNormalizationMode mode, int32_t options,
   1.176 +                       UErrorCode &status) {
   1.177 +    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
   1.178 +    if(U_SUCCESS(status)) {
   1.179 +        if(options&UNORM_UNICODE_3_2) {
   1.180 +            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
   1.181 +                quickCheck(source, status);
   1.182 +        } else {
   1.183 +            return n2->quickCheck(source, status);
   1.184 +        }
   1.185 +    } else {
   1.186 +        return UNORM_MAYBE;
   1.187 +    }
   1.188 +}
   1.189 +
   1.190 +UBool
   1.191 +Normalizer::isNormalized(const UnicodeString& source,
   1.192 +                         UNormalizationMode mode, int32_t options,
   1.193 +                         UErrorCode &status) {
   1.194 +    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
   1.195 +    if(U_SUCCESS(status)) {
   1.196 +        if(options&UNORM_UNICODE_3_2) {
   1.197 +            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
   1.198 +                isNormalized(source, status);
   1.199 +        } else {
   1.200 +            return n2->isNormalized(source, status);
   1.201 +        }
   1.202 +    } else {
   1.203 +        return FALSE;
   1.204 +    }
   1.205 +}
   1.206 +
   1.207 +UnicodeString & U_EXPORT2
   1.208 +Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
   1.209 +                        UnicodeString &result,
   1.210 +                        UNormalizationMode mode, int32_t options,
   1.211 +                        UErrorCode &errorCode) {
   1.212 +    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
   1.213 +        result.setToBogus();
   1.214 +        if(U_SUCCESS(errorCode)) {
   1.215 +            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1.216 +        }
   1.217 +    } else {
   1.218 +        UnicodeString localDest;
   1.219 +        UnicodeString *dest;
   1.220 +
   1.221 +        if(&right!=&result) {
   1.222 +            dest=&result;
   1.223 +        } else {
   1.224 +            // the right and result strings are the same object, use a temporary one
   1.225 +            dest=&localDest;
   1.226 +        }
   1.227 +        *dest=left;
   1.228 +        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
   1.229 +        if(U_SUCCESS(errorCode)) {
   1.230 +            if(options&UNORM_UNICODE_3_2) {
   1.231 +                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
   1.232 +                    append(*dest, right, errorCode);
   1.233 +            } else {
   1.234 +                n2->append(*dest, right, errorCode);
   1.235 +            }
   1.236 +        }
   1.237 +        if(dest==&localDest && U_SUCCESS(errorCode)) {
   1.238 +            result=*dest;
   1.239 +        }
   1.240 +    }
   1.241 +    return result;
   1.242 +}
   1.243 +
   1.244 +//-------------------------------------------------------------------------
   1.245 +// Iteration API
   1.246 +//-------------------------------------------------------------------------
   1.247 +
   1.248 +/**
   1.249 + * Return the current character in the normalized text.
   1.250 + */
   1.251 +UChar32 Normalizer::current() {
   1.252 +    if(bufferPos<buffer.length() || nextNormalize()) {
   1.253 +        return buffer.char32At(bufferPos);
   1.254 +    } else {
   1.255 +        return DONE;
   1.256 +    }
   1.257 +}
   1.258 +
   1.259 +/**
   1.260 + * Return the next character in the normalized text and advance
   1.261 + * the iteration position by one.  If the end
   1.262 + * of the text has already been reached, {@link #DONE} is returned.
   1.263 + */
   1.264 +UChar32 Normalizer::next() {
   1.265 +    if(bufferPos<buffer.length() ||  nextNormalize()) {
   1.266 +        UChar32 c=buffer.char32At(bufferPos);
   1.267 +        bufferPos+=U16_LENGTH(c);
   1.268 +        return c;
   1.269 +    } else {
   1.270 +        return DONE;
   1.271 +    }
   1.272 +}
   1.273 +
   1.274 +/**
   1.275 + * Return the previous character in the normalized text and decrement
   1.276 + * the iteration position by one.  If the beginning
   1.277 + * of the text has already been reached, {@link #DONE} is returned.
   1.278 + */
   1.279 +UChar32 Normalizer::previous() {
   1.280 +    if(bufferPos>0 || previousNormalize()) {
   1.281 +        UChar32 c=buffer.char32At(bufferPos-1);
   1.282 +        bufferPos-=U16_LENGTH(c);
   1.283 +        return c;
   1.284 +    } else {
   1.285 +        return DONE;
   1.286 +    }
   1.287 +}
   1.288 +
   1.289 +void Normalizer::reset() {
   1.290 +    currentIndex=nextIndex=text->setToStart();
   1.291 +    clearBuffer();
   1.292 +}
   1.293 +
   1.294 +void
   1.295 +Normalizer::setIndexOnly(int32_t index) {
   1.296 +    text->setIndex(index);  // pins index
   1.297 +    currentIndex=nextIndex=text->getIndex();
   1.298 +    clearBuffer();
   1.299 +}
   1.300 +
   1.301 +/**
   1.302 + * Return the first character in the normalized text.  This resets
   1.303 + * the <tt>Normalizer's</tt> position to the beginning of the text.
   1.304 + */
   1.305 +UChar32 Normalizer::first() {
   1.306 +    reset();
   1.307 +    return next();
   1.308 +}
   1.309 +
   1.310 +/**
   1.311 + * Return the last character in the normalized text.  This resets
   1.312 + * the <tt>Normalizer's</tt> position to be just before the
   1.313 + * the input text corresponding to that normalized character.
   1.314 + */
   1.315 +UChar32 Normalizer::last() {
   1.316 +    currentIndex=nextIndex=text->setToEnd();
   1.317 +    clearBuffer();
   1.318 +    return previous();
   1.319 +}
   1.320 +
   1.321 +/**
   1.322 + * Retrieve the current iteration position in the input text that is
   1.323 + * being normalized.  This method is useful in applications such as
   1.324 + * searching, where you need to be able to determine the position in
   1.325 + * the input text that corresponds to a given normalized output character.
   1.326 + * <p>
   1.327 + * <b>Note:</b> This method sets the position in the <em>input</em>, while
   1.328 + * {@link #next} and {@link #previous} iterate through characters in the
   1.329 + * <em>output</em>.  This means that there is not necessarily a one-to-one
   1.330 + * correspondence between characters returned by <tt>next</tt> and
   1.331 + * <tt>previous</tt> and the indices passed to and returned from
   1.332 + * <tt>setIndex</tt> and {@link #getIndex}.
   1.333 + *
   1.334 + */
   1.335 +int32_t Normalizer::getIndex() const {
   1.336 +    if(bufferPos<buffer.length()) {
   1.337 +        return currentIndex;
   1.338 +    } else {
   1.339 +        return nextIndex;
   1.340 +    }
   1.341 +}
   1.342 +
   1.343 +/**
   1.344 + * Retrieve the index of the start of the input text.  This is the begin index
   1.345 + * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
   1.346 + * over which this <tt>Normalizer</tt> is iterating
   1.347 + */
   1.348 +int32_t Normalizer::startIndex() const {
   1.349 +    return text->startIndex();
   1.350 +}
   1.351 +
   1.352 +/**
   1.353 + * Retrieve the index of the end of the input text.  This is the end index
   1.354 + * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
   1.355 + * over which this <tt>Normalizer</tt> is iterating
   1.356 + */
   1.357 +int32_t Normalizer::endIndex() const {
   1.358 +    return text->endIndex();
   1.359 +}
   1.360 +
   1.361 +//-------------------------------------------------------------------------
   1.362 +// Property access methods
   1.363 +//-------------------------------------------------------------------------
   1.364 +
   1.365 +void
   1.366 +Normalizer::setMode(UNormalizationMode newMode) 
   1.367 +{
   1.368 +    fUMode = newMode;
   1.369 +    init();
   1.370 +}
   1.371 +
   1.372 +UNormalizationMode
   1.373 +Normalizer::getUMode() const
   1.374 +{
   1.375 +    return fUMode;
   1.376 +}
   1.377 +
   1.378 +void
   1.379 +Normalizer::setOption(int32_t option, 
   1.380 +                      UBool value) 
   1.381 +{
   1.382 +    if (value) {
   1.383 +        fOptions |= option;
   1.384 +    } else {
   1.385 +        fOptions &= (~option);
   1.386 +    }
   1.387 +    init();
   1.388 +}
   1.389 +
   1.390 +UBool
   1.391 +Normalizer::getOption(int32_t option) const
   1.392 +{
   1.393 +    return (fOptions & option) != 0;
   1.394 +}
   1.395 +
   1.396 +/**
   1.397 + * Set the input text over which this <tt>Normalizer</tt> will iterate.
   1.398 + * The iteration position is set to the beginning of the input text.
   1.399 + */
   1.400 +void
   1.401 +Normalizer::setText(const UnicodeString& newText, 
   1.402 +                    UErrorCode &status)
   1.403 +{
   1.404 +    if (U_FAILURE(status)) {
   1.405 +        return;
   1.406 +    }
   1.407 +    CharacterIterator *newIter = new StringCharacterIterator(newText);
   1.408 +    if (newIter == NULL) {
   1.409 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.410 +        return;
   1.411 +    }
   1.412 +    delete text;
   1.413 +    text = newIter;
   1.414 +    reset();
   1.415 +}
   1.416 +
   1.417 +/**
   1.418 + * Set the input text over which this <tt>Normalizer</tt> will iterate.
   1.419 + * The iteration position is set to the beginning of the string.
   1.420 + */
   1.421 +void
   1.422 +Normalizer::setText(const CharacterIterator& newText, 
   1.423 +                    UErrorCode &status) 
   1.424 +{
   1.425 +    if (U_FAILURE(status)) {
   1.426 +        return;
   1.427 +    }
   1.428 +    CharacterIterator *newIter = newText.clone();
   1.429 +    if (newIter == NULL) {
   1.430 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.431 +        return;
   1.432 +    }
   1.433 +    delete text;
   1.434 +    text = newIter;
   1.435 +    reset();
   1.436 +}
   1.437 +
   1.438 +void
   1.439 +Normalizer::setText(const UChar* newText,
   1.440 +                    int32_t length,
   1.441 +                    UErrorCode &status)
   1.442 +{
   1.443 +    if (U_FAILURE(status)) {
   1.444 +        return;
   1.445 +    }
   1.446 +    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
   1.447 +    if (newIter == NULL) {
   1.448 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.449 +        return;
   1.450 +    }
   1.451 +    delete text;
   1.452 +    text = newIter;
   1.453 +    reset();
   1.454 +}
   1.455 +
   1.456 +/**
   1.457 + * Copies the text under iteration into the UnicodeString referred to by "result".
   1.458 + * @param result Receives a copy of the text under iteration.
   1.459 + */
   1.460 +void
   1.461 +Normalizer::getText(UnicodeString&  result) 
   1.462 +{
   1.463 +    text->getText(result);
   1.464 +}
   1.465 +
   1.466 +//-------------------------------------------------------------------------
   1.467 +// Private utility methods
   1.468 +//-------------------------------------------------------------------------
   1.469 +
   1.470 +void Normalizer::clearBuffer() {
   1.471 +    buffer.remove();
   1.472 +    bufferPos=0;
   1.473 +}
   1.474 +
   1.475 +UBool
   1.476 +Normalizer::nextNormalize() {
   1.477 +    clearBuffer();
   1.478 +    currentIndex=nextIndex;
   1.479 +    text->setIndex(nextIndex);
   1.480 +    if(!text->hasNext()) {
   1.481 +        return FALSE;
   1.482 +    }
   1.483 +    // Skip at least one character so we make progress.
   1.484 +    UnicodeString segment(text->next32PostInc());
   1.485 +    while(text->hasNext()) {
   1.486 +        UChar32 c;
   1.487 +        if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
   1.488 +            text->move32(-1, CharacterIterator::kCurrent);
   1.489 +            break;
   1.490 +        }
   1.491 +        segment.append(c);
   1.492 +    }
   1.493 +    nextIndex=text->getIndex();
   1.494 +    UErrorCode errorCode=U_ZERO_ERROR;
   1.495 +    fNorm2->normalize(segment, buffer, errorCode);
   1.496 +    return U_SUCCESS(errorCode) && !buffer.isEmpty();
   1.497 +}
   1.498 +
   1.499 +UBool
   1.500 +Normalizer::previousNormalize() {
   1.501 +    clearBuffer();
   1.502 +    nextIndex=currentIndex;
   1.503 +    text->setIndex(currentIndex);
   1.504 +    if(!text->hasPrevious()) {
   1.505 +        return FALSE;
   1.506 +    }
   1.507 +    UnicodeString segment;
   1.508 +    while(text->hasPrevious()) {
   1.509 +        UChar32 c=text->previous32();
   1.510 +        segment.insert(0, c);
   1.511 +        if(fNorm2->hasBoundaryBefore(c)) {
   1.512 +            break;
   1.513 +        }
   1.514 +    }
   1.515 +    currentIndex=text->getIndex();
   1.516 +    UErrorCode errorCode=U_ZERO_ERROR;
   1.517 +    fNorm2->normalize(segment, buffer, errorCode);
   1.518 +    bufferPos=buffer.length();
   1.519 +    return U_SUCCESS(errorCode) && !buffer.isEmpty();
   1.520 +}
   1.521 +
   1.522 +U_NAMESPACE_END
   1.523 +
   1.524 +#endif /* #if !UCONFIG_NO_NORMALIZATION */

mercurial