intl/icu/source/common/normlzr.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2  *************************************************************************
     3  * COPYRIGHT: 
     4  * Copyright (c) 1996-2012, International Business Machines Corporation and
     5  * others. All Rights Reserved.
     6  *************************************************************************
     7  */
     9 #include "unicode/utypes.h"
    11 #if !UCONFIG_NO_NORMALIZATION
    13 #include "unicode/uniset.h"
    14 #include "unicode/unistr.h"
    15 #include "unicode/chariter.h"
    16 #include "unicode/schriter.h"
    17 #include "unicode/uchriter.h"
    18 #include "unicode/normlzr.h"
    19 #include "unicode/utf16.h"
    20 #include "cmemory.h"
    21 #include "normalizer2impl.h"
    22 #include "uprops.h"  // for uniset_getUnicode32Instance()
    24 U_NAMESPACE_BEGIN
    26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
    28 //-------------------------------------------------------------------------
    29 // Constructors and other boilerplate
    30 //-------------------------------------------------------------------------
    32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
    33     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
    34     text(new StringCharacterIterator(str)),
    35     currentIndex(0), nextIndex(0),
    36     buffer(), bufferPos(0)
    37 {
    38     init();
    39 }
    41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
    42     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
    43     text(new UCharCharacterIterator(str, length)),
    44     currentIndex(0), nextIndex(0),
    45     buffer(), bufferPos(0)
    46 {
    47     init();
    48 }
    50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
    51     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
    52     text(iter.clone()),
    53     currentIndex(0), nextIndex(0),
    54     buffer(), bufferPos(0)
    55 {
    56     init();
    57 }
    59 Normalizer::Normalizer(const Normalizer &copy) :
    60     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
    61     text(copy.text->clone()),
    62     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
    63     buffer(copy.buffer), bufferPos(copy.bufferPos)
    64 {
    65     init();
    66 }
    68 void
    69 Normalizer::init() {
    70     UErrorCode errorCode=U_ZERO_ERROR;
    71     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
    72     if(fOptions&UNORM_UNICODE_3_2) {
    73         delete fFilteredNorm2;
    74         fNorm2=fFilteredNorm2=
    75             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
    76     }
    77     if(U_FAILURE(errorCode)) {
    78         errorCode=U_ZERO_ERROR;
    79         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
    80     }
    81 }
    83 Normalizer::~Normalizer()
    84 {
    85     delete fFilteredNorm2;
    86     delete text;
    87 }
    89 Normalizer* 
    90 Normalizer::clone() const
    91 {
    92     return new Normalizer(*this);
    93 }
    95 /**
    96  * Generates a hash code for this iterator.
    97  */
    98 int32_t Normalizer::hashCode() const
    99 {
   100     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
   101 }
   103 UBool Normalizer::operator==(const Normalizer& that) const
   104 {
   105     return
   106         this==&that ||
   107         (fUMode==that.fUMode &&
   108         fOptions==that.fOptions &&
   109         *text==*that.text &&
   110         buffer==that.buffer &&
   111         bufferPos==that.bufferPos &&
   112         nextIndex==that.nextIndex);
   113 }
   115 //-------------------------------------------------------------------------
   116 // Static utility methods
   117 //-------------------------------------------------------------------------
   119 void U_EXPORT2
   120 Normalizer::normalize(const UnicodeString& source, 
   121                       UNormalizationMode mode, int32_t options,
   122                       UnicodeString& result, 
   123                       UErrorCode &status) {
   124     if(source.isBogus() || U_FAILURE(status)) {
   125         result.setToBogus();
   126         if(U_SUCCESS(status)) {
   127             status=U_ILLEGAL_ARGUMENT_ERROR;
   128         }
   129     } else {
   130         UnicodeString localDest;
   131         UnicodeString *dest;
   133         if(&source!=&result) {
   134             dest=&result;
   135         } else {
   136             // the source and result strings are the same object, use a temporary one
   137             dest=&localDest;
   138         }
   139         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
   140         if(U_SUCCESS(status)) {
   141             if(options&UNORM_UNICODE_3_2) {
   142                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
   143                     normalize(source, *dest, status);
   144             } else {
   145                 n2->normalize(source, *dest, status);
   146             }
   147         }
   148         if(dest==&localDest && U_SUCCESS(status)) {
   149             result=*dest;
   150         }
   151     }
   152 }
   154 void U_EXPORT2
   155 Normalizer::compose(const UnicodeString& source, 
   156                     UBool compat, int32_t options,
   157                     UnicodeString& result, 
   158                     UErrorCode &status) {
   159     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
   160 }
   162 void U_EXPORT2
   163 Normalizer::decompose(const UnicodeString& source, 
   164                       UBool compat, int32_t options,
   165                       UnicodeString& result, 
   166                       UErrorCode &status) {
   167     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
   168 }
   170 UNormalizationCheckResult
   171 Normalizer::quickCheck(const UnicodeString& source,
   172                        UNormalizationMode mode, int32_t options,
   173                        UErrorCode &status) {
   174     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
   175     if(U_SUCCESS(status)) {
   176         if(options&UNORM_UNICODE_3_2) {
   177             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
   178                 quickCheck(source, status);
   179         } else {
   180             return n2->quickCheck(source, status);
   181         }
   182     } else {
   183         return UNORM_MAYBE;
   184     }
   185 }
   187 UBool
   188 Normalizer::isNormalized(const UnicodeString& source,
   189                          UNormalizationMode mode, int32_t options,
   190                          UErrorCode &status) {
   191     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
   192     if(U_SUCCESS(status)) {
   193         if(options&UNORM_UNICODE_3_2) {
   194             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
   195                 isNormalized(source, status);
   196         } else {
   197             return n2->isNormalized(source, status);
   198         }
   199     } else {
   200         return FALSE;
   201     }
   202 }
   204 UnicodeString & U_EXPORT2
   205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
   206                         UnicodeString &result,
   207                         UNormalizationMode mode, int32_t options,
   208                         UErrorCode &errorCode) {
   209     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
   210         result.setToBogus();
   211         if(U_SUCCESS(errorCode)) {
   212             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   213         }
   214     } else {
   215         UnicodeString localDest;
   216         UnicodeString *dest;
   218         if(&right!=&result) {
   219             dest=&result;
   220         } else {
   221             // the right and result strings are the same object, use a temporary one
   222             dest=&localDest;
   223         }
   224         *dest=left;
   225         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
   226         if(U_SUCCESS(errorCode)) {
   227             if(options&UNORM_UNICODE_3_2) {
   228                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
   229                     append(*dest, right, errorCode);
   230             } else {
   231                 n2->append(*dest, right, errorCode);
   232             }
   233         }
   234         if(dest==&localDest && U_SUCCESS(errorCode)) {
   235             result=*dest;
   236         }
   237     }
   238     return result;
   239 }
   241 //-------------------------------------------------------------------------
   242 // Iteration API
   243 //-------------------------------------------------------------------------
   245 /**
   246  * Return the current character in the normalized text.
   247  */
   248 UChar32 Normalizer::current() {
   249     if(bufferPos<buffer.length() || nextNormalize()) {
   250         return buffer.char32At(bufferPos);
   251     } else {
   252         return DONE;
   253     }
   254 }
   256 /**
   257  * Return the next character in the normalized text and advance
   258  * the iteration position by one.  If the end
   259  * of the text has already been reached, {@link #DONE} is returned.
   260  */
   261 UChar32 Normalizer::next() {
   262     if(bufferPos<buffer.length() ||  nextNormalize()) {
   263         UChar32 c=buffer.char32At(bufferPos);
   264         bufferPos+=U16_LENGTH(c);
   265         return c;
   266     } else {
   267         return DONE;
   268     }
   269 }
   271 /**
   272  * Return the previous character in the normalized text and decrement
   273  * the iteration position by one.  If the beginning
   274  * of the text has already been reached, {@link #DONE} is returned.
   275  */
   276 UChar32 Normalizer::previous() {
   277     if(bufferPos>0 || previousNormalize()) {
   278         UChar32 c=buffer.char32At(bufferPos-1);
   279         bufferPos-=U16_LENGTH(c);
   280         return c;
   281     } else {
   282         return DONE;
   283     }
   284 }
   286 void Normalizer::reset() {
   287     currentIndex=nextIndex=text->setToStart();
   288     clearBuffer();
   289 }
   291 void
   292 Normalizer::setIndexOnly(int32_t index) {
   293     text->setIndex(index);  // pins index
   294     currentIndex=nextIndex=text->getIndex();
   295     clearBuffer();
   296 }
   298 /**
   299  * Return the first character in the normalized text.  This resets
   300  * the <tt>Normalizer's</tt> position to the beginning of the text.
   301  */
   302 UChar32 Normalizer::first() {
   303     reset();
   304     return next();
   305 }
   307 /**
   308  * Return the last character in the normalized text.  This resets
   309  * the <tt>Normalizer's</tt> position to be just before the
   310  * the input text corresponding to that normalized character.
   311  */
   312 UChar32 Normalizer::last() {
   313     currentIndex=nextIndex=text->setToEnd();
   314     clearBuffer();
   315     return previous();
   316 }
   318 /**
   319  * Retrieve the current iteration position in the input text that is
   320  * being normalized.  This method is useful in applications such as
   321  * searching, where you need to be able to determine the position in
   322  * the input text that corresponds to a given normalized output character.
   323  * <p>
   324  * <b>Note:</b> This method sets the position in the <em>input</em>, while
   325  * {@link #next} and {@link #previous} iterate through characters in the
   326  * <em>output</em>.  This means that there is not necessarily a one-to-one
   327  * correspondence between characters returned by <tt>next</tt> and
   328  * <tt>previous</tt> and the indices passed to and returned from
   329  * <tt>setIndex</tt> and {@link #getIndex}.
   330  *
   331  */
   332 int32_t Normalizer::getIndex() const {
   333     if(bufferPos<buffer.length()) {
   334         return currentIndex;
   335     } else {
   336         return nextIndex;
   337     }
   338 }
   340 /**
   341  * Retrieve the index of the start of the input text.  This is the begin index
   342  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
   343  * over which this <tt>Normalizer</tt> is iterating
   344  */
   345 int32_t Normalizer::startIndex() const {
   346     return text->startIndex();
   347 }
   349 /**
   350  * Retrieve the index of the end of the input text.  This is the end index
   351  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
   352  * over which this <tt>Normalizer</tt> is iterating
   353  */
   354 int32_t Normalizer::endIndex() const {
   355     return text->endIndex();
   356 }
   358 //-------------------------------------------------------------------------
   359 // Property access methods
   360 //-------------------------------------------------------------------------
   362 void
   363 Normalizer::setMode(UNormalizationMode newMode) 
   364 {
   365     fUMode = newMode;
   366     init();
   367 }
   369 UNormalizationMode
   370 Normalizer::getUMode() const
   371 {
   372     return fUMode;
   373 }
   375 void
   376 Normalizer::setOption(int32_t option, 
   377                       UBool value) 
   378 {
   379     if (value) {
   380         fOptions |= option;
   381     } else {
   382         fOptions &= (~option);
   383     }
   384     init();
   385 }
   387 UBool
   388 Normalizer::getOption(int32_t option) const
   389 {
   390     return (fOptions & option) != 0;
   391 }
   393 /**
   394  * Set the input text over which this <tt>Normalizer</tt> will iterate.
   395  * The iteration position is set to the beginning of the input text.
   396  */
   397 void
   398 Normalizer::setText(const UnicodeString& newText, 
   399                     UErrorCode &status)
   400 {
   401     if (U_FAILURE(status)) {
   402         return;
   403     }
   404     CharacterIterator *newIter = new StringCharacterIterator(newText);
   405     if (newIter == NULL) {
   406         status = U_MEMORY_ALLOCATION_ERROR;
   407         return;
   408     }
   409     delete text;
   410     text = newIter;
   411     reset();
   412 }
   414 /**
   415  * Set the input text over which this <tt>Normalizer</tt> will iterate.
   416  * The iteration position is set to the beginning of the string.
   417  */
   418 void
   419 Normalizer::setText(const CharacterIterator& newText, 
   420                     UErrorCode &status) 
   421 {
   422     if (U_FAILURE(status)) {
   423         return;
   424     }
   425     CharacterIterator *newIter = newText.clone();
   426     if (newIter == NULL) {
   427         status = U_MEMORY_ALLOCATION_ERROR;
   428         return;
   429     }
   430     delete text;
   431     text = newIter;
   432     reset();
   433 }
   435 void
   436 Normalizer::setText(const UChar* newText,
   437                     int32_t length,
   438                     UErrorCode &status)
   439 {
   440     if (U_FAILURE(status)) {
   441         return;
   442     }
   443     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
   444     if (newIter == NULL) {
   445         status = U_MEMORY_ALLOCATION_ERROR;
   446         return;
   447     }
   448     delete text;
   449     text = newIter;
   450     reset();
   451 }
   453 /**
   454  * Copies the text under iteration into the UnicodeString referred to by "result".
   455  * @param result Receives a copy of the text under iteration.
   456  */
   457 void
   458 Normalizer::getText(UnicodeString&  result) 
   459 {
   460     text->getText(result);
   461 }
   463 //-------------------------------------------------------------------------
   464 // Private utility methods
   465 //-------------------------------------------------------------------------
   467 void Normalizer::clearBuffer() {
   468     buffer.remove();
   469     bufferPos=0;
   470 }
   472 UBool
   473 Normalizer::nextNormalize() {
   474     clearBuffer();
   475     currentIndex=nextIndex;
   476     text->setIndex(nextIndex);
   477     if(!text->hasNext()) {
   478         return FALSE;
   479     }
   480     // Skip at least one character so we make progress.
   481     UnicodeString segment(text->next32PostInc());
   482     while(text->hasNext()) {
   483         UChar32 c;
   484         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
   485             text->move32(-1, CharacterIterator::kCurrent);
   486             break;
   487         }
   488         segment.append(c);
   489     }
   490     nextIndex=text->getIndex();
   491     UErrorCode errorCode=U_ZERO_ERROR;
   492     fNorm2->normalize(segment, buffer, errorCode);
   493     return U_SUCCESS(errorCode) && !buffer.isEmpty();
   494 }
   496 UBool
   497 Normalizer::previousNormalize() {
   498     clearBuffer();
   499     nextIndex=currentIndex;
   500     text->setIndex(currentIndex);
   501     if(!text->hasPrevious()) {
   502         return FALSE;
   503     }
   504     UnicodeString segment;
   505     while(text->hasPrevious()) {
   506         UChar32 c=text->previous32();
   507         segment.insert(0, c);
   508         if(fNorm2->hasBoundaryBefore(c)) {
   509             break;
   510         }
   511     }
   512     currentIndex=text->getIndex();
   513     UErrorCode errorCode=U_ZERO_ERROR;
   514     fNorm2->normalize(segment, buffer, errorCode);
   515     bufferPos=buffer.length();
   516     return U_SUCCESS(errorCode) && !buffer.isEmpty();
   517 }
   519 U_NAMESPACE_END
   521 #endif /* #if !UCONFIG_NO_NORMALIZATION */

mercurial