The Tor Browser: intl/icu/source/common/normlzr.cpp@fc2d59ddac77

     1 /*

     2  *************************************************************************

     3  * COPYRIGHT:

     4  * Copyright (c) 1996-2012, International Business Machines Corporation and

     5  * others. All Rights Reserved.

     6  *************************************************************************

     7  */

     9 #include "unicode/utypes.h"

    11 #if !UCONFIG_NO_NORMALIZATION

    13 #include "unicode/uniset.h"

    14 #include "unicode/unistr.h"

    15 #include "unicode/chariter.h"

    16 #include "unicode/schriter.h"

    17 #include "unicode/uchriter.h"

    18 #include "unicode/normlzr.h"

    19 #include "unicode/utf16.h"

    20 #include "cmemory.h"

    21 #include "normalizer2impl.h"

    22 #include "uprops.h"  // for uniset_getUnicode32Instance()

    24 U_NAMESPACE_BEGIN

    26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)

    28 //-------------------------------------------------------------------------

    29 // Constructors and other boilerplate

    30 //-------------------------------------------------------------------------

    32 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :

    33     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),

    34     text(new StringCharacterIterator(str)),

    35     currentIndex(0), nextIndex(0),

    36     buffer(), bufferPos(0)

    37 {

    38     init();

    39 }

    41 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :

    42     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),

    43     text(new UCharCharacterIterator(str, length)),

    44     currentIndex(0), nextIndex(0),

    45     buffer(), bufferPos(0)

    46 {

    47     init();

    48 }

    50 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :

    51     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),

    52     text(iter.clone()),

    53     currentIndex(0), nextIndex(0),

    54     buffer(), bufferPos(0)

    55 {

    56     init();

    57 }

    59 Normalizer::Normalizer(const Normalizer &copy) :

    60     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),

    61     text(copy.text->clone()),

    62     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),

    63     buffer(copy.buffer), bufferPos(copy.bufferPos)

    64 {

    65     init();

    66 }

    68 void

    69 Normalizer::init() {

    70     UErrorCode errorCode=U_ZERO_ERROR;

    71     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);

    72     if(fOptions&UNORM_UNICODE_3_2) {

    73         delete fFilteredNorm2;

    74         fNorm2=fFilteredNorm2=

    75             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));

    76     }

    77     if(U_FAILURE(errorCode)) {

    78         errorCode=U_ZERO_ERROR;

    79         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);

    80     }

    81 }

    83 Normalizer::~Normalizer()

    84 {

    85     delete fFilteredNorm2;

    86     delete text;

    87 }

    89 Normalizer*

    90 Normalizer::clone() const

    91 {

    92     return new Normalizer(*this);

    93 }

    95 /**

    96  * Generates a hash code for this iterator.

    97  */

    98 int32_t Normalizer::hashCode() const

    99 {

   100     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;

   101 }

   103 UBool Normalizer::operator==(const Normalizer& that) const

   104 {

   105     return

   106         this==&that ||

   107         (fUMode==that.fUMode &&

   108         fOptions==that.fOptions &&

   109         *text==*that.text &&

   110         buffer==that.buffer &&

   111         bufferPos==that.bufferPos &&

   112         nextIndex==that.nextIndex);

   113 }

   115 //-------------------------------------------------------------------------

   116 // Static utility methods

   117 //-------------------------------------------------------------------------

   119 void U_EXPORT2

   120 Normalizer::normalize(const UnicodeString& source,

   121                       UNormalizationMode mode, int32_t options,

   122                       UnicodeString& result,

   123                       UErrorCode &status) {

   124     if(source.isBogus() || U_FAILURE(status)) {

   125         result.setToBogus();

   126         if(U_SUCCESS(status)) {

   127             status=U_ILLEGAL_ARGUMENT_ERROR;

   128         }

   129     } else {

   130         UnicodeString localDest;

   131         UnicodeString *dest;

   133         if(&source!=&result) {

   134             dest=&result;

   135         } else {

   136             // the source and result strings are the same object, use a temporary one

   137             dest=&localDest;

   138         }

   139         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

   140         if(U_SUCCESS(status)) {

   141             if(options&UNORM_UNICODE_3_2) {

   142                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).

   143                     normalize(source, *dest, status);

   144             } else {

   145                 n2->normalize(source, *dest, status);

   146             }

   147         }

   148         if(dest==&localDest && U_SUCCESS(status)) {

   149             result=*dest;

   150         }

   151     }

   152 }

   154 void U_EXPORT2

   155 Normalizer::compose(const UnicodeString& source,

   156                     UBool compat, int32_t options,

   157                     UnicodeString& result,

   158                     UErrorCode &status) {

   159     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);

   160 }

   162 void U_EXPORT2

   163 Normalizer::decompose(const UnicodeString& source,

   164                       UBool compat, int32_t options,

   165                       UnicodeString& result,

   166                       UErrorCode &status) {

   167     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);

   168 }

   170 UNormalizationCheckResult

   171 Normalizer::quickCheck(const UnicodeString& source,

   172                        UNormalizationMode mode, int32_t options,

   173                        UErrorCode &status) {

   174     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

   175     if(U_SUCCESS(status)) {

   176         if(options&UNORM_UNICODE_3_2) {

   177             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).

   178                 quickCheck(source, status);

   179         } else {

   180             return n2->quickCheck(source, status);

   181         }

   182     } else {

   183         return UNORM_MAYBE;

   184     }

   185 }

   187 UBool

   188 Normalizer::isNormalized(const UnicodeString& source,

   189                          UNormalizationMode mode, int32_t options,

   190                          UErrorCode &status) {

   191     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

   192     if(U_SUCCESS(status)) {

   193         if(options&UNORM_UNICODE_3_2) {

   194             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).

   195                 isNormalized(source, status);

   196         } else {

   197             return n2->isNormalized(source, status);

   198         }

   199     } else {

   200         return FALSE;

   201     }

   202 }

   204 UnicodeString & U_EXPORT2

   205 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,

   206                         UnicodeString &result,

   207                         UNormalizationMode mode, int32_t options,

   208                         UErrorCode &errorCode) {

   209     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {

   210         result.setToBogus();

   211         if(U_SUCCESS(errorCode)) {

   212             errorCode=U_ILLEGAL_ARGUMENT_ERROR;

   213         }

   214     } else {

   215         UnicodeString localDest;

   216         UnicodeString *dest;

   218         if(&right!=&result) {

   219             dest=&result;

   220         } else {

   221             // the right and result strings are the same object, use a temporary one

   222             dest=&localDest;

   223         }

   224         *dest=left;

   225         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);

   226         if(U_SUCCESS(errorCode)) {

   227             if(options&UNORM_UNICODE_3_2) {

   228                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).

   229                     append(*dest, right, errorCode);

   230             } else {

   231                 n2->append(*dest, right, errorCode);

   232             }

   233         }

   234         if(dest==&localDest && U_SUCCESS(errorCode)) {

   235             result=*dest;

   236         }

   237     }

   238     return result;

   239 }

   241 //-------------------------------------------------------------------------

   242 // Iteration API

   243 //-------------------------------------------------------------------------

   245 /**

   246  * Return the current character in the normalized text.

   247  */

   248 UChar32 Normalizer::current() {

   249     if(bufferPos<buffer.length() || nextNormalize()) {

   250         return buffer.char32At(bufferPos);

   251     } else {

   252         return DONE;

   253     }

   254 }

   256 /**

   257  * Return the next character in the normalized text and advance

   258  * the iteration position by one.  If the end

   259  * of the text has already been reached, {@link #DONE} is returned.

   260  */

   261 UChar32 Normalizer::next() {

   262     if(bufferPos<buffer.length() ||  nextNormalize()) {

   263         UChar32 c=buffer.char32At(bufferPos);

   264         bufferPos+=U16_LENGTH(c);

   265         return c;

   266     } else {

   267         return DONE;

   268     }

   269 }

   271 /**

   272  * Return the previous character in the normalized text and decrement

   273  * the iteration position by one.  If the beginning

   274  * of the text has already been reached, {@link #DONE} is returned.

   275  */

   276 UChar32 Normalizer::previous() {

   277     if(bufferPos>0 || previousNormalize()) {

   278         UChar32 c=buffer.char32At(bufferPos-1);

   279         bufferPos-=U16_LENGTH(c);

   280         return c;

   281     } else {

   282         return DONE;

   283     }

   284 }

   286 void Normalizer::reset() {

   287     currentIndex=nextIndex=text->setToStart();

   288     clearBuffer();

   289 }

   291 void

   292 Normalizer::setIndexOnly(int32_t index) {

   293     text->setIndex(index);  // pins index

   294     currentIndex=nextIndex=text->getIndex();

   295     clearBuffer();

   296 }

   298 /**

   299  * Return the first character in the normalized text.  This resets

   300  * the <tt>Normalizer's</tt> position to the beginning of the text.

   301  */

   302 UChar32 Normalizer::first() {

   303     reset();

   304     return next();

   305 }

   307 /**

   308  * Return the last character in the normalized text.  This resets

   309  * the <tt>Normalizer's</tt> position to be just before the

   310  * the input text corresponding to that normalized character.

   311  */

   312 UChar32 Normalizer::last() {

   313     currentIndex=nextIndex=text->setToEnd();

   314     clearBuffer();

   315     return previous();

   316 }

   318 /**

   319  * Retrieve the current iteration position in the input text that is

   320  * being normalized.  This method is useful in applications such as

   321  * searching, where you need to be able to determine the position in

   322  * the input text that corresponds to a given normalized output character.

   323  * <p>

   324  * <b>Note:</b> This method sets the position in the <em>input</em>, while

   325  * {@link #next} and {@link #previous} iterate through characters in the

   326  * <em>output</em>.  This means that there is not necessarily a one-to-one

   327  * correspondence between characters returned by <tt>next</tt> and

   328  * <tt>previous</tt> and the indices passed to and returned from

   329  * <tt>setIndex</tt> and {@link #getIndex}.

   330  *

   331  */

   332 int32_t Normalizer::getIndex() const {

   333     if(bufferPos<buffer.length()) {

   334         return currentIndex;

   335     } else {

   336         return nextIndex;

   337     }

   338 }

   340 /**

   341  * Retrieve the index of the start of the input text.  This is the begin index

   342  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>

   343  * over which this <tt>Normalizer</tt> is iterating

   344  */

   345 int32_t Normalizer::startIndex() const {

   346     return text->startIndex();

   347 }

   349 /**

   350  * Retrieve the index of the end of the input text.  This is the end index

   351  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>

   352  * over which this <tt>Normalizer</tt> is iterating

   353  */

   354 int32_t Normalizer::endIndex() const {

   355     return text->endIndex();

   356 }

   358 //-------------------------------------------------------------------------

   359 // Property access methods

   360 //-------------------------------------------------------------------------

   362 void

   363 Normalizer::setMode(UNormalizationMode newMode)

   364 {

   365     fUMode = newMode;

   366     init();

   367 }

   369 UNormalizationMode

   370 Normalizer::getUMode() const

   371 {

   372     return fUMode;

   373 }

   375 void

   376 Normalizer::setOption(int32_t option,

   377                       UBool value)

   378 {

   379     if (value) {

   380         fOptions |= option;

   381     } else {

   382         fOptions &= (~option);

   383     }

   384     init();

   385 }

   387 UBool

   388 Normalizer::getOption(int32_t option) const

   389 {

   390     return (fOptions & option) != 0;

   391 }

   393 /**

   394  * Set the input text over which this <tt>Normalizer</tt> will iterate.

   395  * The iteration position is set to the beginning of the input text.

   396  */

   397 void

   398 Normalizer::setText(const UnicodeString& newText,

   399                     UErrorCode &status)

   400 {

   401     if (U_FAILURE(status)) {

   402         return;

   403     }

   404     CharacterIterator *newIter = new StringCharacterIterator(newText);

   405     if (newIter == NULL) {

   406         status = U_MEMORY_ALLOCATION_ERROR;

   407         return;

   408     }

   409     delete text;

   410     text = newIter;

   411     reset();

   412 }

   414 /**

   415  * Set the input text over which this <tt>Normalizer</tt> will iterate.

   416  * The iteration position is set to the beginning of the string.

   417  */

   418 void

   419 Normalizer::setText(const CharacterIterator& newText,

   420                     UErrorCode &status)

   421 {

   422     if (U_FAILURE(status)) {

   423         return;

   424     }

   425     CharacterIterator *newIter = newText.clone();

   426     if (newIter == NULL) {

   427         status = U_MEMORY_ALLOCATION_ERROR;

   428         return;

   429     }

   430     delete text;

   431     text = newIter;

   432     reset();

   433 }

   435 void

   436 Normalizer::setText(const UChar* newText,

   437                     int32_t length,

   438                     UErrorCode &status)

   439 {

   440     if (U_FAILURE(status)) {

   441         return;

   442     }

   443     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);

   444     if (newIter == NULL) {

   445         status = U_MEMORY_ALLOCATION_ERROR;

   446         return;

   447     }

   448     delete text;

   449     text = newIter;

   450     reset();

   451 }

   453 /**

   454  * Copies the text under iteration into the UnicodeString referred to by "result".

   455  * @param result Receives a copy of the text under iteration.

   456  */

   457 void

   458 Normalizer::getText(UnicodeString&  result)

   459 {

   460     text->getText(result);

   461 }

   463 //-------------------------------------------------------------------------

   464 // Private utility methods

   465 //-------------------------------------------------------------------------

   467 void Normalizer::clearBuffer() {

   468     buffer.remove();

   469     bufferPos=0;

   470 }

   472 UBool

   473 Normalizer::nextNormalize() {

   474     clearBuffer();

   475     currentIndex=nextIndex;

   476     text->setIndex(nextIndex);

   477     if(!text->hasNext()) {

   478         return FALSE;

   479     }

   480     // Skip at least one character so we make progress.

   481     UnicodeString segment(text->next32PostInc());

   482     while(text->hasNext()) {

   483         UChar32 c;

   484         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {

   485             text->move32(-1, CharacterIterator::kCurrent);

   486             break;

   487         }

   488         segment.append(c);

   489     }

   490     nextIndex=text->getIndex();

   491     UErrorCode errorCode=U_ZERO_ERROR;

   492     fNorm2->normalize(segment, buffer, errorCode);

   493     return U_SUCCESS(errorCode) && !buffer.isEmpty();

   494 }

   496 UBool

   497 Normalizer::previousNormalize() {

   498     clearBuffer();

   499     nextIndex=currentIndex;

   500     text->setIndex(currentIndex);

   501     if(!text->hasPrevious()) {

   502         return FALSE;

   503     }

   504     UnicodeString segment;

   505     while(text->hasPrevious()) {

   506         UChar32 c=text->previous32();

   507         segment.insert(0, c);

   508         if(fNorm2->hasBoundaryBefore(c)) {

   509             break;

   510         }

   511     }

   512     currentIndex=text->getIndex();

   513     UErrorCode errorCode=U_ZERO_ERROR;

   514     fNorm2->normalize(segment, buffer, errorCode);

   515     bufferPos=buffer.length();

   516     return U_SUCCESS(errorCode) && !buffer.isEmpty();

   517 }

   519 U_NAMESPACE_END

   521 #endif /* #if !UCONFIG_NO_NORMALIZATION */

The Tor Browser / file revision

intl/icu/source/common/normlzr.cpp@fc2d59ddac77

intl/icu/source/common/normlzr.cpp