The Tor Browser: intl/icu/source/common/normalizer2impl.cpp@fc2d59ddac77

     1 /*

     2 *******************************************************************************

3 *

     4 *   Copyright (C) 2009-2013, International Business Machines

     5 *   Corporation and others.  All Rights Reserved.

6 *

     7 *******************************************************************************

     8 *   file name:  normalizer2impl.cpp

     9 *   encoding:   US-ASCII

    10 *   tab size:   8 (not used)

    11 *   indentation:4

    12 *

    13 *   created on: 2009nov22

    14 *   created by: Markus W. Scherer

    15 */

    17 #include "unicode/utypes.h"

    19 #if !UCONFIG_NO_NORMALIZATION

    21 #include "unicode/normalizer2.h"

    22 #include "unicode/udata.h"

    23 #include "unicode/ustring.h"

    24 #include "unicode/utf16.h"

    25 #include "cmemory.h"

    26 #include "mutex.h"

    27 #include "normalizer2impl.h"

    28 #include "putilimp.h"

    29 #include "uassert.h"

    30 #include "uset_imp.h"

    31 #include "utrie2.h"

    32 #include "uvector.h"

    34 U_NAMESPACE_BEGIN

    36 // ReorderingBuffer -------------------------------------------------------- ***

    38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {

    39     int32_t length=str.length();

    40     start=str.getBuffer(destCapacity);

    41     if(start==NULL) {

    42         // getBuffer() already did str.setToBogus()

    43         errorCode=U_MEMORY_ALLOCATION_ERROR;

    44         return FALSE;

    45     }

    46     limit=start+length;

    47     remainingCapacity=str.getCapacity()-length;

    48     reorderStart=start;

    49     if(start==limit) {

    50         lastCC=0;

    51     } else {

    52         setIterator();

    53         lastCC=previousCC();

    54         // Set reorderStart after the last code point with cc<=1 if there is one.

    55         if(lastCC>1) {

    56             while(previousCC()>1) {}

    57         }

    58         reorderStart=codePointLimit;

    59     }

    60     return TRUE;

    61 }

    63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {

    64     int32_t length=(int32_t)(limit-start);

    65     return

    66         length==(int32_t)(otherLimit-otherStart) &&

    67         0==u_memcmp(start, otherStart, length);

    68 }

    70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {

    71     if(remainingCapacity<2 && !resize(2, errorCode)) {

    72         return FALSE;

    73     }

    74     if(lastCC<=cc || cc==0) {

    75         limit[0]=U16_LEAD(c);

    76         limit[1]=U16_TRAIL(c);

    77         limit+=2;

    78         lastCC=cc;

    79         if(cc<=1) {

    80             reorderStart=limit;

    81         }

    82     } else {

    83         insert(c, cc);

    84     }

    85     remainingCapacity-=2;

    86     return TRUE;

    87 }

    89 UBool ReorderingBuffer::append(const UChar *s, int32_t length,

    90                                uint8_t leadCC, uint8_t trailCC,

    91                                UErrorCode &errorCode) {

    92     if(length==0) {

    93         return TRUE;

    94     }

    95     if(remainingCapacity<length && !resize(length, errorCode)) {

    96         return FALSE;

    97     }

    98     remainingCapacity-=length;

    99     if(lastCC<=leadCC || leadCC==0) {

   100         if(trailCC<=1) {

   101             reorderStart=limit+length;

   102         } else if(leadCC<=1) {

   103             reorderStart=limit+1;  // Ok if not a code point boundary.

   104         }

   105         const UChar *sLimit=s+length;

   106         do { *limit++=*s++; } while(s!=sLimit);

   107         lastCC=trailCC;

   108     } else {

   109         int32_t i=0;

   110         UChar32 c;

   111         U16_NEXT(s, i, length, c);

   112         insert(c, leadCC);  // insert first code point

   113         while(i<length) {

   114             U16_NEXT(s, i, length, c);

   115             if(i<length) {

   116                 // s must be in NFD, otherwise we need to use getCC().

   117                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));

   118             } else {

   119                 leadCC=trailCC;

   120             }

   121             append(c, leadCC, errorCode);

   122         }

   123     }

   124     return TRUE;

   125 }

   127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {

   128     int32_t cpLength=U16_LENGTH(c);

   129     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {

   130         return FALSE;

   131     }

   132     remainingCapacity-=cpLength;

   133     if(cpLength==1) {

   134         *limit++=(UChar)c;

   135     } else {

   136         limit[0]=U16_LEAD(c);

   137         limit[1]=U16_TRAIL(c);

   138         limit+=2;

   139     }

   140     lastCC=0;

   141     reorderStart=limit;

   142     return TRUE;

   143 }

   145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {

   146     if(s==sLimit) {

   147         return TRUE;

   148     }

   149     int32_t length=(int32_t)(sLimit-s);

   150     if(remainingCapacity<length && !resize(length, errorCode)) {

   151         return FALSE;

   152     }

   153     u_memcpy(limit, s, length);

   154     limit+=length;

   155     remainingCapacity-=length;

   156     lastCC=0;

   157     reorderStart=limit;

   158     return TRUE;

   159 }

   161 void ReorderingBuffer::remove() {

   162     reorderStart=limit=start;

   163     remainingCapacity=str.getCapacity();

   164     lastCC=0;

   165 }

   167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {

   168     if(suffixLength<(limit-start)) {

   169         limit-=suffixLength;

   170         remainingCapacity+=suffixLength;

   171     } else {

   172         limit=start;

   173         remainingCapacity=str.getCapacity();

   174     }

   175     lastCC=0;

   176     reorderStart=limit;

   177 }

   179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {

   180     int32_t reorderStartIndex=(int32_t)(reorderStart-start);

   181     int32_t length=(int32_t)(limit-start);

   182     str.releaseBuffer(length);

   183     int32_t newCapacity=length+appendLength;

   184     int32_t doubleCapacity=2*str.getCapacity();

   185     if(newCapacity<doubleCapacity) {

   186         newCapacity=doubleCapacity;

   187     }

   188     if(newCapacity<256) {

   189         newCapacity=256;

   190     }

   191     start=str.getBuffer(newCapacity);

   192     if(start==NULL) {

   193         // getBuffer() already did str.setToBogus()

   194         errorCode=U_MEMORY_ALLOCATION_ERROR;

   195         return FALSE;

   196     }

   197     reorderStart=start+reorderStartIndex;

   198     limit=start+length;

   199     remainingCapacity=str.getCapacity()-length;

   200     return TRUE;

   201 }

   203 void ReorderingBuffer::skipPrevious() {

   204     codePointLimit=codePointStart;

   205     UChar c=*--codePointStart;

   206     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {

   207         --codePointStart;

   208     }

   209 }

   211 uint8_t ReorderingBuffer::previousCC() {

   212     codePointLimit=codePointStart;

   213     if(reorderStart>=codePointStart) {

   214         return 0;

   215     }

   216     UChar32 c=*--codePointStart;

   217     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {

   218         return 0;

   219     }

   221     UChar c2;

   222     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {

   223         --codePointStart;

   224         c=U16_GET_SUPPLEMENTARY(c2, c);

   225     }

   226     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));

   227 }

   229 // Inserts c somewhere before the last character.

   230 // Requires 0<cc<lastCC which implies reorderStart<limit.

   231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {

   232     for(setIterator(), skipPrevious(); previousCC()>cc;) {}

   233     // insert c at codePointLimit, after the character with prevCC<=cc

   234     UChar *q=limit;

   235     UChar *r=limit+=U16_LENGTH(c);

   236     do {

   237         *--r=*--q;

   238     } while(codePointLimit!=q);

   239     writeCodePoint(q, c);

   240     if(cc<=1) {

   241         reorderStart=r;

   242     }

   243 }

   245 // Normalizer2Impl --------------------------------------------------------- ***

   247 struct CanonIterData : public UMemory {

   248     CanonIterData(UErrorCode &errorCode);

   249     ~CanonIterData();

   250     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);

   251     UTrie2 *trie;

   252     UVector canonStartSets;  // contains UnicodeSet *

   253 };

   255 Normalizer2Impl::~Normalizer2Impl() {

   256     udata_close(memory);

   257     utrie2_close(normTrie);

   258     delete fCanonIterData;

   259 }

   261 UBool U_CALLCONV

   262 Normalizer2Impl::isAcceptable(void *context,

   263                               const char * /* type */, const char * /*name*/,

   264                               const UDataInfo *pInfo) {

   265     if(

   266         pInfo->size>=20 &&

   267         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&

   268         pInfo->charsetFamily==U_CHARSET_FAMILY &&

   269         pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */

   270         pInfo->dataFormat[1]==0x72 &&

   271         pInfo->dataFormat[2]==0x6d &&

   272         pInfo->dataFormat[3]==0x32 &&

   273         pInfo->formatVersion[0]==2

   274     ) {

   275         Normalizer2Impl *me=(Normalizer2Impl *)context;

   276         uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);

   277         return TRUE;

   278     } else {

   279         return FALSE;

   280     }

   281 }

   283 void

   284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {

   285     if(U_FAILURE(errorCode)) {

   286         return;

   287     }

   288     memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);

   289     if(U_FAILURE(errorCode)) {

   290         return;

   291     }

   292     const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);

   293     const int32_t *inIndexes=(const int32_t *)inBytes;

   294     int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;

   295     if(indexesLength<=IX_MIN_MAYBE_YES) {

   296         errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.

   297         return;

   298     }

   300     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];

   301     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];

   303     minYesNo=inIndexes[IX_MIN_YES_NO];

   304     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];

   305     minNoNo=inIndexes[IX_MIN_NO_NO];

   306     limitNoNo=inIndexes[IX_LIMIT_NO_NO];

   307     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];

   309     int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];

   310     int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];

   311     normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,

   312                                        inBytes+offset, nextOffset-offset, NULL,

   313                                        &errorCode);

   314     if(U_FAILURE(errorCode)) {

   315         return;

   316     }

   318     offset=nextOffset;

   319     nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];

   320     maybeYesCompositions=(const uint16_t *)(inBytes+offset);

   321     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);

   323     // smallFCD: new in formatVersion 2

   324     offset=nextOffset;

   325     smallFCD=inBytes+offset;

   327     // Build tccc180[].

   328     // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.

   329     uint8_t bits=0;

   330     for(UChar c=0; c<0x180; bits>>=1) {

   331         if((c&0xff)==0) {

   332             bits=smallFCD[c>>8];  // one byte per 0x100 code points

   333         }

   334         if(bits&1) {

   335             for(int i=0; i<0x20; ++i, ++c) {

   336                 tccc180[c]=(uint8_t)getFCD16FromNormData(c);

   337             }

   338         } else {

   339             uprv_memset(tccc180+c, 0, 0x20);

   340             c+=0x20;

   341         }

   342     }

   343 }

   345 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {

   346     UChar32 c;

   347     if(cpStart==(cpLimit-1)) {

   348         c=*cpStart;

   349     } else {

   350         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);

   351     }

   352     uint16_t prevNorm16=getNorm16(c);

   353     if(prevNorm16<=minYesNo) {

   354         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0

   355     } else {

   356         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo

   357     }

   358 }

   360 U_CDECL_BEGIN

   362 static UBool U_CALLCONV

   363 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {

   364     /* add the start code point to the USet */

   365     const USetAdder *sa=(const USetAdder *)context;

   366     sa->add(sa->set, start);

   367     return TRUE;

   368 }

   370 static uint32_t U_CALLCONV

   371 segmentStarterMapper(const void * /*context*/, uint32_t value) {

   372     return value&CANON_NOT_SEGMENT_STARTER;

   373 }

   375 U_CDECL_END

   377 void

   378 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {

   379     /* add the start code point of each same-value range of each trie */

   380     utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);

   382     /* add Hangul LV syllables and LV+1 because of skippables */

   383     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {

   384         sa->add(sa->set, c);

   385         sa->add(sa->set, c+1);

   386     }

   387     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */

   388 }

   390 void

   391 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {

   392     /* add the start code point of each same-value range of the canonical iterator data trie */

   393     if(ensureCanonIterData(errorCode)) {

   394         // currently only used for the SEGMENT_STARTER property

   395         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);

   396     }

   397 }

   399 const UChar *

   400 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,

   401                                                 UChar32 minNeedDataCP,

   402                                                 ReorderingBuffer *buffer,

   403                                                 UErrorCode &errorCode) const {

   404     // Make some effort to support NUL-terminated strings reasonably.

   405     // Take the part of the fast quick check loop that does not look up

   406     // data and check the first part of the string.

   407     // After this prefix, determine the string length to simplify the rest

   408     // of the code.

   409     const UChar *prevSrc=src;

   410     UChar c;

   411     while((c=*src++)<minNeedDataCP && c!=0) {}

   412     // Back out the last character for full processing.

   413     // Copy this prefix.

   414     if(--src!=prevSrc) {

   415         if(buffer!=NULL) {

   416             buffer->appendZeroCC(prevSrc, src, errorCode);

   417         }

   418     }

   419     return src;

   420 }

   422 // Dual functionality:

   423 // buffer!=NULL: normalize

   424 // buffer==NULL: isNormalized/spanQuickCheckYes

   425 const UChar *

   426 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,

   427                            ReorderingBuffer *buffer,

   428                            UErrorCode &errorCode) const {

   429     UChar32 minNoCP=minDecompNoCP;

   430     if(limit==NULL) {

   431         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);

   432         if(U_FAILURE(errorCode)) {

   433             return src;

   434         }

   435         limit=u_strchr(src, 0);

   436     }

   438     const UChar *prevSrc;

   439     UChar32 c=0;

   440     uint16_t norm16=0;

   442     // only for quick check

   443     const UChar *prevBoundary=src;

   444     uint8_t prevCC=0;

   446     for(;;) {

   447         // count code units below the minimum or with irrelevant data for the quick check

   448         for(prevSrc=src; src!=limit;) {

   449             if( (c=*src)<minNoCP ||

   450                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))

   451             ) {

   452                 ++src;

   453             } else if(!U16_IS_SURROGATE(c)) {

   454                 break;

   455             } else {

   456                 UChar c2;

   457                 if(U16_IS_SURROGATE_LEAD(c)) {

   458                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

   459                         c=U16_GET_SUPPLEMENTARY(c, c2);

   460                     }

   461                 } else /* trail surrogate */ {

   462                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

   463                         --src;

   464                         c=U16_GET_SUPPLEMENTARY(c2, c);

   465                     }

   466                 }

   467                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {

   468                     src+=U16_LENGTH(c);

   469                 } else {

   470                     break;

   471                 }

   472             }

   473         }

   474         // copy these code units all at once

   475         if(src!=prevSrc) {

   476             if(buffer!=NULL) {

   477                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {

   478                     break;

   479                 }

   480             } else {

   481                 prevCC=0;

   482                 prevBoundary=src;

   483             }

   484         }

   485         if(src==limit) {

   486             break;

   487         }

   489         // Check one above-minimum, relevant code point.

   490         src+=U16_LENGTH(c);

   491         if(buffer!=NULL) {

   492             if(!decompose(c, norm16, *buffer, errorCode)) {

   493                 break;

   494             }

   495         } else {

   496             if(isDecompYes(norm16)) {

   497                 uint8_t cc=getCCFromYesOrMaybe(norm16);

   498                 if(prevCC<=cc || cc==0) {

   499                     prevCC=cc;

   500                     if(cc<=1) {

   501                         prevBoundary=src;

   502                     }

   503                     continue;

   504                 }

   505             }

   506             return prevBoundary;  // "no" or cc out of order

   507         }

   508     }

   509     return src;

   510 }

   512 // Decompose a short piece of text which is likely to contain characters that

   513 // fail the quick check loop and/or where the quick check loop's overhead

   514 // is unlikely to be amortized.

   515 // Called by the compose() and makeFCD() implementations.

   516 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,

   517                                       ReorderingBuffer &buffer,

   518                                       UErrorCode &errorCode) const {

   519     while(src<limit) {

   520         UChar32 c;

   521         uint16_t norm16;

   522         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);

   523         if(!decompose(c, norm16, buffer, errorCode)) {

   524             return FALSE;

   525         }

   526     }

   527     return TRUE;

   528 }

   530 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,

   531                                  ReorderingBuffer &buffer,

   532                                  UErrorCode &errorCode) const {

   533     // Only loops for 1:1 algorithmic mappings.

   534     for(;;) {

   535         // get the decomposition and the lead and trail cc's

   536         if(isDecompYes(norm16)) {

   537             // c does not decompose

   538             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);

   539         } else if(isHangul(norm16)) {

   540             // Hangul syllable: decompose algorithmically

   541             UChar jamos[3];

   542             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);

   543         } else if(isDecompNoAlgorithmic(norm16)) {

   544             c=mapAlgorithmic(c, norm16);

   545             norm16=getNorm16(c);

   546         } else {

   547             // c decomposes, get everything from the variable-length extra data

   548             const uint16_t *mapping=getMapping(norm16);

   549             uint16_t firstUnit=*mapping;

   550             int32_t length=firstUnit&MAPPING_LENGTH_MASK;

   551             uint8_t leadCC, trailCC;

   552             trailCC=(uint8_t)(firstUnit>>8);

   553             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {

   554                 leadCC=(uint8_t)(*(mapping-1)>>8);

   555             } else {

   556                 leadCC=0;

   557             }

   558             return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);

   559         }

   560     }

   561 }

   563 const UChar *

   564 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {

   565     const UChar *decomp=NULL;

   566     uint16_t norm16;

   567     for(;;) {

   568         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {

   569             // c does not decompose

   570             return decomp;

   571         } else if(isHangul(norm16)) {

   572             // Hangul syllable: decompose algorithmically

   573             length=Hangul::decompose(c, buffer);

   574             return buffer;

   575         } else if(isDecompNoAlgorithmic(norm16)) {

   576             c=mapAlgorithmic(c, norm16);

   577             decomp=buffer;

   578             length=0;

   579             U16_APPEND_UNSAFE(buffer, length, c);

   580         } else {

   581             // c decomposes, get everything from the variable-length extra data

   582             const uint16_t *mapping=getMapping(norm16);

   583             length=*mapping&MAPPING_LENGTH_MASK;

   584             return (const UChar *)mapping+1;

   585         }

   586     }

   587 }

   589 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1

   590 // so that a raw mapping fits that consists of one unit ("rm0")

   591 // plus all but the first two code units of the normal mapping.

   592 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.

   593 const UChar *

   594 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {

   595     // We do not loop in this method because an algorithmic mapping itself

   596     // becomes a final result rather than having to be decomposed recursively.

   597     uint16_t norm16;

   598     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {

   599         // c does not decompose

   600         return NULL;

   601     } else if(isHangul(norm16)) {

   602         // Hangul syllable: decompose algorithmically

   603         Hangul::getRawDecomposition(c, buffer);

   604         length=2;

   605         return buffer;

   606     } else if(isDecompNoAlgorithmic(norm16)) {

   607         c=mapAlgorithmic(c, norm16);

   608         length=0;

   609         U16_APPEND_UNSAFE(buffer, length, c);

   610         return buffer;

   611     } else {

   612         // c decomposes, get everything from the variable-length extra data

   613         const uint16_t *mapping=getMapping(norm16);

   614         uint16_t firstUnit=*mapping;

   615         int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping

   616         if(firstUnit&MAPPING_HAS_RAW_MAPPING) {

   617             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.

   618             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD

   619             const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;

   620             uint16_t rm0=*rawMapping;

   621             if(rm0<=MAPPING_LENGTH_MASK) {

   622                 length=rm0;

   623                 return (const UChar *)rawMapping-rm0;

   624             } else {

   625                 // Copy the normal mapping and replace its first two code units with rm0.

   626                 buffer[0]=(UChar)rm0;

   627                 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);

   628                 length=mLength-1;

   629                 return buffer;

   630             }

   631         } else {

   632             length=mLength;

   633             return (const UChar *)mapping+1;

   634         }

   635     }

   636 }

   638 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,

   639                                          UBool doDecompose,

   640                                          UnicodeString &safeMiddle,

   641                                          ReorderingBuffer &buffer,

   642                                          UErrorCode &errorCode) const {

   643     buffer.copyReorderableSuffixTo(safeMiddle);

   644     if(doDecompose) {

   645         decompose(src, limit, &buffer, errorCode);

   646         return;

   647     }

   648     // Just merge the strings at the boundary.

   649     ForwardUTrie2StringIterator iter(normTrie, src, limit);

   650     uint8_t firstCC, prevCC, cc;

   651     firstCC=prevCC=cc=getCC(iter.next16());

   652     while(cc!=0) {

   653         prevCC=cc;

   654         cc=getCC(iter.next16());

   655     };

   656     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL

   657         limit=u_strchr(iter.codePointStart, 0);

   658     }

   660     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {

   661         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);

   662     }

   663 }

   665 // Note: hasDecompBoundary() could be implemented as aliases to

   666 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()

   667 // at the cost of building the FCD trie for a decomposition normalizer.

   668 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {

   669     for(;;) {

   670         if(c<minDecompNoCP) {

   671             return TRUE;

   672         }

   673         uint16_t norm16=getNorm16(c);

   674         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {

   675             return TRUE;

   676         } else if(norm16>MIN_NORMAL_MAYBE_YES) {

   677             return FALSE;  // ccc!=0

   678         } else if(isDecompNoAlgorithmic(norm16)) {

   679             c=mapAlgorithmic(c, norm16);

   680         } else {

   681             // c decomposes, get everything from the variable-length extra data

   682             const uint16_t *mapping=getMapping(norm16);

   683             uint16_t firstUnit=*mapping;

   684             if((firstUnit&MAPPING_LENGTH_MASK)==0) {

   685                 return FALSE;

   686             }

   687             if(!before) {

   688                 // decomp after-boundary: same as hasFCDBoundaryAfter(),

   689                 // fcd16<=1 || trailCC==0

   690                 if(firstUnit>0x1ff) {

   691                     return FALSE;  // trailCC>1

   692                 }

   693                 if(firstUnit<=0xff) {

   694                     return TRUE;  // trailCC==0

   695                 }

   696                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary

   697             }

   698             // TRUE if leadCC==0 (hasFCDBoundaryBefore())

   699             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;

   700         }

   701     }

   702 }

   704 /*

   705  * Finds the recomposition result for

   706  * a forward-combining "lead" character,

   707  * specified with a pointer to its compositions list,

   708  * and a backward-combining "trail" character.

   709  *

   710  * If the lead and trail characters combine, then this function returns

   711  * the following "compositeAndFwd" value:

   712  * Bits 21..1  composite character

   713  * Bit      0  set if the composite is a forward-combining starter

   714  * otherwise it returns -1.

   715  *

   716  * The compositions list has (trail, compositeAndFwd) pair entries,

   717  * encoded as either pairs or triples of 16-bit units.

   718  * The last entry has the high bit of its first unit set.

   719  *

   720  * The list is sorted by ascending trail characters (there are no duplicates).

   721  * A linear search is used.

   722  *

   723  * See normalizer2impl.h for a more detailed description

   724  * of the compositions list format.

   725  */

   726 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {

   727     uint16_t key1, firstUnit;

   728     if(trail<COMP_1_TRAIL_LIMIT) {

   729         // trail character is 0..33FF

   730         // result entry may have 2 or 3 units

   731         key1=(uint16_t)(trail<<1);

   732         while(key1>(firstUnit=*list)) {

   733             list+=2+(firstUnit&COMP_1_TRIPLE);

   734         }

   735         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {

   736             if(firstUnit&COMP_1_TRIPLE) {

   737                 return ((int32_t)list[1]<<16)|list[2];

   738             } else {

   739                 return list[1];

   740             }

   741         }

   742     } else {

   743         // trail character is 3400..10FFFF

   744         // result entry has 3 units

   745         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+

   746                         (((trail>>COMP_1_TRAIL_SHIFT))&

   747                           ~COMP_1_TRIPLE));

   748         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);

   749         uint16_t secondUnit;

   750         for(;;) {

   751             if(key1>(firstUnit=*list)) {

   752                 list+=2+(firstUnit&COMP_1_TRIPLE);

   753             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {

   754                 if(key2>(secondUnit=list[1])) {

   755                     if(firstUnit&COMP_1_LAST_TUPLE) {

   756                         break;

   757                     } else {

   758                         list+=3;

   759                     }

   760                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {

   761                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];

   762                 } else {

   763                     break;

   764                 }

   765             } else {

   766                 break;

   767             }

   768         }

   769     }

   770     return -1;

   771 }

   773 /**

   774   * @param list some character's compositions list

   775   * @param set recursively receives the composites from these compositions

   776   */

   777 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {

   778     uint16_t firstUnit;

   779     int32_t compositeAndFwd;

   780     do {

   781         firstUnit=*list;

   782         if((firstUnit&COMP_1_TRIPLE)==0) {

   783             compositeAndFwd=list[1];

   784             list+=2;

   785         } else {

   786             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];

   787             list+=3;

   788         }

   789         UChar32 composite=compositeAndFwd>>1;

   790         if((compositeAndFwd&1)!=0) {

   791             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);

   792         }

   793         set.add(composite);

   794     } while((firstUnit&COMP_1_LAST_TUPLE)==0);

   795 }

   797 /*

   798  * Recomposes the buffer text starting at recomposeStartIndex

   799  * (which is in NFD - decomposed and canonically ordered),

   800  * and truncates the buffer contents.

   801  *

   802  * Note that recomposition never lengthens the text:

   803  * Any character consists of either one or two code units;

   804  * a composition may contain at most one more code unit than the original starter,

   805  * while the combining mark that is removed has at least one code unit.

        *
        * The text is rewritten in place between buffer.getStart()+recomposeStartIndex
        * and buffer.getLimit(); the new end of the text is handed back to the buffer
        * through setReorderingLimit().
        *
        * buffer:              holds the text to recompose
        * recomposeStartIndex: code unit offset from buffer.getStart() where recomposition begins
        * onlyContiguous:      if TRUE, a character with cc!=0 that does not combine
        *                      prevents later characters from combining with the current starter
   806  */

   807 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,

   808                                 UBool onlyContiguous) const {

           // p is the read position; limit moves down whenever a combining mark is removed.
   809     UChar *p=buffer.getStart()+recomposeStartIndex;

   810     UChar *limit=buffer.getLimit();

   811     if(p==limit) {

               // Nothing to recompose; the buffer's limit is left untouched.
   812         return;

   813     }

           // starter: first code unit of the latest starter that has a compositions list
           // pRemove: first code unit of the combining mark that is being removed
           // q, r:    write and read positions while shifting text inside the buffer
   815     UChar *starter, *pRemove, *q, *r;

   816     const uint16_t *compositionsList;

   817     UChar32 c, compositeAndFwd;

   818     uint16_t norm16;

   819     uint8_t cc, prevCC;

   820     UBool starterIsSupplementary;

   822     // Some of the following variables are not used until we have a forward-combining starter

   823     // and are only initialized now to avoid compiler warnings.

   824     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter

   825     starter=NULL;

   826     starterIsSupplementary=FALSE;

   827     prevCC=0;

   829     for(;;) {

               // Read the next code point into c, advance p past it, and look up its norm16.
   830         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);

   831         cc=getCCFromYesOrMaybe(norm16);

   832         if( // this character combines backward and

   833             isMaybe(norm16) &&

   834             // we have seen a starter that combines forward and

   835             compositionsList!=NULL &&

   836             // the backward-combining character is not blocked

   837             (prevCC<cc || prevCC==0)

   838         ) {

   839             if(isJamoVT(norm16)) {

   840                 // c is a Jamo V/T, see if we can compose it with the previous character.

   841                 if(c<Hangul::JAMO_T_BASE) {

   842                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.

                           // The subtraction is cast back to UChar so that one comparison covers
                           // the whole Jamo L range (this relies on UChar being unsigned).
   843                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);

   844                     if(prev<Hangul::JAMO_L_COUNT) {

                               // The Jamo V is the single code unit just before p.
   845                         pRemove=p-1;

   846                         UChar syllable=(UChar)

   847                             (Hangul::HANGUL_BASE+

   848                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*

   849                              Hangul::JAMO_T_COUNT);

   850                         UChar t;

   851                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {

   852                             ++p;

   853                             syllable+=t;  // The next character was a Jamo T.

   854                         }

                               // The syllable takes the place of the Jamo L.
   855                         *starter=syllable;

   856                         // remove the Jamo V/T

                               // by moving the text that follows down over [pRemove..p[
   857                         q=pRemove;

   858                         r=p;

   859                         while(r<limit) {

   860                             *q++=*r++;

   861                         }

   862                         limit=q;

                               // Continue reading where the removed Jamo used to start.
   863                         p=pRemove;

   864                     }

   865                 }

   866                 /*

   867                  * No "else" for Jamo T:

   868                  * Since the input is in NFD, there are no Hangul LV syllables that

   869                  * a Jamo T could combine with.

   870                  * All Jamo Ts are combined above when handling Jamo Vs.

   871                  */

   872                 if(p==limit) {

   873                     break;

   874                 }

                       // Whether or not a syllable was formed, the current starter is abandoned;
                       // nothing combines until a new forward-combining starter is found.
   875                 compositionsList=NULL;

   876                 continue;

   877             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {

   878                 // The starter and the combining mark (c) do combine.

                       // Bit 0 of compositeAndFwd is the "combines forward" flag (tested below);
                       // the remaining bits are the composite code point.
   879                 UChar32 composite=compositeAndFwd>>1;

   881                 // Replace the starter with the composite, remove the combining mark.

   882                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark

   883                 if(starterIsSupplementary) {

   884                     if(U_IS_SUPPLEMENTARY(composite)) {

   885                         // both are supplementary

   886                         starter[0]=U16_LEAD(composite);

   887                         starter[1]=U16_TRAIL(composite);

   888                     } else {

   889                         *starter=(UChar)composite;

   890                         // The composite is shorter than the starter,

   891                         // move the intermediate characters forward one.

   892                         starterIsSupplementary=FALSE;

   893                         q=starter+1;

   894                         r=q+1;

   895                         while(r<pRemove) {

   896                             *q++=*r++;

   897                         }

                               // The gap left by the shorter starter is now directly before the
                               // combining mark, so one more code unit has to be removed.
   898                         --pRemove;

   899                     }

   900                 } else if(U_IS_SUPPLEMENTARY(composite)) {

   901                     // The composite is longer than the starter,

   902                     // move the intermediate characters back one.

   903                     starterIsSupplementary=TRUE;

   904                     ++starter;  // temporarily increment for the loop boundary

                           // The extra code unit is taken from the start of the combining mark,
                           // so the span that remains to be removed becomes one unit shorter.
   905                     q=pRemove;

   906                     r=++pRemove;

   907                     while(starter<q) {

   908                         *--r=*--q;

   909                     }

   910                     *starter=U16_TRAIL(composite);

   911                     *--starter=U16_LEAD(composite);  // undo the temporary increment

   912                 } else {

   913                     // both are on the BMP

   914                     *starter=(UChar)composite;

   915                 }

   917                 /* remove the combining mark by moving the following text over it */

                       // pRemove==p happens when a BMP starter and a BMP mark turned into a
                       // supplementary composite: the text length did not change.
   918                 if(pRemove<p) {

   919                     q=pRemove;

   920                     r=p;

   921                     while(r<limit) {

   922                         *q++=*r++;

   923                     }

   924                     limit=q;

   925                     p=pRemove;

   926                 }

   927                 // Keep prevCC because we removed the combining mark.

   929                 if(p==limit) {

   930                     break;

   931                 }

   932                 // Is the composite a starter that combines forward?

   933                 if(compositeAndFwd&1) {

   934                     compositionsList=

   935                         getCompositionsListForComposite(getNorm16(composite));

   936                 } else {

   937                     compositionsList=NULL;

   938                 }

   940                 // We combined; continue with looking for compositions.

   941                 continue;

   942             }

   943         }

   945         // no combination this time

   946         prevCC=cc;

   947         if(p==limit) {

   948             break;

   949         }

   951         // If c did not combine, then check if it is a starter.

   952         if(cc==0) {

   953             // Found a new starter.

   954             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {

   955                 // It may combine with something, prepare for it.

                       // p is already past c, so c starts one or two code units before p.
   956                 if(U_IS_BMP(c)) {

   957                     starterIsSupplementary=FALSE;

   958                     starter=p-1;

   959                 } else {

   960                     starterIsSupplementary=TRUE;

   961                     starter=p-2;

   962                 }

   963             }

   964         } else if(onlyContiguous) {

   965             // FCC: no discontiguous compositions; any intervening character blocks.

   966             compositionsList=NULL;

   967         }

   968     }

           // Tell the buffer where the (possibly shortened) text now ends.
   969     buffer.setReorderingLimit(limit);

   970 }

   972 UChar32

   973 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {

   974     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0

   975     const uint16_t *list;

   976     if(isInert(norm16)) {

   977         return U_SENTINEL;

   978     } else if(norm16<minYesNoMappingsOnly) {

   979         if(isJamoL(norm16)) {

   980             b-=Hangul::JAMO_V_BASE;

   981             if(0<=b && b<Hangul::JAMO_V_COUNT) {

   982                 return

   983                     (Hangul::HANGUL_BASE+

   984                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*

   985                      Hangul::JAMO_T_COUNT);

   986             } else {

   987                 return U_SENTINEL;

   988             }

   989         } else if(isHangul(norm16)) {

   990             b-=Hangul::JAMO_T_BASE;

   991             if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!

   992                 return a+b;

   993             } else {

   994                 return U_SENTINEL;

   995             }

   996         } else {

   997             // 'a' has a compositions list in extraData

   998             list=extraData+norm16;

   999             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list

  1000                 list+=  // mapping pointer

  1001                     1+  // +1 to skip the first unit with the mapping lenth

  1002                     (*list&MAPPING_LENGTH_MASK);  // + mapping length

  1003             }

  1004         }

  1005     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {

  1006         return U_SENTINEL;

  1007     } else {

  1008         list=maybeYesCompositions+norm16-minMaybeYes;

  1009     }

  1010     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b

  1011         return U_SENTINEL;

  1012     }

  1013 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC

  1014     return combine(list, b)>>1;

  1015 #else

  1016     int32_t compositeAndFwd=combine(list, b);

  1017     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;

  1018 #endif

  1019 }

  1021 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.

  1022 // doCompose: normalize

  1023 // !doCompose: isNormalized (buffer must be empty and initialized)

       //
       // src, limit:     the source text; limit==NULL means that src is NUL-terminated
       // onlyContiguous: selects the "FCC" handling in the conditions marked as such below
       // buffer:         receives the composed text when doCompose; when !doCompose it is only
       //                 scratch space for re-normalizing one segment at a time and is emptied
       //                 again after each comparison
       // Returns FALSE when copying the NUL-terminated prefix fails, or when !doCompose and the
       // text turns out not to be normalized. Returns TRUE otherwise - also when a failed buffer
       // append or decomposeShort() ends the main loop early, so a caller that composes still
       // has to check errorCode (as composeAndAppend() does).
  1024 UBool

  1025 Normalizer2Impl::compose(const UChar *src, const UChar *limit,

  1026                          UBool onlyContiguous,

  1027                          UBool doCompose,

  1028                          ReorderingBuffer &buffer,

  1029                          UErrorCode &errorCode) const {

  1030     /*

  1031      * prevBoundary points to the last character before the current one

  1032      * that has a composition boundary before it with ccc==0 and quick check "yes".

  1033      * Keeping track of prevBoundary saves us looking for a composition boundary

  1034      * when we find a "no" or "maybe".

  1035      *

  1036      * When we back out from prevSrc back to prevBoundary,

  1037      * then we also remove those same characters (which had been simply copied

  1038      * or canonically-order-inserted) from the ReorderingBuffer.

  1039      * Therefore, at all times, the [prevBoundary..prevSrc[ source units

  1040      * must correspond 1:1 to destination units at the end of the destination buffer.

  1041      */

  1042     const UChar *prevBoundary=src;

  1043     UChar32 minNoMaybeCP=minCompNoMaybeCP;

  1044     if(limit==NULL) {

               // NUL-terminated input: skip (and, when composing, copy) the leading code units
               // below minNoMaybeCP, then find the real limit.
  1045         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,

  1046                                            doCompose ? &buffer : NULL,

  1047                                            errorCode);

  1048         if(U_FAILURE(errorCode)) {

  1049             return FALSE;

  1050         }

  1051         if(prevBoundary<src) {

  1052             // Set prevBoundary to the last character in the prefix.

  1053             prevBoundary=src-1;

  1054         }

  1055         limit=u_strchr(src, 0);

  1056     }

           // prevSrc: start of the run of quick-check-yes units, later the start of c
  1058     const UChar *prevSrc;

  1059     UChar32 c=0;

  1060     uint16_t norm16=0;

  1062     // only for isNormalized

  1063     uint8_t prevCC=0;

  1065     for(;;) {

  1066         // count code units below the minimum or with irrelevant data for the quick check

  1067         for(prevSrc=src; src!=limit;) {

  1068             if( (c=*src)<minNoMaybeCP ||

  1069                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))

  1070             ) {

  1071                 ++src;

  1072             } else if(!U16_IS_SURROGATE(c)) {

  1073                 break;

  1074             } else {

                       // Surrogate code unit: assemble the supplementary code point if the
                       // unit is paired; an unpaired surrogate is looked up by itself.
  1075                 UChar c2;

  1076                 if(U16_IS_SURROGATE_LEAD(c)) {

  1077                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

  1078                         c=U16_GET_SUPPLEMENTARY(c, c2);

  1079                     }

  1080                 } else /* trail surrogate */ {

                           // Back up to the lead surrogate, but only within the current run.
  1081                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

  1082                         --src;

  1083                         c=U16_GET_SUPPLEMENTARY(c2, c);

  1084                     }

  1085                 }

  1086                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {

  1087                     src+=U16_LENGTH(c);

  1088                 } else {

  1089                     break;

  1090                 }

  1091             }

  1092         }

  1093         // copy these code units all at once

  1094         if(src!=prevSrc) {

  1095             if(doCompose) {

  1096                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {

  1097                     break;

  1098                 }

  1099             } else {

  1100                 prevCC=0;

  1101             }

  1102             if(src==limit) {

  1103                 break;

  1104             }

  1105             // Set prevBoundary to the last character in the quick check loop.

  1106             prevBoundary=src-1;

                   // If that last character is a surrogate pair, point at its lead surrogate.
  1107             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&

  1108                 U16_IS_LEAD(*(prevBoundary-1))

  1109             ) {

  1110                 --prevBoundary;

  1111             }

  1112             // The start of the current character (c).

  1113             prevSrc=src;

  1114         } else if(src==limit) {

  1115             break;

  1116         }

               // Step over c; prevSrc keeps pointing at its first code unit.
  1118         src+=U16_LENGTH(c);

  1119         /*

  1120          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.

  1121          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)

  1122          * or has ccc!=0.

  1123          * Check for Jamo V/T, then for regular characters.

  1124          * c is not a Hangul syllable or Jamo L because those have "yes" properties.

  1125          */

               // prevBoundary!=prevSrc: the unit at prevSrc-1, which is examined next, belongs to
               // the text after prevBoundary.
  1126         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {

  1127             UChar prev=*(prevSrc-1);

  1128             UBool needToDecompose=FALSE;

  1129             if(c<Hangul::JAMO_T_BASE) {

  1130                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.

  1131                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);

  1132                 if(prev<Hangul::JAMO_L_COUNT) {

                           // L+V composes, so the input is not in composed form.
  1133                     if(!doCompose) {

  1134                         return FALSE;

  1135                     }

  1136                     UChar syllable=(UChar)

  1137                         (Hangul::HANGUL_BASE+

  1138                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*

  1139                          Hangul::JAMO_T_COUNT);

  1140                     UChar t;

  1141                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {

  1142                         ++src;

  1143                         syllable+=t;  // The next character was a Jamo T.

  1144                         prevBoundary=src;

                               // Presumably this overwrites the Jamo L that is the last unit
                               // in the buffer - confirm against ReorderingBuffer::setLastChar().
  1145                         buffer.setLastChar(syllable);

  1146                         continue;

  1147                     }

  1148                     // If we see L+V+x where x!=T then we drop to the slow path,

  1149                     // decompose and recompose.

  1150                     // This is to deal with NFKC finding normal L and V but a

  1151                     // compatibility variant of a T. We need to either fully compose that

  1152                     // combination here (which would complicate the code and may not work

  1153                     // with strange custom data) or use the slow path -- or else our replacing

  1154                     // two input characters (L+V) with one output character (LV syllable)

  1155                     // would violate the invariant that [prevBoundary..prevSrc[ has the same

  1156                     // length as what we appended to the buffer since prevBoundary.

  1157                     needToDecompose=TRUE;

  1158                 }

  1159             } else if(Hangul::isHangulWithoutJamoT(prev)) {

  1160                 // c is a Jamo Trailing consonant,

  1161                 // compose with previous Hangul LV that does not contain a Jamo T.

  1162                 if(!doCompose) {

  1163                     return FALSE;

  1164                 }

  1165                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));

  1166                 prevBoundary=src;

  1167                 continue;

  1168             }

  1169             if(!needToDecompose) {

  1170                 // The Jamo V/T did not compose into a Hangul syllable.

  1171                 if(doCompose) {

  1172                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {

  1173                         break;

  1174                     }

  1175                 } else {

  1176                     prevCC=0;

  1177                 }

  1178                 continue;

  1179             }

  1180         }

  1181         /*

  1182          * Source buffer pointers:

  1183          *

  1184          *  all done      quick check   current char  not yet

  1185          *                "yes" but     (c)           processed

  1186          *                may combine

  1187          *                forward

  1188          * [-------------[-------------[-------------[-------------[

  1189          * |             |             |             |             |

  1190          * orig. src     prevBoundary  prevSrc       src           limit

  1191          *

  1192          *

  1193          * Destination buffer pointers inside the ReorderingBuffer:

  1194          *

  1195          *  all done      might take    not filled yet

  1196          *                characters for

  1197          *                reordering

  1198          * [-------------[-------------[-------------[

  1199          * |             |             |             |

  1200          * start         reorderStart  limit         |

  1201          *                             +remainingCap.+

  1202          */

  1203         if(norm16>=MIN_YES_YES_WITH_CC) {

                   // For these values the low byte of norm16 is the combining class.
  1204             uint8_t cc=(uint8_t)norm16;  // cc!=0

  1205             if( onlyContiguous &&  // FCC

  1206                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&

  1207                 prevBoundary<prevSrc &&

  1208                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that

  1209                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)

  1210                 // passed the quick check "yes && ccc==0" test.

  1211                 // Check whether the last character was a "yesYes" or a "yesNo".

  1212                 // If a "yesNo", then we get its trailing ccc from its

  1213                 // mapping and check for canonical order.

  1214                 // All other cases are ok.

  1215                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc

  1216             ) {

  1217                 // Fails FCD test, need to decompose and contiguously recompose.

  1218                 if(!doCompose) {

  1219                     return FALSE;

  1220                 }

  1221             } else if(doCompose) {

                       // The buffer inserts c according to its combining class.
  1222                 if(!buffer.append(c, cc, errorCode)) {

  1223                     break;

  1224                 }

  1225                 continue;

  1226             } else if(prevCC<=cc) {

                       // isNormalized: the combining classes are still in non-decreasing order.
  1227                 prevCC=cc;

  1228                 continue;

  1229             } else {

                       // isNormalized: out of canonical order.
  1230                 return FALSE;

  1231             }

  1232         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {

                   // c is neither "yes" with ccc==0 nor "maybe"/ccc!=0, so it is a "noNo" character.
  1233             return FALSE;

  1234         }

  1236         /*

  1237          * Find appropriate boundaries around this character,

  1238          * decompose the source text from between the boundaries,

  1239          * and recompose it.

  1240          *

  1241          * We may need to remove the last few characters from the ReorderingBuffer

  1242          * to account for source text that was copied or appended

  1243          * but needs to take part in the recomposition.

  1244          */

  1246         /*

  1247          * Find the last composition boundary in [prevBoundary..src[.

  1248          * It is either the decomposition of the current character (at prevSrc),

  1249          * or prevBoundary.

  1250          */

  1251         if(hasCompBoundaryBefore(c, norm16)) {

  1252             prevBoundary=prevSrc;

  1253         } else if(doCompose) {

                   // Take back what was already written for [prevBoundary..prevSrc[;
                   // it is decomposed again below together with c.
  1254             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));

  1255         }

  1257         // Find the next composition boundary in [src..limit[ -

  1258         // modifies src to point to the next starter.

               // NOTE(review): the (UChar *) cast drops const only for the assignment to the
               // const UChar *src to add it back; it looks unnecessary.
  1259         src=(UChar *)findNextCompBoundary(src, limit);

  1261         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.

  1262         int32_t recomposeStartIndex=buffer.length();

  1263         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {

  1264             break;

  1265         }

  1266         recompose(buffer, recomposeStartIndex, onlyContiguous);

  1267         if(!doCompose) {

                   // isNormalized: the segment is normalized only if recomposing it reproduced
                   // the source text exactly. The scratch buffer is then emptied for the next segment.
  1268             if(!buffer.equals(prevBoundary, src)) {

  1269                 return FALSE;

  1270             }

  1271             buffer.remove();

  1272             prevCC=0;

  1273         }

  1275         // Move to the next starter. We never need to look back before this point again.

  1276         prevBoundary=src;

  1277     }

           // Reached at the end of the text and also after a failed append/decompose (see the
           // function comment): errorCode, not the return value, reports such a failure.
  1278     return TRUE;

  1279 }

  1281 // Very similar to compose(): Make the same changes in both places if relevant.

  1282 // pQCResult==NULL: spanQuickCheckYes

  1283 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)

       //
       // src, limit:     the text to check; limit==NULL means that src is NUL-terminated
       // onlyContiguous: selects the "FCC" test below
       // Returns limit when the end of the text is reached. Otherwise returns prevBoundary:
       // - at the first "no" character or ordering failure (*pQCResult, if given, is set to UNORM_NO)
       // - when pQCResult==NULL, also at the first "maybe" character.
       // When pQCResult!=NULL, a "maybe" character sets *pQCResult=UNORM_MAYBE and checking continues.
  1284 const UChar *

  1285 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,

  1286                                    UBool onlyContiguous,

  1287                                    UNormalizationCheckResult *pQCResult) const {

  1288     /*

  1289      * prevBoundary points to the last character before the current one

  1290      * that has a composition boundary before it with ccc==0 and quick check "yes".

  1291      */

  1292     const UChar *prevBoundary=src;

  1293     UChar32 minNoMaybeCP=minCompNoMaybeCP;

  1294     if(limit==NULL) {

               // No buffer is passed, so the prefix is only skipped; the local errorCode
               // is never inspected afterwards.
  1295         UErrorCode errorCode=U_ZERO_ERROR;

  1296         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);

  1297         if(prevBoundary<src) {

  1298             // Set prevBoundary to the last character in the prefix.

  1299             prevBoundary=src-1;

  1300         }

  1301         limit=u_strchr(src, 0);

  1302     }

  1304     const UChar *prevSrc;

  1305     UChar32 c=0;

  1306     uint16_t norm16=0;

           // combining class of the previous character that passed the checks below
  1307     uint8_t prevCC=0;

  1309     for(;;) {

  1310         // count code units below the minimum or with irrelevant data for the quick check

  1311         for(prevSrc=src;;) {

  1312             if(src==limit) {

                       // End of the text: everything passed (a "maybe" may have been recorded).
  1313                 return src;

  1314             }

  1315             if( (c=*src)<minNoMaybeCP ||

  1316                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))

  1317             ) {

  1318                 ++src;

  1319             } else if(!U16_IS_SURROGATE(c)) {

  1320                 break;

  1321             } else {

                       // Surrogate code unit: assemble the supplementary code point if the
                       // unit is paired; an unpaired surrogate is looked up by itself.
  1322                 UChar c2;

  1323                 if(U16_IS_SURROGATE_LEAD(c)) {

  1324                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

  1325                         c=U16_GET_SUPPLEMENTARY(c, c2);

  1326                     }

  1327                 } else /* trail surrogate */ {

                           // Back up to the lead surrogate, but only within the current run.
  1328                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

  1329                         --src;

  1330                         c=U16_GET_SUPPLEMENTARY(c2, c);

  1331                     }

  1332                 }

  1333                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {

  1334                     src+=U16_LENGTH(c);

  1335                 } else {

  1336                     break;

  1337                 }

  1338             }

  1339         }

  1340         if(src!=prevSrc) {

  1341             // Set prevBoundary to the last character in the quick check loop.

  1342             prevBoundary=src-1;

                   // If that last character is a surrogate pair, point at its lead surrogate.
  1343             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&

  1344                 U16_IS_LEAD(*(prevBoundary-1))

  1345             ) {

  1346                 --prevBoundary;

  1347             }

  1348             prevCC=0;

  1349             // The start of the current character (c).

  1350             prevSrc=src;

  1351         }

               // Step over c; prevSrc keeps pointing at its first code unit.
  1353         src+=U16_LENGTH(c);

  1354         /*

  1355          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.

  1356          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)

  1357          * or has ccc!=0.

  1358          */

  1359         if(isMaybeOrNonZeroCC(norm16)) {

  1360             uint8_t cc=getCCFromYesOrMaybe(norm16);

  1361             if( onlyContiguous &&  // FCC

  1362                 cc!=0 &&

  1363                 prevCC==0 &&

  1364                 prevBoundary<prevSrc &&

  1365                 // prevCC==0 && prevBoundary<prevSrc tell us that

  1366                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)

  1367                 // passed the quick check "yes && ccc==0" test.

  1368                 // Check whether the last character was a "yesYes" or a "yesNo".

  1369                 // If a "yesNo", then we get its trailing ccc from its

  1370                 // mapping and check for canonical order.

  1371                 // All other cases are ok.

  1372                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc

  1373             ) {

  1374                 // Fails FCD test.

                       // Falls through to the "no" result after this if-statement.
  1375             } else if(prevCC<=cc || cc==0) {

                       // The combining classes are in order (or c is a starter): c passes.
  1376                 prevCC=cc;

  1377                 if(norm16<MIN_YES_YES_WITH_CC) {

                           // c is a "maybe" character rather than a "yes" with ccc!=0.
  1378                     if(pQCResult!=NULL) {

  1379                         *pQCResult=UNORM_MAYBE;

  1380                     } else {

                               // spanQuickCheckYes: the "yes" span ends at prevBoundary.
  1381                         return prevBoundary;

  1382                     }

  1383                 }

  1384                 continue;

  1385             }

  1386         }

               // c is a "no" character, or the combining classes are out of order.
  1387         if(pQCResult!=NULL) {

  1388             *pQCResult=UNORM_NO;

  1389         }

  1390         return prevBoundary;

  1391     }

  1392 }

  1394 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,

  1395                                        UBool doCompose,

  1396                                        UBool onlyContiguous,

  1397                                        UnicodeString &safeMiddle,

  1398                                        ReorderingBuffer &buffer,

  1399                                        UErrorCode &errorCode) const {

  1400     if(!buffer.isEmpty()) {

  1401         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);

  1402         if(src!=firstStarterInSrc) {

  1403             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),

  1404                                                                     buffer.getLimit());

  1405             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);

  1406             UnicodeString middle(lastStarterInDest, destSuffixLength);

  1407             buffer.removeSuffix(destSuffixLength);

  1408             safeMiddle=middle;

  1409             middle.append(src, (int32_t)(firstStarterInSrc-src));

  1410             const UChar *middleStart=middle.getBuffer();

  1411             compose(middleStart, middleStart+middle.length(), onlyContiguous,

  1412                     TRUE, buffer, errorCode);

  1413             if(U_FAILURE(errorCode)) {

  1414                 return;

  1415             }

  1416             src=firstStarterInSrc;

  1417         }

  1418     }

  1419     if(doCompose) {

  1420         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);

  1421     } else {

  1422         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL

  1423             limit=u_strchr(src, 0);

  1424         }

  1425         buffer.appendZeroCC(src, limit, errorCode);

  1426     }

  1427 }

  1429 /**

  1430  * Does c have a composition boundary before it?

  1431  * True if its decomposition begins with a character that has

  1432  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).

  1433  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes

  1434  * (isCompYesAndZeroCC()) so we need not decompose.

  1435  */

  1436 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {

  1437     for(;;) {

  1438         if(isCompYesAndZeroCC(norm16)) {

  1439             return TRUE;

  1440         } else if(isMaybeOrNonZeroCC(norm16)) {

  1441             return FALSE;

  1442         } else if(isDecompNoAlgorithmic(norm16)) {

  1443             c=mapAlgorithmic(c, norm16);

  1444             norm16=getNorm16(c);

  1445         } else {

  1446             // c decomposes, get everything from the variable-length extra data

  1447             const uint16_t *mapping=getMapping(norm16);

  1448             uint16_t firstUnit=*mapping;

  1449             if((firstUnit&MAPPING_LENGTH_MASK)==0) {

  1450                 return FALSE;

  1451             }

  1452             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {

  1453                 return FALSE;  // non-zero leadCC

  1454             }

  1455             int32_t i=1;  // skip over the firstUnit

  1456             UChar32 c;

  1457             U16_NEXT_UNSAFE(mapping, i, c);

  1458             return isCompYesAndZeroCC(getNorm16(c));

  1459         }

  1460     }

  1461 }

  1463 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {

  1464     for(;;) {

  1465         uint16_t norm16=getNorm16(c);

  1466         if(isInert(norm16)) {

  1467             return TRUE;

  1468         } else if(norm16<=minYesNo) {

  1469             // Hangul: norm16==minYesNo

  1470             // Hangul LVT has a boundary after it.

  1471             // Hangul LV and non-inert yesYes characters combine forward.

  1472             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);

  1473         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {

  1474             return FALSE;

  1475         } else if(isDecompNoAlgorithmic(norm16)) {

  1476             c=mapAlgorithmic(c, norm16);

  1477         } else {

  1478             // c decomposes, get everything from the variable-length extra data.

  1479             // If testInert, then c must be a yesNo character which has lccc=0,

  1480             // otherwise it could be a noNo.

  1481             const uint16_t *mapping=getMapping(norm16);

  1482             uint16_t firstUnit=*mapping;

  1483             // TRUE if

  1484             //   not MAPPING_NO_COMP_BOUNDARY_AFTER

  1485             //     (which is set if

  1486             //       c is not deleted, and

  1487             //       it and its decomposition do not combine forward, and it has a starter)

  1488             //   and if FCC then trailCC<=1

  1489             return

  1490                 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&

  1491                 (!onlyContiguous || firstUnit<=0x1ff);

  1492         }

  1493     }

  1494 }

  1496 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {

  1497     BackwardUTrie2StringIterator iter(normTrie, start, p);

  1498     uint16_t norm16;

  1499     do {

  1500         norm16=iter.previous16();

  1501     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));

  1502     // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,

  1503     // but that's probably not worth the extra cost.

  1504     return iter.codePointStart;

  1505 }

  1507 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {

  1508     ForwardUTrie2StringIterator iter(normTrie, p, limit);

  1509     uint16_t norm16;

  1510     do {

  1511         norm16=iter.next16();

  1512     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));

  1513     return iter.codePointStart;

  1514 }

  1516 // Note: normalizer2impl.cpp r30982 (2011-nov-27)

  1517 // still had getFCDTrie() which built and cached an FCD trie.

  1518 // That provided faster access to FCD data than getFCD16FromNormData()

  1519 // but required synchronization and consumed some 10kB of heap memory

  1520 // in any process that uses FCD (e.g., via collation).

  1521 // tccc180[] and smallFCD[] are intended to help with any loss of performance,

  1522 // at least for Latin & CJK.

  1524 // Gets the FCD value from the regular normalization data.

  1525 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {

  1526     // Only loops for 1:1 algorithmic mappings.

  1527     for(;;) {

  1528         uint16_t norm16=getNorm16(c);

  1529         if(norm16<=minYesNo) {

  1530             // no decomposition or Hangul syllable, all zeros

  1531             return 0;

  1532         } else if(norm16>=MIN_NORMAL_MAYBE_YES) {

  1533             // combining mark

  1534             norm16&=0xff;

  1535             return norm16|(norm16<<8);

  1536         } else if(norm16>=minMaybeYes) {

  1537             return 0;

  1538         } else if(isDecompNoAlgorithmic(norm16)) {

  1539             c=mapAlgorithmic(c, norm16);

  1540         } else {

  1541             // c decomposes, get everything from the variable-length extra data

  1542             const uint16_t *mapping=getMapping(norm16);

  1543             uint16_t firstUnit=*mapping;

  1544             if((firstUnit&MAPPING_LENGTH_MASK)==0) {

  1545                 // A character that is deleted (maps to an empty string) must

  1546                 // get the worst-case lccc and tccc values because arbitrary

  1547                 // characters on both sides will become adjacent.

  1548                 return 0x1ff;

  1549             } else {

  1550                 norm16=firstUnit>>8;  // tccc

  1551                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {

  1552                     norm16|=*(mapping-1)&0xff00;  // lccc

  1553                 }

  1554                 return norm16;

  1555             }

  1556         }

  1557     }

  1558 }

  1560 // Dual functionality:

  1561 // buffer!=NULL: normalize

  1562 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes

  1563 const UChar *

  1564 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,

  1565                          ReorderingBuffer *buffer,

  1566                          UErrorCode &errorCode) const {

  1567     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.

  1568     // Similar to the prevBoundary in the compose() implementation.

  1569     const UChar *prevBoundary=src;

  1570     int32_t prevFCD16=0;

  1571     if(limit==NULL) {

  1572         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);

  1573         if(U_FAILURE(errorCode)) {

  1574             return src;

  1575         }

  1576         if(prevBoundary<src) {

  1577             prevBoundary=src;

  1578             // We know that the previous character's lccc==0.

  1579             // Fetching the fcd16 value was deferred for this below-U+0300 code point.

  1580             prevFCD16=getFCD16(*(src-1));

  1581             if(prevFCD16>1) {

  1582                 --prevBoundary;

  1583             }

  1584         }

  1585         limit=u_strchr(src, 0);

  1586     }

  1588     // Note: In this function we use buffer->appendZeroCC() because we track

  1589     // the lead and trail combining classes here, rather than leaving it to

  1590     // the ReorderingBuffer.

  1591     // The exception is the call to decomposeShort() which uses the buffer

  1592     // in the normal way.

  1594     const UChar *prevSrc;

  1595     UChar32 c=0;

  1596     uint16_t fcd16=0;

  1598     for(;;) {

  1599         // count code units with lccc==0

  1600         for(prevSrc=src; src!=limit;) {

  1601             if((c=*src)<MIN_CCC_LCCC_CP) {

  1602                 prevFCD16=~c;

  1603                 ++src;

  1604             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {

  1605                 prevFCD16=0;

  1606                 ++src;

  1607             } else {

  1608                 if(U16_IS_SURROGATE(c)) {

  1609                     UChar c2;

  1610                     if(U16_IS_SURROGATE_LEAD(c)) {

  1611                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {

  1612                             c=U16_GET_SUPPLEMENTARY(c, c2);

  1613                         }

  1614                     } else /* trail surrogate */ {

  1615                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {

  1616                             --src;

  1617                             c=U16_GET_SUPPLEMENTARY(c2, c);

  1618                         }

  1619                     }

  1620                 }

  1621                 if((fcd16=getFCD16FromNormData(c))<=0xff) {

  1622                     prevFCD16=fcd16;

  1623                     src+=U16_LENGTH(c);

  1624                 } else {

  1625                     break;

  1626                 }

  1627             }

  1628         }

  1629         // copy these code units all at once

  1630         if(src!=prevSrc) {

  1631             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {

  1632                 break;

  1633             }

  1634             if(src==limit) {

  1635                 break;

  1636             }

  1637             prevBoundary=src;

  1638             // We know that the previous character's lccc==0.

  1639             if(prevFCD16<0) {

  1640                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.

  1641                 UChar32 prev=~prevFCD16;

  1642                 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);

  1643                 if(prevFCD16>1) {

  1644                     --prevBoundary;

  1645                 }

  1646             } else {

  1647                 const UChar *p=src-1;

  1648                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {

  1649                     --p;

  1650                     // Need to fetch the previous character's FCD value because

  1651                     // prevFCD16 was just for the trail surrogate code point.

  1652                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));

  1653                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.

  1654                 }

  1655                 if(prevFCD16>1) {

  1656                     prevBoundary=p;

  1657                 }

  1658             }

  1659             // The start of the current character (c).

  1660             prevSrc=src;

  1661         } else if(src==limit) {

  1662             break;

  1663         }

  1665         src+=U16_LENGTH(c);

  1666         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.

  1667         // Check for proper order, and decompose locally if necessary.

  1668         if((prevFCD16&0xff)<=(fcd16>>8)) {

  1669             // proper order: prev tccc <= current lccc

  1670             if((fcd16&0xff)<=1) {

  1671                 prevBoundary=src;

  1672             }

  1673             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {

  1674                 break;

  1675             }

  1676             prevFCD16=fcd16;

  1677             continue;

  1678         } else if(buffer==NULL) {

  1679             return prevBoundary;  // quick check "no"

  1680         } else {

  1681             /*

  1682              * Back out the part of the source that we copied or appended

  1683              * already but is now going to be decomposed.

  1684              * prevSrc is set to after what was copied/appended.

  1685              */

  1686             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));

  1687             /*

  1688              * Find the part of the source that needs to be decomposed,

  1689              * up to the next safe boundary.

  1690              */

  1691             src=findNextFCDBoundary(src, limit);

  1692             /*

  1693              * The source text does not fulfill the conditions for FCD.

  1694              * Decompose and reorder a limited piece of the text.

  1695              */

  1696             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {

  1697                 break;

  1698             }

  1699             prevBoundary=src;

  1700             prevFCD16=0;

  1701         }

  1702     }

  1703     return src;

  1704 }

  1706 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,

  1707                                        UBool doMakeFCD,

  1708                                        UnicodeString &safeMiddle,

  1709                                        ReorderingBuffer &buffer,

  1710                                        UErrorCode &errorCode) const {

  1711     if(!buffer.isEmpty()) {

  1712         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);

  1713         if(src!=firstBoundaryInSrc) {

  1714             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),

  1715                                                                     buffer.getLimit());

  1716             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);

  1717             UnicodeString middle(lastBoundaryInDest, destSuffixLength);

  1718             buffer.removeSuffix(destSuffixLength);

  1719             safeMiddle=middle;

  1720             middle.append(src, (int32_t)(firstBoundaryInSrc-src));

  1721             const UChar *middleStart=middle.getBuffer();

  1722             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);

  1723             if(U_FAILURE(errorCode)) {

  1724                 return;

  1725             }

  1726             src=firstBoundaryInSrc;

  1727         }

  1728     }

  1729     if(doMakeFCD) {

  1730         makeFCD(src, limit, &buffer, errorCode);

  1731     } else {

  1732         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL

  1733             limit=u_strchr(src, 0);

  1734         }

  1735         buffer.appendZeroCC(src, limit, errorCode);

  1736     }

  1737 }

  1739 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {

  1740     while(start<p && previousFCD16(start, p)>0xff) {}

  1741     return p;

  1742 }

  1744 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {

  1745     while(p<limit) {

  1746         const UChar *codePointStart=p;

  1747         if(nextFCD16(p, limit)<=0xff) {

  1748             return codePointStart;

  1749         }

  1750     }

  1751     return p;

  1752 }

  1754 // CanonicalIterator data -------------------------------------------------- ***

  1756 CanonIterData::CanonIterData(UErrorCode &errorCode) :

  1757         trie(utrie2_open(0, 0, &errorCode)),

  1758         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}

  1760 CanonIterData::~CanonIterData() {

  1761     utrie2_close(trie);

  1762 }

  1764 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {

  1765     uint32_t canonValue=utrie2_get32(trie, decompLead);

  1766     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {

  1767         // origin is the first character whose decomposition starts with

  1768         // the character for which we are setting the value.

  1769         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);

  1770     } else {

  1771         // origin is not the first character, or it is U+0000.

  1772         UnicodeSet *set;

  1773         if((canonValue&CANON_HAS_SET)==0) {

  1774             set=new UnicodeSet;

  1775             if(set==NULL) {

  1776                 errorCode=U_MEMORY_ALLOCATION_ERROR;

  1777                 return;

  1778             }

  1779             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);

  1780             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();

  1781             utrie2_set32(trie, decompLead, canonValue, &errorCode);

  1782             canonStartSets.addElement(set, errorCode);

  1783             if(firstOrigin!=0) {

  1784                 set->add(firstOrigin);

  1785             }

  1786         } else {

  1787             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];

  1788         }

  1789         set->add(origin);

  1790     }

  1791 }

  1793 U_CDECL_BEGIN

  1795 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.

  1796 //     context: the Normalizer2Impl

  1797 static UBool U_CALLCONV

  1798 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {

  1799     UErrorCode errorCode = U_ZERO_ERROR;

  1800     if (value != 0) {

  1801         Normalizer2Impl *impl = (Normalizer2Impl *)context;

  1802         impl->makeCanonIterDataFromNorm16(

  1803             start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);

  1804     }

  1805     return U_SUCCESS(errorCode);

  1806 }

  1810 // UInitOnce instantiation function for CanonIterData

  1812 static void U_CALLCONV

  1813 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {

  1814     U_ASSERT(impl->fCanonIterData == NULL);

  1815     impl->fCanonIterData = new CanonIterData(errorCode);

  1816     if (impl->fCanonIterData == NULL) {

  1817         errorCode=U_MEMORY_ALLOCATION_ERROR;

  1818     }

  1819     if (U_SUCCESS(errorCode)) {

  1820         utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);

  1821         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);

  1822     }

  1823     if (U_FAILURE(errorCode)) {

  1824         delete impl->fCanonIterData;

  1825         impl->fCanonIterData = NULL;

  1826     }

  1827 }

  1829 U_CDECL_END

  1831 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,

  1832                                                   CanonIterData &newData,

  1833                                                   UErrorCode &errorCode) const {

  1834     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {

  1835         // Inert, or 2-way mapping (including Hangul syllable).

  1836         // We do not write a canonStartSet for any yesNo character.

  1837         // Composites from 2-way mappings are added at runtime from the

  1838         // starter's compositions list, and the other characters in

  1839         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are

  1840         // "maybe" characters.

  1841         return;

  1842     }

  1843     for(UChar32 c=start; c<=end; ++c) {

  1844         uint32_t oldValue=utrie2_get32(newData.trie, c);

  1845         uint32_t newValue=oldValue;

  1846         if(norm16>=minMaybeYes) {

  1847             // not a segment starter if it occurs in a decomposition or has cc!=0

  1848             newValue|=CANON_NOT_SEGMENT_STARTER;

  1849             if(norm16<MIN_NORMAL_MAYBE_YES) {

  1850                 newValue|=CANON_HAS_COMPOSITIONS;

  1851             }

  1852         } else if(norm16<minYesNo) {

  1853             newValue|=CANON_HAS_COMPOSITIONS;

  1854         } else {

  1855             // c has a one-way decomposition

  1856             UChar32 c2=c;

  1857             uint16_t norm16_2=norm16;

  1858             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {

  1859                 c2=mapAlgorithmic(c2, norm16_2);

  1860                 norm16_2=getNorm16(c2);

  1861             }

  1862             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {

  1863                 // c decomposes, get everything from the variable-length extra data

  1864                 const uint16_t *mapping=getMapping(norm16_2);

  1865                 uint16_t firstUnit=*mapping;

  1866                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;

  1867                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {

  1868                     if(c==c2 && (*(mapping-1)&0xff)!=0) {

  1869                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0

  1870                     }

  1871                 }

  1872                 // Skip empty mappings (no characters in the decomposition).

  1873                 if(length!=0) {

  1874                     ++mapping;  // skip over the firstUnit

  1875                     // add c to first code point's start set

  1876                     int32_t i=0;

  1877                     U16_NEXT_UNSAFE(mapping, i, c2);

  1878                     newData.addToStartSet(c, c2, errorCode);

  1879                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a

  1880                     // one-way mapping. A 2-way mapping is possible here after

  1881                     // intermediate algorithmic mapping.

  1882                     if(norm16_2>=minNoNo) {

  1883                         while(i<length) {

  1884                             U16_NEXT_UNSAFE(mapping, i, c2);

  1885                             uint32_t c2Value=utrie2_get32(newData.trie, c2);

  1886                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {

  1887                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,

  1888                                              &errorCode);

  1889                             }

  1890                         }

  1891                     }

  1892                 }

  1893             } else {

  1894                 // c decomposed to c2 algorithmically; c has cc==0

  1895                 newData.addToStartSet(c, c2, errorCode);

  1896             }

  1897         }

  1898         if(newValue!=oldValue) {

  1899             utrie2_set32(newData.trie, c, newValue, &errorCode);

  1900         }

  1901     }

  1902 }

  1904 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {

  1905     // Logically const: Synchronized instantiation.

  1906     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);

  1907     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);

  1908     return U_SUCCESS(errorCode);

  1909 }

  1911 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {

  1912     return (int32_t)utrie2_get32(fCanonIterData->trie, c);

  1913 }

  1915 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {

  1916     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];

  1917 }

  1919 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {

  1920     return getCanonValue(c)>=0;

  1921 }

  1923 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {

  1924     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;

  1925     if(canonValue==0) {

  1926         return FALSE;

  1927     }

  1928     set.clear();

  1929     int32_t value=canonValue&CANON_VALUE_MASK;

  1930     if((canonValue&CANON_HAS_SET)!=0) {

  1931         set.addAll(getCanonStartSet(value));

  1932     } else if(value!=0) {

  1933         set.add(value);

  1934     }

  1935     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {

  1936         uint16_t norm16=getNorm16(c);

  1937         if(norm16==JAMO_L) {

  1938             UChar32 syllable=

  1939                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);

  1940             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);

  1941         } else {

  1942             addComposites(getCompositionsList(norm16), set);

  1943         }

  1944     }

  1945     return TRUE;

  1946 }

  1948 U_NAMESPACE_END

  1950 // Normalizer2 data swapping ----------------------------------------------- ***

  1952 U_NAMESPACE_USE

  1954 U_CAPI int32_t U_EXPORT2

  1955 unorm2_swap(const UDataSwapper *ds,

  1956             const void *inData, int32_t length, void *outData,

  1957             UErrorCode *pErrorCode) {

  1958     const UDataInfo *pInfo;

  1959     int32_t headerSize;

  1961     const uint8_t *inBytes;

  1962     uint8_t *outBytes;

  1964     const int32_t *inIndexes;

  1965     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];

  1967     int32_t i, offset, nextOffset, size;

  1969     /* udata_swapDataHeader checks the arguments */

  1970     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);

  1971     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {

  1972         return 0;

  1973     }

  1975     /* check data format and format version */

  1976     pInfo=(const UDataInfo *)((const char *)inData+4);

  1977     if(!(

  1978         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */

  1979         pInfo->dataFormat[1]==0x72 &&

  1980         pInfo->dataFormat[2]==0x6d &&

  1981         pInfo->dataFormat[3]==0x32 &&

  1982         (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)

  1983     )) {

  1984         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",

  1985                          pInfo->dataFormat[0], pInfo->dataFormat[1],

  1986                          pInfo->dataFormat[2], pInfo->dataFormat[3],

  1987                          pInfo->formatVersion[0]);

  1988         *pErrorCode=U_UNSUPPORTED_ERROR;

  1989         return 0;

  1990     }

  1992     inBytes=(const uint8_t *)inData+headerSize;

  1993     outBytes=(uint8_t *)outData+headerSize;

  1995     inIndexes=(const int32_t *)inBytes;

  1997     if(length>=0) {

  1998         length-=headerSize;

  1999         if(length<(int32_t)sizeof(indexes)) {

  2000             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",

  2001                              length);

  2002             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

  2003             return 0;

  2004         }

  2005     }

  2007     /* read the first few indexes */

  2008     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {

  2009         indexes[i]=udata_readInt32(ds, inIndexes[i]);

  2010     }

  2012     /* get the total length of the data */

  2013     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];

  2015     if(length>=0) {

  2016         if(length<size) {

  2017             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",

  2018                              length);

  2019             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

  2020             return 0;

  2021         }

  2023         /* copy the data for inaccessible bytes */

  2024         if(inBytes!=outBytes) {

  2025             uprv_memcpy(outBytes, inBytes, size);

  2026         }

  2028         offset=0;

  2030         /* swap the int32_t indexes[] */

  2031         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];

  2032         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);

  2033         offset=nextOffset;

  2035         /* swap the UTrie2 */

  2036         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];

  2037         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);

  2038         offset=nextOffset;

  2040         /* swap the uint16_t extraData[] */

  2041         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];

  2042         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);

  2043         offset=nextOffset;

  2045         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */

  2046         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];

  2047         offset=nextOffset;

  2049         U_ASSERT(offset==size);

  2050     }

  2052     return headerSize+size;

  2053 }

  2055 #endif  // !UCONFIG_NO_NORMALIZATION

The Tor Browser / file revision

intl/icu/source/common/normalizer2impl.cpp@fc2d59ddac77

intl/icu/source/common/normalizer2impl.cpp