intl/icu/source/common/normalizer2impl.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/normalizer2impl.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2055 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2009-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  normalizer2impl.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2009nov22
    1.17 +*   created by: Markus W. Scherer
    1.18 +*/
    1.19 +
    1.20 +#include "unicode/utypes.h"
    1.21 +
    1.22 +#if !UCONFIG_NO_NORMALIZATION
    1.23 +
    1.24 +#include "unicode/normalizer2.h"
    1.25 +#include "unicode/udata.h"
    1.26 +#include "unicode/ustring.h"
    1.27 +#include "unicode/utf16.h"
    1.28 +#include "cmemory.h"
    1.29 +#include "mutex.h"
    1.30 +#include "normalizer2impl.h"
    1.31 +#include "putilimp.h"
    1.32 +#include "uassert.h"
    1.33 +#include "uset_imp.h"
    1.34 +#include "utrie2.h"
    1.35 +#include "uvector.h"
    1.36 +
    1.37 +U_NAMESPACE_BEGIN
    1.38 +
    1.39 +// ReorderingBuffer -------------------------------------------------------- ***
    1.40 +
    1.41 +UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
    1.42 +    int32_t length=str.length();
    1.43 +    start=str.getBuffer(destCapacity);
    1.44 +    if(start==NULL) {
    1.45 +        // getBuffer() already did str.setToBogus()
    1.46 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
    1.47 +        return FALSE;
    1.48 +    }
    1.49 +    limit=start+length;
    1.50 +    remainingCapacity=str.getCapacity()-length;
    1.51 +    reorderStart=start;
    1.52 +    if(start==limit) {
    1.53 +        lastCC=0;
    1.54 +    } else {
    1.55 +        setIterator();
    1.56 +        lastCC=previousCC();
    1.57 +        // Set reorderStart after the last code point with cc<=1 if there is one.
    1.58 +        if(lastCC>1) {
    1.59 +            while(previousCC()>1) {}
    1.60 +        }
    1.61 +        reorderStart=codePointLimit;
    1.62 +    }
    1.63 +    return TRUE;
    1.64 +}
    1.65 +
    1.66 +UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
    1.67 +    int32_t length=(int32_t)(limit-start);
    1.68 +    return
    1.69 +        length==(int32_t)(otherLimit-otherStart) &&
    1.70 +        0==u_memcmp(start, otherStart, length);
    1.71 +}
    1.72 +
    1.73 +UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
    1.74 +    if(remainingCapacity<2 && !resize(2, errorCode)) {
    1.75 +        return FALSE;
    1.76 +    }
    1.77 +    if(lastCC<=cc || cc==0) {
    1.78 +        limit[0]=U16_LEAD(c);
    1.79 +        limit[1]=U16_TRAIL(c);
    1.80 +        limit+=2;
    1.81 +        lastCC=cc;
    1.82 +        if(cc<=1) {
    1.83 +            reorderStart=limit;
    1.84 +        }
    1.85 +    } else {
    1.86 +        insert(c, cc);
    1.87 +    }
    1.88 +    remainingCapacity-=2;
    1.89 +    return TRUE;
    1.90 +}
    1.91 +
    1.92 +UBool ReorderingBuffer::append(const UChar *s, int32_t length,
    1.93 +                               uint8_t leadCC, uint8_t trailCC,
    1.94 +                               UErrorCode &errorCode) {
    1.95 +    if(length==0) {
    1.96 +        return TRUE;
    1.97 +    }
    1.98 +    if(remainingCapacity<length && !resize(length, errorCode)) {
    1.99 +        return FALSE;
   1.100 +    }
   1.101 +    remainingCapacity-=length;
   1.102 +    if(lastCC<=leadCC || leadCC==0) {
   1.103 +        if(trailCC<=1) {
   1.104 +            reorderStart=limit+length;
   1.105 +        } else if(leadCC<=1) {
   1.106 +            reorderStart=limit+1;  // Ok if not a code point boundary.
   1.107 +        }
   1.108 +        const UChar *sLimit=s+length;
   1.109 +        do { *limit++=*s++; } while(s!=sLimit);
   1.110 +        lastCC=trailCC;
   1.111 +    } else {
   1.112 +        int32_t i=0;
   1.113 +        UChar32 c;
   1.114 +        U16_NEXT(s, i, length, c);
   1.115 +        insert(c, leadCC);  // insert first code point
   1.116 +        while(i<length) {
   1.117 +            U16_NEXT(s, i, length, c);
   1.118 +            if(i<length) {
   1.119 +                // s must be in NFD, otherwise we need to use getCC().
   1.120 +                leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
   1.121 +            } else {
   1.122 +                leadCC=trailCC;
   1.123 +            }
   1.124 +            append(c, leadCC, errorCode);
   1.125 +        }
   1.126 +    }
   1.127 +    return TRUE;
   1.128 +}
   1.129 +
   1.130 +UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
   1.131 +    int32_t cpLength=U16_LENGTH(c);
   1.132 +    if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
   1.133 +        return FALSE;
   1.134 +    }
   1.135 +    remainingCapacity-=cpLength;
   1.136 +    if(cpLength==1) {
   1.137 +        *limit++=(UChar)c;
   1.138 +    } else {
   1.139 +        limit[0]=U16_LEAD(c);
   1.140 +        limit[1]=U16_TRAIL(c);
   1.141 +        limit+=2;
   1.142 +    }
   1.143 +    lastCC=0;
   1.144 +    reorderStart=limit;
   1.145 +    return TRUE;
   1.146 +}
   1.147 +
   1.148 +UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
   1.149 +    if(s==sLimit) {
   1.150 +        return TRUE;
   1.151 +    }
   1.152 +    int32_t length=(int32_t)(sLimit-s);
   1.153 +    if(remainingCapacity<length && !resize(length, errorCode)) {
   1.154 +        return FALSE;
   1.155 +    }
   1.156 +    u_memcpy(limit, s, length);
   1.157 +    limit+=length;
   1.158 +    remainingCapacity-=length;
   1.159 +    lastCC=0;
   1.160 +    reorderStart=limit;
   1.161 +    return TRUE;
   1.162 +}
   1.163 +
   1.164 +void ReorderingBuffer::remove() {
   1.165 +    reorderStart=limit=start;
   1.166 +    remainingCapacity=str.getCapacity();
   1.167 +    lastCC=0;
   1.168 +}
   1.169 +
   1.170 +void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
   1.171 +    if(suffixLength<(limit-start)) {
   1.172 +        limit-=suffixLength;
   1.173 +        remainingCapacity+=suffixLength;
   1.174 +    } else {
   1.175 +        limit=start;
   1.176 +        remainingCapacity=str.getCapacity();
   1.177 +    }
   1.178 +    lastCC=0;
   1.179 +    reorderStart=limit;
   1.180 +}
   1.181 +
   1.182 +UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
   1.183 +    int32_t reorderStartIndex=(int32_t)(reorderStart-start);
   1.184 +    int32_t length=(int32_t)(limit-start);
   1.185 +    str.releaseBuffer(length);
   1.186 +    int32_t newCapacity=length+appendLength;
   1.187 +    int32_t doubleCapacity=2*str.getCapacity();
   1.188 +    if(newCapacity<doubleCapacity) {
   1.189 +        newCapacity=doubleCapacity;
   1.190 +    }
   1.191 +    if(newCapacity<256) {
   1.192 +        newCapacity=256;
   1.193 +    }
   1.194 +    start=str.getBuffer(newCapacity);
   1.195 +    if(start==NULL) {
   1.196 +        // getBuffer() already did str.setToBogus()
   1.197 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.198 +        return FALSE;
   1.199 +    }
   1.200 +    reorderStart=start+reorderStartIndex;
   1.201 +    limit=start+length;
   1.202 +    remainingCapacity=str.getCapacity()-length;
   1.203 +    return TRUE;
   1.204 +}
   1.205 +
   1.206 +void ReorderingBuffer::skipPrevious() {
   1.207 +    codePointLimit=codePointStart;
   1.208 +    UChar c=*--codePointStart;
   1.209 +    if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
   1.210 +        --codePointStart;
   1.211 +    }
   1.212 +}
   1.213 +
   1.214 +uint8_t ReorderingBuffer::previousCC() {
   1.215 +    codePointLimit=codePointStart;
   1.216 +    if(reorderStart>=codePointStart) {
   1.217 +        return 0;
   1.218 +    }
   1.219 +    UChar32 c=*--codePointStart;
   1.220 +    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
   1.221 +        return 0;
   1.222 +    }
   1.223 +
   1.224 +    UChar c2;
   1.225 +    if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
   1.226 +        --codePointStart;
   1.227 +        c=U16_GET_SUPPLEMENTARY(c2, c);
   1.228 +    }
   1.229 +    return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
   1.230 +}
   1.231 +
   1.232 +// Inserts c somewhere before the last character.
   1.233 +// Requires 0<cc<lastCC which implies reorderStart<limit.
   1.234 +void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
   1.235 +    for(setIterator(), skipPrevious(); previousCC()>cc;) {}
   1.236 +    // insert c at codePointLimit, after the character with prevCC<=cc
   1.237 +    UChar *q=limit;
   1.238 +    UChar *r=limit+=U16_LENGTH(c);
   1.239 +    do {
   1.240 +        *--r=*--q;
   1.241 +    } while(codePointLimit!=q);
   1.242 +    writeCodePoint(q, c);
   1.243 +    if(cc<=1) {
   1.244 +        reorderStart=r;
   1.245 +    }
   1.246 +}
   1.247 +
   1.248 +// Normalizer2Impl --------------------------------------------------------- ***
   1.249 +
// Auxiliary data for canonical-iterator support. Normalizer2Impl holds it via
// fCanonIterData (deleted in ~Normalizer2Impl) and reads its trie in
// addCanonIterPropertyStarts() after ensureCanonIterData() succeeds.
// The constructor, destructor and addToStartSet() are defined elsewhere.
struct CanonIterData : public UMemory {
    CanonIterData(UErrorCode &errorCode);
    ~CanonIterData();
    // NOTE(review): presumably records `origin` in the start set associated
    // with `decompLead`; confirm against the definition.
    void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    // Per-code-point values; the CANON_NOT_SEGMENT_STARTER bit of each value
    // is what addCanonIterPropertyStarts() enumerates.
    UTrie2 *trie;
    UVector canonStartSets;  // contains UnicodeSet *
};
   1.257 +
   1.258 +Normalizer2Impl::~Normalizer2Impl() {
   1.259 +    udata_close(memory);
   1.260 +    utrie2_close(normTrie);
   1.261 +    delete fCanonIterData;
   1.262 +}
   1.263 +
   1.264 +UBool U_CALLCONV
   1.265 +Normalizer2Impl::isAcceptable(void *context,
   1.266 +                              const char * /* type */, const char * /*name*/,
   1.267 +                              const UDataInfo *pInfo) {
   1.268 +    if(
   1.269 +        pInfo->size>=20 &&
   1.270 +        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
   1.271 +        pInfo->charsetFamily==U_CHARSET_FAMILY &&
   1.272 +        pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
   1.273 +        pInfo->dataFormat[1]==0x72 &&
   1.274 +        pInfo->dataFormat[2]==0x6d &&
   1.275 +        pInfo->dataFormat[3]==0x32 &&
   1.276 +        pInfo->formatVersion[0]==2
   1.277 +    ) {
   1.278 +        Normalizer2Impl *me=(Normalizer2Impl *)context;
   1.279 +        uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
   1.280 +        return TRUE;
   1.281 +    } else {
   1.282 +        return FALSE;
   1.283 +    }
   1.284 +}
   1.285 +
   1.286 +void
   1.287 +Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
   1.288 +    if(U_FAILURE(errorCode)) {
   1.289 +        return;
   1.290 +    }
   1.291 +    memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
   1.292 +    if(U_FAILURE(errorCode)) {
   1.293 +        return;
   1.294 +    }
   1.295 +    const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
   1.296 +    const int32_t *inIndexes=(const int32_t *)inBytes;
   1.297 +    int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
   1.298 +    if(indexesLength<=IX_MIN_MAYBE_YES) {
   1.299 +        errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
   1.300 +        return;
   1.301 +    }
   1.302 +
   1.303 +    minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
   1.304 +    minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
   1.305 +
   1.306 +    minYesNo=inIndexes[IX_MIN_YES_NO];
   1.307 +    minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
   1.308 +    minNoNo=inIndexes[IX_MIN_NO_NO];
   1.309 +    limitNoNo=inIndexes[IX_LIMIT_NO_NO];
   1.310 +    minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
   1.311 +
   1.312 +    int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
   1.313 +    int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
   1.314 +    normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
   1.315 +                                       inBytes+offset, nextOffset-offset, NULL,
   1.316 +                                       &errorCode);
   1.317 +    if(U_FAILURE(errorCode)) {
   1.318 +        return;
   1.319 +    }
   1.320 +
   1.321 +    offset=nextOffset;
   1.322 +    nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
   1.323 +    maybeYesCompositions=(const uint16_t *)(inBytes+offset);
   1.324 +    extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
   1.325 +
   1.326 +    // smallFCD: new in formatVersion 2
   1.327 +    offset=nextOffset;
   1.328 +    smallFCD=inBytes+offset;
   1.329 +
   1.330 +    // Build tccc180[].
   1.331 +    // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
   1.332 +    uint8_t bits=0;
   1.333 +    for(UChar c=0; c<0x180; bits>>=1) {
   1.334 +        if((c&0xff)==0) {
   1.335 +            bits=smallFCD[c>>8];  // one byte per 0x100 code points
   1.336 +        }
   1.337 +        if(bits&1) {
   1.338 +            for(int i=0; i<0x20; ++i, ++c) {
   1.339 +                tccc180[c]=(uint8_t)getFCD16FromNormData(c);
   1.340 +            }
   1.341 +        } else {
   1.342 +            uprv_memset(tccc180+c, 0, 0x20);
   1.343 +            c+=0x20;
   1.344 +        }
   1.345 +    }
   1.346 +}
   1.347 +
   1.348 +uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
   1.349 +    UChar32 c;
   1.350 +    if(cpStart==(cpLimit-1)) {
   1.351 +        c=*cpStart;
   1.352 +    } else {
   1.353 +        c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
   1.354 +    }
   1.355 +    uint16_t prevNorm16=getNorm16(c);
   1.356 +    if(prevNorm16<=minYesNo) {
   1.357 +        return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
   1.358 +    } else {
   1.359 +        return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
   1.360 +    }
   1.361 +}
   1.362 +
   1.363 +U_CDECL_BEGIN
   1.364 +
   1.365 +static UBool U_CALLCONV
   1.366 +enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
   1.367 +    /* add the start code point to the USet */
   1.368 +    const USetAdder *sa=(const USetAdder *)context;
   1.369 +    sa->add(sa->set, start);
   1.370 +    return TRUE;
   1.371 +}
   1.372 +
   1.373 +static uint32_t U_CALLCONV
   1.374 +segmentStarterMapper(const void * /*context*/, uint32_t value) {
   1.375 +    return value&CANON_NOT_SEGMENT_STARTER;
   1.376 +}
   1.377 +
   1.378 +U_CDECL_END
   1.379 +
   1.380 +void
   1.381 +Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
   1.382 +    /* add the start code point of each same-value range of each trie */
   1.383 +    utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
   1.384 +
   1.385 +    /* add Hangul LV syllables and LV+1 because of skippables */
   1.386 +    for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
   1.387 +        sa->add(sa->set, c);
   1.388 +        sa->add(sa->set, c+1);
   1.389 +    }
   1.390 +    sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
   1.391 +}
   1.392 +
   1.393 +void
   1.394 +Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
   1.395 +    /* add the start code point of each same-value range of the canonical iterator data trie */
   1.396 +    if(ensureCanonIterData(errorCode)) {
   1.397 +        // currently only used for the SEGMENT_STARTER property
   1.398 +        utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
   1.399 +    }
   1.400 +}
   1.401 +
   1.402 +const UChar *
   1.403 +Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
   1.404 +                                                UChar32 minNeedDataCP,
   1.405 +                                                ReorderingBuffer *buffer,
   1.406 +                                                UErrorCode &errorCode) const {
   1.407 +    // Make some effort to support NUL-terminated strings reasonably.
   1.408 +    // Take the part of the fast quick check loop that does not look up
   1.409 +    // data and check the first part of the string.
   1.410 +    // After this prefix, determine the string length to simplify the rest
   1.411 +    // of the code.
   1.412 +    const UChar *prevSrc=src;
   1.413 +    UChar c;
   1.414 +    while((c=*src++)<minNeedDataCP && c!=0) {}
   1.415 +    // Back out the last character for full processing.
   1.416 +    // Copy this prefix.
   1.417 +    if(--src!=prevSrc) {
   1.418 +        if(buffer!=NULL) {
   1.419 +            buffer->appendZeroCC(prevSrc, src, errorCode);
   1.420 +        }
   1.421 +    }
   1.422 +    return src;
   1.423 +}
   1.424 +
   1.425 +// Dual functionality:
   1.426 +// buffer!=NULL: normalize
   1.427 +// buffer==NULL: isNormalized/spanQuickCheckYes
   1.428 +const UChar *
   1.429 +Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
   1.430 +                           ReorderingBuffer *buffer,
   1.431 +                           UErrorCode &errorCode) const {
   1.432 +    UChar32 minNoCP=minDecompNoCP;
   1.433 +    if(limit==NULL) {
   1.434 +        src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
   1.435 +        if(U_FAILURE(errorCode)) {
   1.436 +            return src;
   1.437 +        }
   1.438 +        limit=u_strchr(src, 0);
   1.439 +    }
   1.440 +
   1.441 +    const UChar *prevSrc;
   1.442 +    UChar32 c=0;
   1.443 +    uint16_t norm16=0;
   1.444 +
   1.445 +    // only for quick check
   1.446 +    const UChar *prevBoundary=src;
   1.447 +    uint8_t prevCC=0;
   1.448 +
   1.449 +    for(;;) {
   1.450 +        // count code units below the minimum or with irrelevant data for the quick check
   1.451 +        for(prevSrc=src; src!=limit;) {
   1.452 +            if( (c=*src)<minNoCP ||
   1.453 +                isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
   1.454 +            ) {
   1.455 +                ++src;
   1.456 +            } else if(!U16_IS_SURROGATE(c)) {
   1.457 +                break;
   1.458 +            } else {
   1.459 +                UChar c2;
   1.460 +                if(U16_IS_SURROGATE_LEAD(c)) {
   1.461 +                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
   1.462 +                        c=U16_GET_SUPPLEMENTARY(c, c2);
   1.463 +                    }
   1.464 +                } else /* trail surrogate */ {
   1.465 +                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
   1.466 +                        --src;
   1.467 +                        c=U16_GET_SUPPLEMENTARY(c2, c);
   1.468 +                    }
   1.469 +                }
   1.470 +                if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
   1.471 +                    src+=U16_LENGTH(c);
   1.472 +                } else {
   1.473 +                    break;
   1.474 +                }
   1.475 +            }
   1.476 +        }
   1.477 +        // copy these code units all at once
   1.478 +        if(src!=prevSrc) {
   1.479 +            if(buffer!=NULL) {
   1.480 +                if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
   1.481 +                    break;
   1.482 +                }
   1.483 +            } else {
   1.484 +                prevCC=0;
   1.485 +                prevBoundary=src;
   1.486 +            }
   1.487 +        }
   1.488 +        if(src==limit) {
   1.489 +            break;
   1.490 +        }
   1.491 +
   1.492 +        // Check one above-minimum, relevant code point.
   1.493 +        src+=U16_LENGTH(c);
   1.494 +        if(buffer!=NULL) {
   1.495 +            if(!decompose(c, norm16, *buffer, errorCode)) {
   1.496 +                break;
   1.497 +            }
   1.498 +        } else {
   1.499 +            if(isDecompYes(norm16)) {
   1.500 +                uint8_t cc=getCCFromYesOrMaybe(norm16);
   1.501 +                if(prevCC<=cc || cc==0) {
   1.502 +                    prevCC=cc;
   1.503 +                    if(cc<=1) {
   1.504 +                        prevBoundary=src;
   1.505 +                    }
   1.506 +                    continue;
   1.507 +                }
   1.508 +            }
   1.509 +            return prevBoundary;  // "no" or cc out of order
   1.510 +        }
   1.511 +    }
   1.512 +    return src;
   1.513 +}
   1.514 +
   1.515 +// Decompose a short piece of text which is likely to contain characters that
   1.516 +// fail the quick check loop and/or where the quick check loop's overhead
   1.517 +// is unlikely to be amortized.
   1.518 +// Called by the compose() and makeFCD() implementations.
   1.519 +UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
   1.520 +                                      ReorderingBuffer &buffer,
   1.521 +                                      UErrorCode &errorCode) const {
   1.522 +    while(src<limit) {
   1.523 +        UChar32 c;
   1.524 +        uint16_t norm16;
   1.525 +        UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
   1.526 +        if(!decompose(c, norm16, buffer, errorCode)) {
   1.527 +            return FALSE;
   1.528 +        }
   1.529 +    }
   1.530 +    return TRUE;
   1.531 +}
   1.532 +
   1.533 +UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
   1.534 +                                 ReorderingBuffer &buffer,
   1.535 +                                 UErrorCode &errorCode) const {
   1.536 +    // Only loops for 1:1 algorithmic mappings.
   1.537 +    for(;;) {
   1.538 +        // get the decomposition and the lead and trail cc's
   1.539 +        if(isDecompYes(norm16)) {
   1.540 +            // c does not decompose
   1.541 +            return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
   1.542 +        } else if(isHangul(norm16)) {
   1.543 +            // Hangul syllable: decompose algorithmically
   1.544 +            UChar jamos[3];
   1.545 +            return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
   1.546 +        } else if(isDecompNoAlgorithmic(norm16)) {
   1.547 +            c=mapAlgorithmic(c, norm16);
   1.548 +            norm16=getNorm16(c);
   1.549 +        } else {
   1.550 +            // c decomposes, get everything from the variable-length extra data
   1.551 +            const uint16_t *mapping=getMapping(norm16);
   1.552 +            uint16_t firstUnit=*mapping;
   1.553 +            int32_t length=firstUnit&MAPPING_LENGTH_MASK;
   1.554 +            uint8_t leadCC, trailCC;
   1.555 +            trailCC=(uint8_t)(firstUnit>>8);
   1.556 +            if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
   1.557 +                leadCC=(uint8_t)(*(mapping-1)>>8);
   1.558 +            } else {
   1.559 +                leadCC=0;
   1.560 +            }
   1.561 +            return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
   1.562 +        }
   1.563 +    }
   1.564 +}
   1.565 +
   1.566 +const UChar *
   1.567 +Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
   1.568 +    const UChar *decomp=NULL;
   1.569 +    uint16_t norm16;
   1.570 +    for(;;) {
   1.571 +        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
   1.572 +            // c does not decompose
   1.573 +            return decomp;
   1.574 +        } else if(isHangul(norm16)) {
   1.575 +            // Hangul syllable: decompose algorithmically
   1.576 +            length=Hangul::decompose(c, buffer);
   1.577 +            return buffer;
   1.578 +        } else if(isDecompNoAlgorithmic(norm16)) {
   1.579 +            c=mapAlgorithmic(c, norm16);
   1.580 +            decomp=buffer;
   1.581 +            length=0;
   1.582 +            U16_APPEND_UNSAFE(buffer, length, c);
   1.583 +        } else {
   1.584 +            // c decomposes, get everything from the variable-length extra data
   1.585 +            const uint16_t *mapping=getMapping(norm16);
   1.586 +            length=*mapping&MAPPING_LENGTH_MASK;
   1.587 +            return (const UChar *)mapping+1;
   1.588 +        }
   1.589 +    }
   1.590 +}
   1.591 +
   1.592 +// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
   1.593 +// so that a raw mapping fits that consists of one unit ("rm0")
   1.594 +// plus all but the first two code units of the normal mapping.
   1.595 +// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
   1.596 +const UChar *
   1.597 +Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
   1.598 +    // We do not loop in this method because an algorithmic mapping itself
   1.599 +    // becomes a final result rather than having to be decomposed recursively.
   1.600 +    uint16_t norm16;
   1.601 +    if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
   1.602 +        // c does not decompose
   1.603 +        return NULL;
   1.604 +    } else if(isHangul(norm16)) {
   1.605 +        // Hangul syllable: decompose algorithmically
   1.606 +        Hangul::getRawDecomposition(c, buffer);
   1.607 +        length=2;
   1.608 +        return buffer;
   1.609 +    } else if(isDecompNoAlgorithmic(norm16)) {
   1.610 +        c=mapAlgorithmic(c, norm16);
   1.611 +        length=0;
   1.612 +        U16_APPEND_UNSAFE(buffer, length, c);
   1.613 +        return buffer;
   1.614 +    } else {
   1.615 +        // c decomposes, get everything from the variable-length extra data
   1.616 +        const uint16_t *mapping=getMapping(norm16);
   1.617 +        uint16_t firstUnit=*mapping;
   1.618 +        int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
   1.619 +        if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
   1.620 +            // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
   1.621 +            // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
   1.622 +            const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
   1.623 +            uint16_t rm0=*rawMapping;
   1.624 +            if(rm0<=MAPPING_LENGTH_MASK) {
   1.625 +                length=rm0;
   1.626 +                return (const UChar *)rawMapping-rm0;
   1.627 +            } else {
   1.628 +                // Copy the normal mapping and replace its first two code units with rm0.
   1.629 +                buffer[0]=(UChar)rm0;
   1.630 +                u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
   1.631 +                length=mLength-1;
   1.632 +                return buffer;
   1.633 +            }
   1.634 +        } else {
   1.635 +            length=mLength;
   1.636 +            return (const UChar *)mapping+1;
   1.637 +        }
   1.638 +    }
   1.639 +}
   1.640 +
   1.641 +void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
   1.642 +                                         UBool doDecompose,
   1.643 +                                         UnicodeString &safeMiddle,
   1.644 +                                         ReorderingBuffer &buffer,
   1.645 +                                         UErrorCode &errorCode) const {
   1.646 +    buffer.copyReorderableSuffixTo(safeMiddle);
   1.647 +    if(doDecompose) {
   1.648 +        decompose(src, limit, &buffer, errorCode);
   1.649 +        return;
   1.650 +    }
   1.651 +    // Just merge the strings at the boundary.
   1.652 +    ForwardUTrie2StringIterator iter(normTrie, src, limit);
   1.653 +    uint8_t firstCC, prevCC, cc;
   1.654 +    firstCC=prevCC=cc=getCC(iter.next16());
   1.655 +    while(cc!=0) {
   1.656 +        prevCC=cc;
   1.657 +        cc=getCC(iter.next16());
   1.658 +    };
   1.659 +    if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
   1.660 +        limit=u_strchr(iter.codePointStart, 0);
   1.661 +    }
   1.662 +
   1.663 +    if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
   1.664 +        buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
   1.665 +    }
   1.666 +}
   1.667 +
   1.668 +// Note: hasDecompBoundary() could be implemented as aliases to
   1.669 +// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
   1.670 +// at the cost of building the FCD trie for a decomposition normalizer.
   1.671 +UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
   1.672 +    for(;;) {
   1.673 +        if(c<minDecompNoCP) {
   1.674 +            return TRUE;
   1.675 +        }
   1.676 +        uint16_t norm16=getNorm16(c);
   1.677 +        if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
   1.678 +            return TRUE;
   1.679 +        } else if(norm16>MIN_NORMAL_MAYBE_YES) {
   1.680 +            return FALSE;  // ccc!=0
   1.681 +        } else if(isDecompNoAlgorithmic(norm16)) {
   1.682 +            c=mapAlgorithmic(c, norm16);
   1.683 +        } else {
   1.684 +            // c decomposes, get everything from the variable-length extra data
   1.685 +            const uint16_t *mapping=getMapping(norm16);
   1.686 +            uint16_t firstUnit=*mapping;
   1.687 +            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
   1.688 +                return FALSE;
   1.689 +            }
   1.690 +            if(!before) {
   1.691 +                // decomp after-boundary: same as hasFCDBoundaryAfter(),
   1.692 +                // fcd16<=1 || trailCC==0
   1.693 +                if(firstUnit>0x1ff) {
   1.694 +                    return FALSE;  // trailCC>1
   1.695 +                }
   1.696 +                if(firstUnit<=0xff) {
   1.697 +                    return TRUE;  // trailCC==0
   1.698 +                }
   1.699 +                // if(trailCC==1) test leadCC==0, same as checking for before-boundary
   1.700 +            }
   1.701 +            // TRUE if leadCC==0 (hasFCDBoundaryBefore())
   1.702 +            return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
   1.703 +        }
   1.704 +    }
   1.705 +}
   1.706 +
   1.707 +/*
   1.708 + * Finds the recomposition result for
   1.709 + * a forward-combining "lead" character,
   1.710 + * specified with a pointer to its compositions list,
   1.711 + * and a backward-combining "trail" character.
   1.712 + *
   1.713 + * If the lead and trail characters combine, then this function returns
   1.714 + * the following "compositeAndFwd" value:
   1.715 + * Bits 21..1  composite character (callers extract it with compositeAndFwd>>1)
   1.716 + * Bit      0  set if the composite is a forward-combining starter
   1.717 + * Otherwise (the list has no entry for the trail character) it returns -1.
   1.718 + *
   1.719 + * The compositions list has (trail, compositeAndFwd) pair entries,
   1.720 + * encoded as either pairs or triples of 16-bit units.
   1.721 + * The last entry has the high bit of its first unit set.
   1.722 + *
   1.723 + * The list is sorted by ascending trail characters (there are no duplicates).
   1.724 + * A linear search is used; it stops at the first entry whose key is not smaller.
   1.725 + *
   1.726 + * See normalizer2impl.h for a more detailed description
   1.727 + * of the compositions list format.
   1.728 + */
   1.729 +int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
   1.730 +    uint16_t key1, firstUnit;  // key1: search key for an entry's first unit; firstUnit: first unit of the current entry
   1.731 +    if(trail<COMP_1_TRAIL_LIMIT) {
   1.732 +        // trail character is 0..33FF
   1.733 +        // result entry may have 2 or 3 units
   1.734 +        key1=(uint16_t)(trail<<1);  // compared below against the first unit's COMP_1_TRAIL_MASK bits
   1.735 +        while(key1>(firstUnit=*list)) {  // presumably ends at the last entry at the latest, whose high bit makes it compare greater - TODO confirm
   1.736 +            list+=2+(firstUnit&COMP_1_TRIPLE);  // skip this entry: 2 units, plus the COMP_1_TRIPLE bit for a triple (presumes the flag value is 1)
   1.737 +        }
   1.738 +        if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
   1.739 +            if(firstUnit&COMP_1_TRIPLE) {
   1.740 +                return ((int32_t)list[1]<<16)|list[2];  // triple: compositeAndFwd spans units 1 and 2
   1.741 +            } else {
   1.742 +                return list[1];  // pair: compositeAndFwd is the second unit
   1.743 +            }
   1.744 +        }
   1.745 +    } else {
   1.746 +        // trail character is 3400..10FFFF
   1.747 +        // result entry has 3 units
   1.748 +        key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
   1.749 +                        (((trail>>COMP_1_TRAIL_SHIFT))&
   1.750 +                          ~COMP_1_TRIPLE));
   1.751 +        uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);  // compared below against the second unit's COMP_2_TRAIL_MASK bits
   1.752 +        uint16_t secondUnit;
   1.753 +        for(;;) {
   1.754 +            if(key1>(firstUnit=*list)) {  // this entry sorts before the trail character: skip it
   1.755 +                list+=2+(firstUnit&COMP_1_TRIPLE);
   1.756 +            } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
   1.757 +                if(key2>(secondUnit=list[1])) {  // first unit matches, but the second unit is still smaller
   1.758 +                    if(firstUnit&COMP_1_LAST_TUPLE) {
   1.759 +                        break;  // that was the last entry: not found
   1.760 +                    } else {
   1.761 +                        list+=3;  // entries in this range always have 3 units
   1.762 +                    }
   1.763 +                } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
   1.764 +                    return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];  // mask the trail bits out of the second unit, then append the third unit
   1.765 +                } else {
   1.766 +                    break;  // passed the sorted position of the trail character: not found
   1.767 +                }
   1.768 +            } else {
   1.769 +                break;  // passed the sorted position of the trail character: not found
   1.770 +            }
   1.771 +        }
   1.772 +    }
   1.773 +    return -1;  // lead and trail do not combine
   1.774 +}
   1.775 +
   1.776 +/** Walks one compositions list and adds every composite found in it to a set.
   1.777 +  * @param list some character's compositions list (pair/triple entries, ending with the entry flagged COMP_1_LAST_TUPLE)
   1.778 +  * @param set recursively receives the composites from these compositions, including those reachable through forward-combining composites
   1.779 +  */
   1.780 +void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
   1.781 +    uint16_t firstUnit;
   1.782 +    int32_t compositeAndFwd;
   1.783 +    do {
   1.784 +        firstUnit=*list;
   1.785 +        if((firstUnit&COMP_1_TRIPLE)==0) {
   1.786 +            compositeAndFwd=list[1];  // pair entry: the second unit is the whole compositeAndFwd
   1.787 +            list+=2;
   1.788 +        } else {
   1.789 +            compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];  // triple entry: drop the trail bits of the second unit
   1.790 +            list+=3;
   1.791 +        }
   1.792 +        UChar32 composite=compositeAndFwd>>1;  // bit 0 is the "combines forward" flag, the rest is the code point
   1.793 +        if((compositeAndFwd&1)!=0) {
   1.794 +            addComposites(getCompositionsListForComposite(getNorm16(composite)), set);  // the composite has its own list: recurse into it
   1.795 +        }
   1.796 +        set.add(composite);
   1.797 +    } while((firstUnit&COMP_1_LAST_TUPLE)==0);  // the flag on the first unit marks the final entry
   1.798 +}
   1.799 +
   1.800 +/*
   1.801 + * Recomposes the buffer text starting at recomposeStartIndex
   1.802 + * (which is in NFD - decomposed and canonically ordered),
   1.803 + * and truncates the buffer contents to the recomposed length (via setReorderingLimit()).
   1.804 + * If onlyContiguous is TRUE, any intervening non-starter blocks further composition (FCC).
   1.805 + * Note that recomposition never lengthens the text:
   1.806 + * Any character consists of either one or two code units;
   1.807 + * a composition may contain at most one more code unit than the original starter,
   1.808 + * while the combining mark that is removed has at least one code unit.
   1.809 + */
   1.810 +void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
   1.811 +                                UBool onlyContiguous) const {
   1.812 +    UChar *p=buffer.getStart()+recomposeStartIndex;
   1.813 +    UChar *limit=buffer.getLimit();
   1.814 +    if(p==limit) {
   1.815 +        return;  // nothing to recompose
   1.816 +    }
   1.817 +
   1.818 +    UChar *starter, *pRemove, *q, *r;  // starter: last forward-combining starter; pRemove: start of the text to delete; q/r: copy destination/source
   1.819 +    const uint16_t *compositionsList;
   1.820 +    UChar32 c, compositeAndFwd;
   1.821 +    uint16_t norm16;
   1.822 +    uint8_t cc, prevCC;
   1.823 +    UBool starterIsSupplementary;
   1.824 +
   1.825 +    // Some of the following variables are not used until we have a forward-combining starter
   1.826 +    // and are only initialized now to avoid compiler warnings.
   1.827 +    compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
   1.828 +    starter=NULL;
   1.829 +    starterIsSupplementary=FALSE;
   1.830 +    prevCC=0;
   1.831 +
   1.832 +    for(;;) {
   1.833 +        UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);  // fetch c and its norm16; p now points past c (see the pRemove computations below)
   1.834 +        cc=getCCFromYesOrMaybe(norm16);
   1.835 +        if( // this character combines backward and
   1.836 +            isMaybe(norm16) &&
   1.837 +            // we have seen a starter that combines forward and
   1.838 +            compositionsList!=NULL &&
   1.839 +            // the backward-combining character is not blocked
   1.840 +            (prevCC<cc || prevCC==0)
   1.841 +        ) {
   1.842 +            if(isJamoVT(norm16)) {
   1.843 +                // c is a Jamo V/T, see if we can compose it with the previous character.
   1.844 +                if(c<Hangul::JAMO_T_BASE) {
   1.845 +                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
   1.846 +                    UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);  // with an unsigned UChar (assumed), the single < test below is a range check
   1.847 +                    if(prev<Hangul::JAMO_L_COUNT) {
   1.848 +                        pRemove=p-1;  // start of the Jamo V (one code unit)
   1.849 +                        UChar syllable=(UChar)
   1.850 +                            (Hangul::HANGUL_BASE+
   1.851 +                             (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
   1.852 +                             Hangul::JAMO_T_COUNT);
   1.853 +                        UChar t;
   1.854 +                        if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
   1.855 +                            ++p;
   1.856 +                            syllable+=t;  // The next character was a Jamo T.
   1.857 +                        }
   1.858 +                        *starter=syllable;  // the Jamo L is overwritten with the composed syllable
   1.859 +                        // remove the Jamo V/T
   1.860 +                        q=pRemove;
   1.861 +                        r=p;
   1.862 +                        while(r<limit) {
   1.863 +                            *q++=*r++;
   1.864 +                        }
   1.865 +                        limit=q;
   1.866 +                        p=pRemove;  // resume at the text that was moved over the removed Jamo
   1.867 +                    }
   1.868 +                }
   1.869 +                /*
   1.870 +                 * No "else" for Jamo T:
   1.871 +                 * Since the input is in NFD, there are no Hangul LV syllables that
   1.872 +                 * a Jamo T could combine with.
   1.873 +                 * All Jamo Ts are combined above when handling Jamo Vs.
   1.874 +                 */
   1.875 +                if(p==limit) {
   1.876 +                    break;
   1.877 +                }
   1.878 +                compositionsList=NULL;  // forget the starter after handling a Jamo V/T
   1.879 +                continue;
   1.880 +            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
   1.881 +                // The starter and the combining mark (c) do combine.
   1.882 +                UChar32 composite=compositeAndFwd>>1;
   1.883 +
   1.884 +                // Replace the starter with the composite, remove the combining mark.
   1.885 +                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
   1.886 +                if(starterIsSupplementary) {
   1.887 +                    if(U_IS_SUPPLEMENTARY(composite)) {
   1.888 +                        // both are supplementary
   1.889 +                        starter[0]=U16_LEAD(composite);
   1.890 +                        starter[1]=U16_TRAIL(composite);
   1.891 +                    } else {
   1.892 +                        *starter=(UChar)composite;
   1.893 +                        // The composite is shorter than the starter,
   1.894 +                        // move the intermediate characters forward one.
   1.895 +                        starterIsSupplementary=FALSE;
   1.896 +                        q=starter+1;
   1.897 +                        r=q+1;
   1.898 +                        while(r<pRemove) {
   1.899 +                            *q++=*r++;
   1.900 +                        }
   1.901 +                        --pRemove;  // the freed unit now sits just before the mark; remove it together with the mark
   1.902 +                    }
   1.903 +                } else if(U_IS_SUPPLEMENTARY(composite)) {
   1.904 +                    // The composite is longer than the starter,
   1.905 +                    // move the intermediate characters back one.
   1.906 +                    starterIsSupplementary=TRUE;
   1.907 +                    ++starter;  // temporarily increment for the loop boundary
   1.908 +                    q=pRemove;
   1.909 +                    r=++pRemove;  // the composite takes one more unit, so one unit less of the mark is removed
   1.910 +                    while(starter<q) {
   1.911 +                        *--r=*--q;
   1.912 +                    }
   1.913 +                    *starter=U16_TRAIL(composite);
   1.914 +                    *--starter=U16_LEAD(composite);  // undo the temporary increment
   1.915 +                } else {
   1.916 +                    // both are on the BMP
   1.917 +                    *starter=(UChar)composite;
   1.918 +                }
   1.919 +
   1.920 +                /* remove the combining mark by moving the following text over it */
   1.921 +                if(pRemove<p) {  // false only when a one-unit mark was fully consumed by the longer composite
   1.922 +                    q=pRemove;
   1.923 +                    r=p;
   1.924 +                    while(r<limit) {
   1.925 +                        *q++=*r++;
   1.926 +                    }
   1.927 +                    limit=q;
   1.928 +                    p=pRemove;
   1.929 +                }
   1.930 +                // Keep prevCC because we removed the combining mark.
   1.931 +
   1.932 +                if(p==limit) {
   1.933 +                    break;
   1.934 +                }
   1.935 +                // Is the composite a starter that combines forward?
   1.936 +                if(compositeAndFwd&1) {
   1.937 +                    compositionsList=
   1.938 +                        getCompositionsListForComposite(getNorm16(composite));
   1.939 +                } else {
   1.940 +                    compositionsList=NULL;
   1.941 +                }
   1.942 +
   1.943 +                // We combined; continue with looking for compositions.
   1.944 +                continue;
   1.945 +            }
   1.946 +        }
   1.947 +
   1.948 +        // no combination this time
   1.949 +        prevCC=cc;
   1.950 +        if(p==limit) {
   1.951 +            break;
   1.952 +        }
   1.953 +
   1.954 +        // If c did not combine, then check if it is a starter.
   1.955 +        if(cc==0) {
   1.956 +            // Found a new starter.
   1.957 +            if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
   1.958 +                // It may combine with something, prepare for it.
   1.959 +                if(U_IS_BMP(c)) {
   1.960 +                    starterIsSupplementary=FALSE;
   1.961 +                    starter=p-1;  // p is already past c: one unit back
   1.962 +                } else {
   1.963 +                    starterIsSupplementary=TRUE;
   1.964 +                    starter=p-2;  // surrogate pair: two units back
   1.965 +                }
   1.966 +            }
   1.967 +        } else if(onlyContiguous) {
   1.968 +            // FCC: no discontiguous compositions; any intervening character blocks.
   1.969 +            compositionsList=NULL;
   1.970 +        }
   1.971 +    }
   1.972 +    buffer.setReorderingLimit(limit);  // hand the (possibly shortened) end of text back to the buffer
   1.973 +}
   1.974 +
   1.975 +UChar32
   1.976 +Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {  // returns the composite of a followed by b, or U_SENTINEL if they do not combine
   1.977 +    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
   1.978 +    const uint16_t *list;  // compositions list of 'a', set on every path that reaches combine() below
   1.979 +    if(isInert(norm16)) {
   1.980 +        return U_SENTINEL;
   1.981 +    } else if(norm16<minYesNoMappingsOnly) {
   1.982 +        if(isJamoL(norm16)) {
   1.983 +            b-=Hangul::JAMO_V_BASE;  // b becomes the Jamo V index; out-of-range values are rejected next
   1.984 +            if(0<=b && b<Hangul::JAMO_V_COUNT) {
   1.985 +                return
   1.986 +                    (Hangul::HANGUL_BASE+
   1.987 +                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
   1.988 +                     Hangul::JAMO_T_COUNT);
   1.989 +            } else {
   1.990 +                return U_SENTINEL;
   1.991 +            }
   1.992 +        } else if(isHangul(norm16)) {
   1.993 +            b-=Hangul::JAMO_T_BASE;  // b becomes the Jamo T index
   1.994 +            if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
   1.995 +                return a+b;  // LV syllable + T index = LVT syllable
  1.996 +            } else {
  1.997 +                return U_SENTINEL;
  1.998 +            }
  1.999 +        } else {
  1.1000 +            // 'a' has a compositions list in extraData
  1.1001 +            list=extraData+norm16;
  1.1002 +            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
  1.1003 +                list+=  // mapping pointer
  1.1004 +                    1+  // +1 to skip the first unit with the mapping length
  1.1005 +                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
  1.1006 +            }
  1.1007 +        }
  1.1008 +    } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
  1.1009 +        return U_SENTINEL;  // norm16 is outside the [minMaybeYes, MIN_NORMAL_MAYBE_YES[ range handled below
  1.1010 +    } else {
  1.1011 +        list=maybeYesCompositions+norm16-minMaybeYes;
  1.1012 +    }
  1.1013 +    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
  1.1014 +        return U_SENTINEL;
  1.1015 +    }
  1.1016 +#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
  1.1017 +    return combine(list, b)>>1;  // an arithmetic shift keeps combine()'s -1 as -1 (presumably ==U_SENTINEL)
  1.1018 +#else
  1.1019 +    int32_t compositeAndFwd=combine(list, b);
  1.1020 +    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;  // do not shift a negative value when the shift is not arithmetic
  1.1021 +#endif
  1.1022 +}
  1.1023 +
  1.1024 +// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
  1.1025 +// doCompose: normalize [src, limit[ and append the result to the buffer
  1.1026 +// !doCompose: isNormalized (buffer must be empty and initialized); returns FALSE as soon as the text is found not normalized
  1.1027 +UBool
  1.1028 +Normalizer2Impl::compose(const UChar *src, const UChar *limit,
  1.1029 +                         UBool onlyContiguous,
  1.1030 +                         UBool doCompose,
  1.1031 +                         ReorderingBuffer &buffer,
  1.1032 +                         UErrorCode &errorCode) const {  // NOTE: a failed buffer append breaks out of the loop and still returns TRUE; callers must check errorCode
  1.1033 +    /*
  1.1034 +     * prevBoundary points to the last character before the current one
  1.1035 +     * that has a composition boundary before it with ccc==0 and quick check "yes".
  1.1036 +     * Keeping track of prevBoundary saves us looking for a composition boundary
  1.1037 +     * when we find a "no" or "maybe".
  1.1038 +     *
  1.1039 +     * When we back out from prevSrc back to prevBoundary,
  1.1040 +     * then we also remove those same characters (which had been simply copied
  1.1041 +     * or canonically-order-inserted) from the ReorderingBuffer.
  1.1042 +     * Therefore, at all times, the [prevBoundary..prevSrc[ source units
  1.1043 +     * must correspond 1:1 to destination units at the end of the destination buffer.
  1.1044 +     */
  1.1045 +    const UChar *prevBoundary=src;
  1.1046 +    UChar32 minNoMaybeCP=minCompNoMaybeCP;
  1.1047 +    if(limit==NULL) {  // NUL-terminated input
  1.1048 +        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
  1.1049 +                                           doCompose ? &buffer : NULL,
  1.1050 +                                           errorCode);
  1.1051 +        if(U_FAILURE(errorCode)) {
  1.1052 +            return FALSE;
  1.1053 +        }
  1.1054 +        if(prevBoundary<src) {
  1.1055 +            // Set prevBoundary to the last character in the prefix.
  1.1056 +            prevBoundary=src-1;
  1.1057 +        }
  1.1058 +        limit=u_strchr(src, 0);  // find the terminator so that the rest of the function has a real limit
  1.1059 +    }
  1.1060 +
  1.1061 +    const UChar *prevSrc;
  1.1062 +    UChar32 c=0;
  1.1063 +    uint16_t norm16=0;
  1.1064 +
  1.1065 +    // only for isNormalized
  1.1066 +    uint8_t prevCC=0;
  1.1067 +
  1.1068 +    for(;;) {
  1.1069 +        // count code units below the minimum or with irrelevant data for the quick check
  1.1070 +        for(prevSrc=src; src!=limit;) {
  1.1071 +            if( (c=*src)<minNoMaybeCP ||
  1.1072 +                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
  1.1073 +            ) {
  1.1074 +                ++src;
  1.1075 +            } else if(!U16_IS_SURROGATE(c)) {
  1.1076 +                break;
  1.1077 +            } else {
  1.1078 +                UChar c2;
  1.1079 +                if(U16_IS_SURROGATE_LEAD(c)) {
  1.1080 +                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
  1.1081 +                        c=U16_GET_SUPPLEMENTARY(c, c2);
  1.1082 +                    }
  1.1083 +                } else /* trail surrogate */ {
  1.1084 +                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
  1.1085 +                        --src;  // back up to the lead surrogate so that src points at the start of c
  1.1086 +                        c=U16_GET_SUPPLEMENTARY(c2, c);
  1.1087 +                    }
  1.1088 +                }
  1.1089 +                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
  1.1090 +                    src+=U16_LENGTH(c);
  1.1091 +                } else {
  1.1092 +                    break;
  1.1093 +                }
  1.1094 +            }
  1.1095 +        }
  1.1096 +        // copy these code units all at once
  1.1097 +        if(src!=prevSrc) {
  1.1098 +            if(doCompose) {
  1.1099 +                if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
  1.1100 +                    break;
  1.1101 +                }
  1.1102 +            } else {
  1.1103 +                prevCC=0;  // the units skipped above were below minNoMaybeCP or passed the "yes && ccc==0" check
  1.1104 +            }
  1.1105 +            if(src==limit) {
  1.1106 +                break;
  1.1107 +            }
  1.1108 +            // Set prevBoundary to the last character in the quick check loop.
  1.1109 +            prevBoundary=src-1;
  1.1110 +            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
  1.1111 +                U16_IS_LEAD(*(prevBoundary-1))
  1.1112 +            ) {
  1.1113 +                --prevBoundary;
  1.1114 +            }
  1.1115 +            // The start of the current character (c).
  1.1116 +            prevSrc=src;
  1.1117 +        } else if(src==limit) {
  1.1118 +            break;
  1.1119 +        }
  1.1120 +
  1.1121 +        src+=U16_LENGTH(c);  // step over c; prevSrc still points at its start
  1.1122 +        /*
  1.1123 +         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
  1.1124 +         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
  1.1125 +         * or has ccc!=0.
  1.1126 +         * Check for Jamo V/T, then for regular characters.
  1.1127 +         * c is not a Hangul syllable or Jamo L because those have "yes" properties.
  1.1128 +         */
  1.1129 +        if(isJamoVT(norm16) && prevBoundary!=prevSrc) {  // needs a preceding character to compose with
  1.1130 +            UChar prev=*(prevSrc-1);
  1.1131 +            UBool needToDecompose=FALSE;
  1.1132 +            if(c<Hangul::JAMO_T_BASE) {
  1.1133 +                // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
  1.1134 +                prev=(UChar)(prev-Hangul::JAMO_L_BASE);
  1.1135 +                if(prev<Hangul::JAMO_L_COUNT) {
  1.1136 +                    if(!doCompose) {
  1.1137 +                        return FALSE;  // isNormalized: L+V would compose, so the input is not normalized
  1.1138 +                    }
  1.1139 +                    UChar syllable=(UChar)
  1.1140 +                        (Hangul::HANGUL_BASE+
  1.1141 +                         (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
  1.1142 +                         Hangul::JAMO_T_COUNT);
  1.1143 +                    UChar t;
  1.1144 +                    if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
  1.1145 +                        ++src;
  1.1146 +                        syllable+=t;  // The next character was a Jamo T.
  1.1147 +                        prevBoundary=src;
  1.1148 +                        buffer.setLastChar(syllable);  // presumably replaces the Jamo L that was already appended
  1.1149 +                        continue;
  1.1150 +                    }
  1.1151 +                    // If we see L+V+x where x!=T then we drop to the slow path,
  1.1152 +                    // decompose and recompose.
  1.1153 +                    // This is to deal with NFKC finding normal L and V but a
  1.1154 +                    // compatibility variant of a T. We need to either fully compose that
  1.1155 +                    // combination here (which would complicate the code and may not work
  1.1156 +                    // with strange custom data) or use the slow path -- or else our replacing
  1.1157 +                    // two input characters (L+V) with one output character (LV syllable)
  1.1158 +                    // would violate the invariant that [prevBoundary..prevSrc[ has the same
  1.1159 +                    // length as what we appended to the buffer since prevBoundary.
  1.1160 +                    needToDecompose=TRUE;
  1.1161 +                }
  1.1162 +            } else if(Hangul::isHangulWithoutJamoT(prev)) {
  1.1163 +                // c is a Jamo Trailing consonant,
  1.1164 +                // compose with previous Hangul LV that does not contain a Jamo T.
  1.1165 +                if(!doCompose) {
  1.1166 +                    return FALSE;
  1.1167 +                }
  1.1168 +                buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
  1.1169 +                prevBoundary=src;
  1.1170 +                continue;
  1.1171 +            }
  1.1172 +            if(!needToDecompose) {
  1.1173 +                // The Jamo V/T did not compose into a Hangul syllable.
  1.1174 +                if(doCompose) {
  1.1175 +                    if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
  1.1176 +                        break;
  1.1177 +                    }
  1.1178 +                } else {
  1.1179 +                    prevCC=0;
  1.1180 +                }
  1.1181 +                continue;
  1.1182 +            }
  1.1183 +        }
  1.1184 +        /*
  1.1185 +         * Source buffer pointers:
  1.1186 +         *
  1.1187 +         *  all done      quick check   current char  not yet
  1.1188 +         *                "yes" but     (c)           processed
  1.1189 +         *                may combine
  1.1190 +         *                forward
  1.1191 +         * [-------------[-------------[-------------[-------------[
  1.1192 +         * |             |             |             |             |
  1.1193 +         * orig. src     prevBoundary  prevSrc       src           limit
  1.1194 +         *
  1.1195 +         *
  1.1196 +         * Destination buffer pointers inside the ReorderingBuffer:
  1.1197 +         *
  1.1198 +         *  all done      might take    not filled yet
  1.1199 +         *                characters for
  1.1200 +         *                reordering
  1.1201 +         * [-------------[-------------[-------------[
  1.1202 +         * |             |             |             |
  1.1203 +         * start         reorderStart  limit         |
  1.1204 +         *                             +remainingCap.+
  1.1205 +         */
  1.1206 +        if(norm16>=MIN_YES_YES_WITH_CC) {
  1.1207 +            uint8_t cc=(uint8_t)norm16;  // cc!=0
  1.1208 +            if( onlyContiguous &&  // FCC
  1.1209 +                (doCompose ? buffer.getLastCC() : prevCC)==0 &&
  1.1210 +                prevBoundary<prevSrc &&
  1.1211 +                // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
  1.1212 +                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
  1.1213 +                // passed the quick check "yes && ccc==0" test.
  1.1214 +                // Check whether the last character was a "yesYes" or a "yesNo".
  1.1215 +                // If a "yesNo", then we get its trailing ccc from its
  1.1216 +                // mapping and check for canonical order.
  1.1217 +                // All other cases are ok.
  1.1218 +                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
  1.1219 +            ) {
  1.1220 +                // Fails FCD test, need to decompose and contiguously recompose.
  1.1221 +                if(!doCompose) {
  1.1222 +                    return FALSE;
  1.1223 +                }
  1.1224 +            } else if(doCompose) {
  1.1225 +                if(!buffer.append(c, cc, errorCode)) {
  1.1226 +                    break;
  1.1227 +                }
  1.1228 +                continue;
  1.1229 +            } else if(prevCC<=cc) {  // isNormalized: combining classes must not decrease
  1.1230 +                prevCC=cc;
  1.1231 +                continue;
  1.1232 +            } else {
  1.1233 +                return FALSE;
  1.1234 +            }
  1.1235 +        } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
  1.1236 +            return FALSE;  // isNormalized: c is neither a "maybe" nor has ccc!=0, so it fails the check outright
  1.1237 +        }
  1.1238 +
  1.1239 +        /*
  1.1240 +         * Find appropriate boundaries around this character,
  1.1241 +         * decompose the source text from between the boundaries,
  1.1242 +         * and recompose it.
  1.1243 +         *
  1.1244 +         * We may need to remove the last few characters from the ReorderingBuffer
  1.1245 +         * to account for source text that was copied or appended
  1.1246 +         * but needs to take part in the recomposition.
  1.1247 +         */
  1.1248 +
  1.1249 +        /*
  1.1250 +         * Find the last composition boundary in [prevBoundary..src[.
  1.1251 +         * It is either the decomposition of the current character (at prevSrc),
  1.1252 +         * or prevBoundary.
  1.1253 +         */
  1.1254 +        if(hasCompBoundaryBefore(c, norm16)) {
  1.1255 +            prevBoundary=prevSrc;
  1.1256 +        } else if(doCompose) {
  1.1257 +            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));  // un-append [prevBoundary..prevSrc[; it is decomposed again below
  1.1258 +        }
  1.1259 +
  1.1260 +        // Find the next composition boundary in [src..limit[ -
  1.1261 +        // modifies src to point to the next starter.
  1.1262 +        src=(UChar *)findNextCompBoundary(src, limit);  // NOTE(review): the (UChar *) cast looks unnecessary since src is a const UChar *
  1.1263 +
  1.1264 +        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
  1.1265 +        int32_t recomposeStartIndex=buffer.length();
  1.1266 +        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
  1.1267 +            break;
  1.1268 +        }
  1.1269 +        recompose(buffer, recomposeStartIndex, onlyContiguous);
  1.1270 +        if(!doCompose) {
  1.1271 +            if(!buffer.equals(prevBoundary, src)) {  // isNormalized: the segment must already equal its recomposed form
  1.1272 +                return FALSE;
  1.1273 +            }
  1.1274 +            buffer.remove();  // empty the scratch buffer again for the next segment
  1.1275 +            prevCC=0;
  1.1276 +        }
  1.1277 +
  1.1278 +        // Move to the next starter. We never need to look back before this point again.
  1.1279 +        prevBoundary=src;
  1.1280 +    }
  1.1281 +    return TRUE;
  1.1282 +}
  1.1283 +
  1.1284 +// Very similar to compose(): Make the same changes in both places if relevant.
  1.1285 +// pQCResult==NULL: spanQuickCheckYes (returns the end of the quick check "yes" span)
  1.1286 +// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES on entry; set to UNORM_MAYBE or UNORM_NO as needed)
  1.1287 +const UChar *
  1.1288 +Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
  1.1289 +                                   UBool onlyContiguous,
  1.1290 +                                   UNormalizationCheckResult *pQCResult) const {
  1.1291 +    /*
  1.1292 +     * prevBoundary points to the last character before the current one
  1.1293 +     * that has a composition boundary before it with ccc==0 and quick check "yes".
  1.1294 +     */
  1.1295 +    const UChar *prevBoundary=src;
  1.1296 +    UChar32 minNoMaybeCP=minCompNoMaybeCP;
  1.1297 +    if(limit==NULL) {  // NUL-terminated input
  1.1298 +        UErrorCode errorCode=U_ZERO_ERROR;  // local only; not inspected after the call below
  1.1299 +        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);  // NULL buffer: presumably only skips the prefix and cannot fail - TODO confirm
  1.1300 +        if(prevBoundary<src) {
  1.1301 +            // Set prevBoundary to the last character in the prefix.
  1.1302 +            prevBoundary=src-1;
  1.1303 +        }
  1.1304 +        limit=u_strchr(src, 0);
  1.1305 +    }
  1.1306 +
  1.1307 +    const UChar *prevSrc;
  1.1308 +    UChar32 c=0;
  1.1309 +    uint16_t norm16=0;
  1.1310 +    uint8_t prevCC=0;
  1.1311 +
  1.1312 +    for(;;) {
  1.1313 +        // count code units below the minimum or with irrelevant data for the quick check
  1.1314 +        for(prevSrc=src;;) {
  1.1315 +            if(src==limit) {
  1.1316 +                return src;  // end of input: no "no" was found (*pQCResult may already be UNORM_MAYBE)
  1.1317 +            }
  1.1318 +            if( (c=*src)<minNoMaybeCP ||
  1.1319 +                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
  1.1320 +            ) {
  1.1321 +                ++src;
  1.1322 +            } else if(!U16_IS_SURROGATE(c)) {
  1.1323 +                break;
  1.1324 +            } else {
  1.1325 +                UChar c2;
  1.1326 +                if(U16_IS_SURROGATE_LEAD(c)) {
  1.1327 +                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
  1.1328 +                        c=U16_GET_SUPPLEMENTARY(c, c2);
  1.1329 +                    }
  1.1330 +                } else /* trail surrogate */ {
  1.1331 +                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
  1.1332 +                        --src;  // back up to the lead surrogate so that src points at the start of c
  1.1333 +                        c=U16_GET_SUPPLEMENTARY(c2, c);
  1.1334 +                    }
  1.1335 +                }
  1.1336 +                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
  1.1337 +                    src+=U16_LENGTH(c);
  1.1338 +                } else {
  1.1339 +                    break;
  1.1340 +                }
  1.1341 +            }
  1.1342 +        }
  1.1343 +        if(src!=prevSrc) {
  1.1344 +            // Set prevBoundary to the last character in the quick check loop.
  1.1345 +            prevBoundary=src-1;
  1.1346 +            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
  1.1347 +                U16_IS_LEAD(*(prevBoundary-1))
  1.1348 +            ) {
  1.1349 +                --prevBoundary;
  1.1350 +            }
  1.1351 +            prevCC=0;
  1.1352 +            // The start of the current character (c).
  1.1353 +            prevSrc=src;
  1.1354 +        }
  1.1355 +
  1.1356 +        src+=U16_LENGTH(c);  // step over c; prevSrc still points at its start
  1.1357 +        /*
  1.1358 +         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
  1.1359 +         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
  1.1360 +         * or has ccc!=0.
  1.1361 +         */
  1.1362 +        if(isMaybeOrNonZeroCC(norm16)) {
  1.1363 +            uint8_t cc=getCCFromYesOrMaybe(norm16);
  1.1364 +            if( onlyContiguous &&  // FCC
  1.1365 +                cc!=0 &&
  1.1366 +                prevCC==0 &&
  1.1367 +                prevBoundary<prevSrc &&
  1.1368 +                // prevCC==0 && prevBoundary<prevSrc tell us that
  1.1369 +                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
  1.1370 +                // passed the quick check "yes && ccc==0" test.
  1.1371 +                // Check whether the last character was a "yesYes" or a "yesNo".
  1.1372 +                // If a "yesNo", then we get its trailing ccc from its
  1.1373 +                // mapping and check for canonical order.
  1.1374 +                // All other cases are ok.
  1.1375 +                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
  1.1376 +            ) {
  1.1377 +                // Fails FCD test: fall through to the UNORM_NO result below.
  1.1378 +            } else if(prevCC<=cc || cc==0) {
  1.1379 +                prevCC=cc;
  1.1380 +                if(norm16<MIN_YES_YES_WITH_CC) {  // not merely a "yes" with ccc!=0, i.e. presumably a "maybe"
  1.1381 +                    if(pQCResult!=NULL) {
  1.1382 +                        *pQCResult=UNORM_MAYBE;
  1.1383 +                    } else {
  1.1384 +                        return prevBoundary;  // spanQuickCheckYes: the "yes" span ends at the last boundary
  1.1385 +                    }
  1.1386 +                }
  1.1387 +                continue;
  1.1388 +            }
  1.1389 +        }
  1.1390 +        if(pQCResult!=NULL) {
  1.1391 +            *pQCResult=UNORM_NO;
  1.1392 +        }
  1.1393 +        return prevBoundary;
  1.1394 +    }
  1.1395 +}
  1.1396 +
  1.1397 +void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
  1.1398 +                                       UBool doCompose,
  1.1399 +                                       UBool onlyContiguous,
  1.1400 +                                       UnicodeString &safeMiddle,
  1.1401 +                                       ReorderingBuffer &buffer,
  1.1402 +                                       UErrorCode &errorCode) const {
  1.1403 +    if(!buffer.isEmpty()) {
  1.1404 +        const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
  1.1405 +        if(src!=firstStarterInSrc) {
  1.1406 +            const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
  1.1407 +                                                                    buffer.getLimit());
  1.1408 +            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
  1.1409 +            UnicodeString middle(lastStarterInDest, destSuffixLength);
  1.1410 +            buffer.removeSuffix(destSuffixLength);
  1.1411 +            safeMiddle=middle;
  1.1412 +            middle.append(src, (int32_t)(firstStarterInSrc-src));
  1.1413 +            const UChar *middleStart=middle.getBuffer();
  1.1414 +            compose(middleStart, middleStart+middle.length(), onlyContiguous,
  1.1415 +                    TRUE, buffer, errorCode);
  1.1416 +            if(U_FAILURE(errorCode)) {
  1.1417 +                return;
  1.1418 +            }
  1.1419 +            src=firstStarterInSrc;
  1.1420 +        }
  1.1421 +    }
  1.1422 +    if(doCompose) {
  1.1423 +        compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
  1.1424 +    } else {
  1.1425 +        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
  1.1426 +            limit=u_strchr(src, 0);
  1.1427 +        }
  1.1428 +        buffer.appendZeroCC(src, limit, errorCode);
  1.1429 +    }
  1.1430 +}
  1.1431 +
  1.1432 +/**
  1.1433 + * Does c have a composition boundary before it?
  1.1434 + * True if its decomposition begins with a character that has
  1.1435 + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
  1.1436 + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
  1.1437 + * (isCompYesAndZeroCC()) so we need not decompose.
  1.1438 + */
  1.1439 +UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
  1.1440 +    for(;;) {
  1.1441 +        if(isCompYesAndZeroCC(norm16)) {
  1.1442 +            return TRUE;
  1.1443 +        } else if(isMaybeOrNonZeroCC(norm16)) {
  1.1444 +            return FALSE;
  1.1445 +        } else if(isDecompNoAlgorithmic(norm16)) {
  1.1446 +            c=mapAlgorithmic(c, norm16);
  1.1447 +            norm16=getNorm16(c);
  1.1448 +        } else {
  1.1449 +            // c decomposes, get everything from the variable-length extra data
  1.1450 +            const uint16_t *mapping=getMapping(norm16);
  1.1451 +            uint16_t firstUnit=*mapping;
  1.1452 +            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
  1.1453 +                return FALSE;
  1.1454 +            }
  1.1455 +            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
  1.1456 +                return FALSE;  // non-zero leadCC
  1.1457 +            }
  1.1458 +            int32_t i=1;  // skip over the firstUnit
  1.1459 +            UChar32 c;
  1.1460 +            U16_NEXT_UNSAFE(mapping, i, c);
  1.1461 +            return isCompYesAndZeroCC(getNorm16(c));
  1.1462 +        }
  1.1463 +    }
  1.1464 +}
  1.1465 +
  1.1466 +UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
  1.1467 +    for(;;) {
  1.1468 +        uint16_t norm16=getNorm16(c);
  1.1469 +        if(isInert(norm16)) {
  1.1470 +            return TRUE;
  1.1471 +        } else if(norm16<=minYesNo) {
  1.1472 +            // Hangul: norm16==minYesNo
  1.1473 +            // Hangul LVT has a boundary after it.
  1.1474 +            // Hangul LV and non-inert yesYes characters combine forward.
  1.1475 +            return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
  1.1476 +        } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
  1.1477 +            return FALSE;
  1.1478 +        } else if(isDecompNoAlgorithmic(norm16)) {
  1.1479 +            c=mapAlgorithmic(c, norm16);
  1.1480 +        } else {
  1.1481 +            // c decomposes, get everything from the variable-length extra data.
  1.1482 +            // If testInert, then c must be a yesNo character which has lccc=0,
  1.1483 +            // otherwise it could be a noNo.
  1.1484 +            const uint16_t *mapping=getMapping(norm16);
  1.1485 +            uint16_t firstUnit=*mapping;
  1.1486 +            // TRUE if
  1.1487 +            //   not MAPPING_NO_COMP_BOUNDARY_AFTER
  1.1488 +            //     (which is set if
  1.1489 +            //       c is not deleted, and
  1.1490 +            //       it and its decomposition do not combine forward, and it has a starter)
  1.1491 +            //   and if FCC then trailCC<=1
  1.1492 +            return
  1.1493 +                (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
  1.1494 +                (!onlyContiguous || firstUnit<=0x1ff);
  1.1495 +        }
  1.1496 +    }
  1.1497 +}
  1.1498 +
  1.1499 +const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
  1.1500 +    BackwardUTrie2StringIterator iter(normTrie, start, p);
  1.1501 +    uint16_t norm16;
  1.1502 +    do {
  1.1503 +        norm16=iter.previous16();
  1.1504 +    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
  1.1505 +    // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
  1.1506 +    // but that's probably not worth the extra cost.
  1.1507 +    return iter.codePointStart;
  1.1508 +}
  1.1509 +
  1.1510 +const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
  1.1511 +    ForwardUTrie2StringIterator iter(normTrie, p, limit);
  1.1512 +    uint16_t norm16;
  1.1513 +    do {
  1.1514 +        norm16=iter.next16();
  1.1515 +    } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
  1.1516 +    return iter.codePointStart;
  1.1517 +}
  1.1518 +
  1.1519 +// Note: normalizer2impl.cpp r30982 (2011-nov-27)
  1.1520 +// still had getFCDTrie() which built and cached an FCD trie.
  1.1521 +// That provided faster access to FCD data than getFCD16FromNormData()
  1.1522 +// but required synchronization and consumed some 10kB of heap memory
  1.1523 +// in any process that uses FCD (e.g., via collation).
  1.1524 +// tccc180[] and smallFCD[] are intended to help with any loss of performance,
  1.1525 +// at least for Latin & CJK.
  1.1526 +
  1.1527 +// Gets the FCD value from the regular normalization data.
  1.1528 +uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
  1.1529 +    // Only loops for 1:1 algorithmic mappings.
  1.1530 +    for(;;) {
  1.1531 +        uint16_t norm16=getNorm16(c);
  1.1532 +        if(norm16<=minYesNo) {
  1.1533 +            // no decomposition or Hangul syllable, all zeros
  1.1534 +            return 0;
  1.1535 +        } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
  1.1536 +            // combining mark
  1.1537 +            norm16&=0xff;
  1.1538 +            return norm16|(norm16<<8);
  1.1539 +        } else if(norm16>=minMaybeYes) {
  1.1540 +            return 0;
  1.1541 +        } else if(isDecompNoAlgorithmic(norm16)) {
  1.1542 +            c=mapAlgorithmic(c, norm16);
  1.1543 +        } else {
  1.1544 +            // c decomposes, get everything from the variable-length extra data
  1.1545 +            const uint16_t *mapping=getMapping(norm16);
  1.1546 +            uint16_t firstUnit=*mapping;
  1.1547 +            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
  1.1548 +                // A character that is deleted (maps to an empty string) must
  1.1549 +                // get the worst-case lccc and tccc values because arbitrary
  1.1550 +                // characters on both sides will become adjacent.
  1.1551 +                return 0x1ff;
  1.1552 +            } else {
  1.1553 +                norm16=firstUnit>>8;  // tccc
  1.1554 +                if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
  1.1555 +                    norm16|=*(mapping-1)&0xff00;  // lccc
  1.1556 +                }
  1.1557 +                return norm16;
  1.1558 +            }
  1.1559 +        }
  1.1560 +    }
  1.1561 +}
  1.1562 +
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Walks [src, limit[ and compares each character's lead combining class (lccc)
// with the trail combining class (tccc) of the text before it.
//   src, limit: the input text; limit==NULL means that src is NUL-terminated.
//   buffer:     if not NULL, text is appended to it; a piece that is out of
//               order is removed again and redone via decomposeShort().
//   errorCode:  checked after copyLowPrefixFromNulTerminated(); the append and
//               decomposeShort() calls also receive it and end the loop when
//               they return FALSE.
// Returns prevBoundary as soon as an out-of-order character is found while
// buffer==NULL; in all other cases returns the position in the input where
// processing stopped.
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // FCD value of the previous character.
    // A negative value is ~c for a code unit c below MIN_CCC_LCCC_CP
    // whose FCD lookup has been deferred (see the inner loop below).
    int32_t prevFCD16=0;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                // tccc>1: the boundary is before that last prefix character, not after it.
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    // Start of the current run of lccc==0 code units; after that run has been
    // handled, the start of the current character (c).
    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the FCD lookup: remember the code unit as a negative value.
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_SURROGATE(c)) {
                    UChar c2;
                    if(U16_IS_SURROGATE_LEAD(c)) {
                        if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                            c=U16_GET_SUPPLEMENTARY(c, c2);
                        }
                    } else /* trail surrogate */ {
                        // Step back to the lead surrogate if it is part of this run.
                        if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                            --src;
                            c=U16_GET_SUPPLEMENTARY(c2, c);
                        }
                    }
                }
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    // lccc==0: c is part of the run.
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    // lccc!=0: end of the run; c is handled after this loop.
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                UChar32 prev=~prevFCD16;
                prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                if(prevFCD16>1) {
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            // Continue after the decomposed piece, which ended at an FCD boundary.
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
  1.1708 +
  1.1709 +void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
  1.1710 +                                       UBool doMakeFCD,
  1.1711 +                                       UnicodeString &safeMiddle,
  1.1712 +                                       ReorderingBuffer &buffer,
  1.1713 +                                       UErrorCode &errorCode) const {
  1.1714 +    if(!buffer.isEmpty()) {
  1.1715 +        const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
  1.1716 +        if(src!=firstBoundaryInSrc) {
  1.1717 +            const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
  1.1718 +                                                                    buffer.getLimit());
  1.1719 +            int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
  1.1720 +            UnicodeString middle(lastBoundaryInDest, destSuffixLength);
  1.1721 +            buffer.removeSuffix(destSuffixLength);
  1.1722 +            safeMiddle=middle;
  1.1723 +            middle.append(src, (int32_t)(firstBoundaryInSrc-src));
  1.1724 +            const UChar *middleStart=middle.getBuffer();
  1.1725 +            makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
  1.1726 +            if(U_FAILURE(errorCode)) {
  1.1727 +                return;
  1.1728 +            }
  1.1729 +            src=firstBoundaryInSrc;
  1.1730 +        }
  1.1731 +    }
  1.1732 +    if(doMakeFCD) {
  1.1733 +        makeFCD(src, limit, &buffer, errorCode);
  1.1734 +    } else {
  1.1735 +        if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
  1.1736 +            limit=u_strchr(src, 0);
  1.1737 +        }
  1.1738 +        buffer.appendZeroCC(src, limit, errorCode);
  1.1739 +    }
  1.1740 +}
  1.1741 +
  1.1742 +const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
  1.1743 +    while(start<p && previousFCD16(start, p)>0xff) {}
  1.1744 +    return p;
  1.1745 +}
  1.1746 +
  1.1747 +const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
  1.1748 +    while(p<limit) {
  1.1749 +        const UChar *codePointStart=p;
  1.1750 +        if(nextFCD16(p, limit)<=0xff) {
  1.1751 +            return codePointStart;
  1.1752 +        }
  1.1753 +    }
  1.1754 +    return p;
  1.1755 +}
  1.1756 +
  1.1757 +// CanonicalIterator data -------------------------------------------------- ***
  1.1758 +
// Opens a new UTrie2 for the per-code point canonical-iterator values and
// creates the vector that holds the start sets.
// The vector is constructed with uprv_deleteUObject as its element deleter
// and with no comparer.
// errorCode receives any failure from utrie2_open() or the vector constructor.
CanonIterData::CanonIterData(UErrorCode &errorCode) :
        trie(utrie2_open(0, 0, &errorCode)),
        canonStartSets(uprv_deleteUObject, NULL, errorCode) {}

// Closes the trie that the constructor opened.
CanonIterData::~CanonIterData() {
    utrie2_close(trie);
}
  1.1766 +
  1.1767 +void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
  1.1768 +    uint32_t canonValue=utrie2_get32(trie, decompLead);
  1.1769 +    if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
  1.1770 +        // origin is the first character whose decomposition starts with
  1.1771 +        // the character for which we are setting the value.
  1.1772 +        utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
  1.1773 +    } else {
  1.1774 +        // origin is not the first character, or it is U+0000.
  1.1775 +        UnicodeSet *set;
  1.1776 +        if((canonValue&CANON_HAS_SET)==0) {
  1.1777 +            set=new UnicodeSet;
  1.1778 +            if(set==NULL) {
  1.1779 +                errorCode=U_MEMORY_ALLOCATION_ERROR;
  1.1780 +                return;
  1.1781 +            }
  1.1782 +            UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
  1.1783 +            canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
  1.1784 +            utrie2_set32(trie, decompLead, canonValue, &errorCode);
  1.1785 +            canonStartSets.addElement(set, errorCode);
  1.1786 +            if(firstOrigin!=0) {
  1.1787 +                set->add(firstOrigin);
  1.1788 +            }
  1.1789 +        } else {
  1.1790 +            set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
  1.1791 +        }
  1.1792 +        set->add(origin);
  1.1793 +    }
  1.1794 +}
  1.1795 +
  1.1796 +U_CDECL_BEGIN
  1.1797 +
  1.1798 +// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
  1.1799 +//     context: the Normalizer2Impl
  1.1800 +static UBool U_CALLCONV
  1.1801 +enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
  1.1802 +    UErrorCode errorCode = U_ZERO_ERROR;
  1.1803 +    if (value != 0) {
  1.1804 +        Normalizer2Impl *impl = (Normalizer2Impl *)context;
  1.1805 +        impl->makeCanonIterDataFromNorm16(
  1.1806 +            start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
  1.1807 +    }
  1.1808 +    return U_SUCCESS(errorCode);
  1.1809 +}
  1.1810 +
  1.1811 +
  1.1812 +
  1.1813 +// UInitOnce instantiation function for CanonIterData
  1.1814 +
  1.1815 +static void U_CALLCONV 
  1.1816 +initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
  1.1817 +    U_ASSERT(impl->fCanonIterData == NULL);
  1.1818 +    impl->fCanonIterData = new CanonIterData(errorCode);
  1.1819 +    if (impl->fCanonIterData == NULL) {
  1.1820 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
  1.1821 +    }
  1.1822 +    if (U_SUCCESS(errorCode)) {
  1.1823 +        utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
  1.1824 +        utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
  1.1825 +    }
  1.1826 +    if (U_FAILURE(errorCode)) {
  1.1827 +        delete impl->fCanonIterData;
  1.1828 +        impl->fCanonIterData = NULL;
  1.1829 +    }
  1.1830 +}
  1.1831 +
  1.1832 +U_CDECL_END
  1.1833 +
  1.1834 +void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
  1.1835 +                                                  CanonIterData &newData,
  1.1836 +                                                  UErrorCode &errorCode) const {
  1.1837 +    if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
  1.1838 +        // Inert, or 2-way mapping (including Hangul syllable).
  1.1839 +        // We do not write a canonStartSet for any yesNo character.
  1.1840 +        // Composites from 2-way mappings are added at runtime from the
  1.1841 +        // starter's compositions list, and the other characters in
  1.1842 +        // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
  1.1843 +        // "maybe" characters.
  1.1844 +        return;
  1.1845 +    }
  1.1846 +    for(UChar32 c=start; c<=end; ++c) {
  1.1847 +        uint32_t oldValue=utrie2_get32(newData.trie, c);
  1.1848 +        uint32_t newValue=oldValue;
  1.1849 +        if(norm16>=minMaybeYes) {
  1.1850 +            // not a segment starter if it occurs in a decomposition or has cc!=0
  1.1851 +            newValue|=CANON_NOT_SEGMENT_STARTER;
  1.1852 +            if(norm16<MIN_NORMAL_MAYBE_YES) {
  1.1853 +                newValue|=CANON_HAS_COMPOSITIONS;
  1.1854 +            }
  1.1855 +        } else if(norm16<minYesNo) {
  1.1856 +            newValue|=CANON_HAS_COMPOSITIONS;
  1.1857 +        } else {
  1.1858 +            // c has a one-way decomposition
  1.1859 +            UChar32 c2=c;
  1.1860 +            uint16_t norm16_2=norm16;
  1.1861 +            while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
  1.1862 +                c2=mapAlgorithmic(c2, norm16_2);
  1.1863 +                norm16_2=getNorm16(c2);
  1.1864 +            }
  1.1865 +            if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
  1.1866 +                // c decomposes, get everything from the variable-length extra data
  1.1867 +                const uint16_t *mapping=getMapping(norm16_2);
  1.1868 +                uint16_t firstUnit=*mapping;
  1.1869 +                int32_t length=firstUnit&MAPPING_LENGTH_MASK;
  1.1870 +                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
  1.1871 +                    if(c==c2 && (*(mapping-1)&0xff)!=0) {
  1.1872 +                        newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
  1.1873 +                    }
  1.1874 +                }
  1.1875 +                // Skip empty mappings (no characters in the decomposition).
  1.1876 +                if(length!=0) {
  1.1877 +                    ++mapping;  // skip over the firstUnit
  1.1878 +                    // add c to first code point's start set
  1.1879 +                    int32_t i=0;
  1.1880 +                    U16_NEXT_UNSAFE(mapping, i, c2);
  1.1881 +                    newData.addToStartSet(c, c2, errorCode);
  1.1882 +                    // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
  1.1883 +                    // one-way mapping. A 2-way mapping is possible here after
  1.1884 +                    // intermediate algorithmic mapping.
  1.1885 +                    if(norm16_2>=minNoNo) {
  1.1886 +                        while(i<length) {
  1.1887 +                            U16_NEXT_UNSAFE(mapping, i, c2);
  1.1888 +                            uint32_t c2Value=utrie2_get32(newData.trie, c2);
  1.1889 +                            if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
  1.1890 +                                utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
  1.1891 +                                             &errorCode);
  1.1892 +                            }
  1.1893 +                        }
  1.1894 +                    }
  1.1895 +                }
  1.1896 +            } else {
  1.1897 +                // c decomposed to c2 algorithmically; c has cc==0
  1.1898 +                newData.addToStartSet(c, c2, errorCode);
  1.1899 +            }
  1.1900 +        }
  1.1901 +        if(newValue!=oldValue) {
  1.1902 +            utrie2_set32(newData.trie, c, newValue, &errorCode);
  1.1903 +        }
  1.1904 +    }
  1.1905 +}
  1.1906 +
  1.1907 +UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
  1.1908 +    // Logically const: Synchronized instantiation.
  1.1909 +    Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
  1.1910 +    umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
  1.1911 +    return U_SUCCESS(errorCode);
  1.1912 +}
  1.1913 +
  1.1914 +int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
  1.1915 +    return (int32_t)utrie2_get32(fCanonIterData->trie, c);
  1.1916 +}
  1.1917 +
  1.1918 +const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
  1.1919 +    return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
  1.1920 +}
  1.1921 +
  1.1922 +UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
  1.1923 +    return getCanonValue(c)>=0;
  1.1924 +}
  1.1925 +
  1.1926 +UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
  1.1927 +    int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
  1.1928 +    if(canonValue==0) {
  1.1929 +        return FALSE;
  1.1930 +    }
  1.1931 +    set.clear();
  1.1932 +    int32_t value=canonValue&CANON_VALUE_MASK;
  1.1933 +    if((canonValue&CANON_HAS_SET)!=0) {
  1.1934 +        set.addAll(getCanonStartSet(value));
  1.1935 +    } else if(value!=0) {
  1.1936 +        set.add(value);
  1.1937 +    }
  1.1938 +    if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
  1.1939 +        uint16_t norm16=getNorm16(c);
  1.1940 +        if(norm16==JAMO_L) {
  1.1941 +            UChar32 syllable=
  1.1942 +                (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
  1.1943 +            set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
  1.1944 +        } else {
  1.1945 +            addComposites(getCompositionsList(norm16), set);
  1.1946 +        }
  1.1947 +    }
  1.1948 +    return TRUE;
  1.1949 +}
  1.1950 +
  1.1951 +U_NAMESPACE_END
  1.1952 +
  1.1953 +// Normalizer2 data swapping ----------------------------------------------- ***
  1.1954 +
  1.1955 +U_NAMESPACE_USE
  1.1956 +
  1.1957 +U_CAPI int32_t U_EXPORT2
  1.1958 +unorm2_swap(const UDataSwapper *ds,
  1.1959 +            const void *inData, int32_t length, void *outData,
  1.1960 +            UErrorCode *pErrorCode) {
  1.1961 +    const UDataInfo *pInfo;
  1.1962 +    int32_t headerSize;
  1.1963 +
  1.1964 +    const uint8_t *inBytes;
  1.1965 +    uint8_t *outBytes;
  1.1966 +
  1.1967 +    const int32_t *inIndexes;
  1.1968 +    int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
  1.1969 +
  1.1970 +    int32_t i, offset, nextOffset, size;
  1.1971 +
  1.1972 +    /* udata_swapDataHeader checks the arguments */
  1.1973 +    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
  1.1974 +    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
  1.1975 +        return 0;
  1.1976 +    }
  1.1977 +
  1.1978 +    /* check data format and format version */
  1.1979 +    pInfo=(const UDataInfo *)((const char *)inData+4);
  1.1980 +    if(!(
  1.1981 +        pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
  1.1982 +        pInfo->dataFormat[1]==0x72 &&
  1.1983 +        pInfo->dataFormat[2]==0x6d &&
  1.1984 +        pInfo->dataFormat[3]==0x32 &&
  1.1985 +        (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
  1.1986 +    )) {
  1.1987 +        udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
  1.1988 +                         pInfo->dataFormat[0], pInfo->dataFormat[1],
  1.1989 +                         pInfo->dataFormat[2], pInfo->dataFormat[3],
  1.1990 +                         pInfo->formatVersion[0]);
  1.1991 +        *pErrorCode=U_UNSUPPORTED_ERROR;
  1.1992 +        return 0;
  1.1993 +    }
  1.1994 +
  1.1995 +    inBytes=(const uint8_t *)inData+headerSize;
  1.1996 +    outBytes=(uint8_t *)outData+headerSize;
  1.1997 +
  1.1998 +    inIndexes=(const int32_t *)inBytes;
  1.1999 +
  1.2000 +    if(length>=0) {
  1.2001 +        length-=headerSize;
  1.2002 +        if(length<(int32_t)sizeof(indexes)) {
  1.2003 +            udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
  1.2004 +                             length);
  1.2005 +            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1.2006 +            return 0;
  1.2007 +        }
  1.2008 +    }
  1.2009 +
  1.2010 +    /* read the first few indexes */
  1.2011 +    for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
  1.2012 +        indexes[i]=udata_readInt32(ds, inIndexes[i]);
  1.2013 +    }
  1.2014 +
  1.2015 +    /* get the total length of the data */
  1.2016 +    size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
  1.2017 +
  1.2018 +    if(length>=0) {
  1.2019 +        if(length<size) {
  1.2020 +            udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
  1.2021 +                             length);
  1.2022 +            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1.2023 +            return 0;
  1.2024 +        }
  1.2025 +
  1.2026 +        /* copy the data for inaccessible bytes */
  1.2027 +        if(inBytes!=outBytes) {
  1.2028 +            uprv_memcpy(outBytes, inBytes, size);
  1.2029 +        }
  1.2030 +
  1.2031 +        offset=0;
  1.2032 +
  1.2033 +        /* swap the int32_t indexes[] */
  1.2034 +        nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
  1.2035 +        ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
  1.2036 +        offset=nextOffset;
  1.2037 +
  1.2038 +        /* swap the UTrie2 */
  1.2039 +        nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
  1.2040 +        utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
  1.2041 +        offset=nextOffset;
  1.2042 +
  1.2043 +        /* swap the uint16_t extraData[] */
  1.2044 +        nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
  1.2045 +        ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
  1.2046 +        offset=nextOffset;
  1.2047 +
  1.2048 +        /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
  1.2049 +        nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
  1.2050 +        offset=nextOffset;
  1.2051 +
  1.2052 +        U_ASSERT(offset==size);
  1.2053 +    }
  1.2054 +
  1.2055 +    return headerSize+size;
  1.2056 +}
  1.2057 +
  1.2058 +#endif  // !UCONFIG_NO_NORMALIZATION

mercurial