1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/normalizer2impl.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2055 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2009-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: normalizer2impl.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2009nov22 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 + 1.22 +#if !UCONFIG_NO_NORMALIZATION 1.23 + 1.24 +#include "unicode/normalizer2.h" 1.25 +#include "unicode/udata.h" 1.26 +#include "unicode/ustring.h" 1.27 +#include "unicode/utf16.h" 1.28 +#include "cmemory.h" 1.29 +#include "mutex.h" 1.30 +#include "normalizer2impl.h" 1.31 +#include "putilimp.h" 1.32 +#include "uassert.h" 1.33 +#include "uset_imp.h" 1.34 +#include "utrie2.h" 1.35 +#include "uvector.h" 1.36 + 1.37 +U_NAMESPACE_BEGIN 1.38 + 1.39 +// ReorderingBuffer -------------------------------------------------------- *** 1.40 + 1.41 +UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { 1.42 + int32_t length=str.length(); 1.43 + start=str.getBuffer(destCapacity); 1.44 + if(start==NULL) { 1.45 + // getBuffer() already did str.setToBogus() 1.46 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.47 + return FALSE; 1.48 + } 1.49 + limit=start+length; 1.50 + remainingCapacity=str.getCapacity()-length; 1.51 + reorderStart=start; 1.52 + if(start==limit) { 1.53 + lastCC=0; 1.54 + } else { 1.55 + setIterator(); 1.56 + lastCC=previousCC(); 1.57 + // Set reorderStart after the last code point with cc<=1 if there is one. 
1.58 + if(lastCC>1) { 1.59 + while(previousCC()>1) {} 1.60 + } 1.61 + reorderStart=codePointLimit; 1.62 + } 1.63 + return TRUE; 1.64 +} 1.65 + 1.66 +UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { 1.67 + int32_t length=(int32_t)(limit-start); 1.68 + return 1.69 + length==(int32_t)(otherLimit-otherStart) && 1.70 + 0==u_memcmp(start, otherStart, length); 1.71 +} 1.72 + 1.73 +UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 1.74 + if(remainingCapacity<2 && !resize(2, errorCode)) { 1.75 + return FALSE; 1.76 + } 1.77 + if(lastCC<=cc || cc==0) { 1.78 + limit[0]=U16_LEAD(c); 1.79 + limit[1]=U16_TRAIL(c); 1.80 + limit+=2; 1.81 + lastCC=cc; 1.82 + if(cc<=1) { 1.83 + reorderStart=limit; 1.84 + } 1.85 + } else { 1.86 + insert(c, cc); 1.87 + } 1.88 + remainingCapacity-=2; 1.89 + return TRUE; 1.90 +} 1.91 + 1.92 +UBool ReorderingBuffer::append(const UChar *s, int32_t length, 1.93 + uint8_t leadCC, uint8_t trailCC, 1.94 + UErrorCode &errorCode) { 1.95 + if(length==0) { 1.96 + return TRUE; 1.97 + } 1.98 + if(remainingCapacity<length && !resize(length, errorCode)) { 1.99 + return FALSE; 1.100 + } 1.101 + remainingCapacity-=length; 1.102 + if(lastCC<=leadCC || leadCC==0) { 1.103 + if(trailCC<=1) { 1.104 + reorderStart=limit+length; 1.105 + } else if(leadCC<=1) { 1.106 + reorderStart=limit+1; // Ok if not a code point boundary. 1.107 + } 1.108 + const UChar *sLimit=s+length; 1.109 + do { *limit++=*s++; } while(s!=sLimit); 1.110 + lastCC=trailCC; 1.111 + } else { 1.112 + int32_t i=0; 1.113 + UChar32 c; 1.114 + U16_NEXT(s, i, length, c); 1.115 + insert(c, leadCC); // insert first code point 1.116 + while(i<length) { 1.117 + U16_NEXT(s, i, length, c); 1.118 + if(i<length) { 1.119 + // s must be in NFD, otherwise we need to use getCC(). 
1.120 + leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 1.121 + } else { 1.122 + leadCC=trailCC; 1.123 + } 1.124 + append(c, leadCC, errorCode); 1.125 + } 1.126 + } 1.127 + return TRUE; 1.128 +} 1.129 + 1.130 +UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { 1.131 + int32_t cpLength=U16_LENGTH(c); 1.132 + if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { 1.133 + return FALSE; 1.134 + } 1.135 + remainingCapacity-=cpLength; 1.136 + if(cpLength==1) { 1.137 + *limit++=(UChar)c; 1.138 + } else { 1.139 + limit[0]=U16_LEAD(c); 1.140 + limit[1]=U16_TRAIL(c); 1.141 + limit+=2; 1.142 + } 1.143 + lastCC=0; 1.144 + reorderStart=limit; 1.145 + return TRUE; 1.146 +} 1.147 + 1.148 +UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { 1.149 + if(s==sLimit) { 1.150 + return TRUE; 1.151 + } 1.152 + int32_t length=(int32_t)(sLimit-s); 1.153 + if(remainingCapacity<length && !resize(length, errorCode)) { 1.154 + return FALSE; 1.155 + } 1.156 + u_memcpy(limit, s, length); 1.157 + limit+=length; 1.158 + remainingCapacity-=length; 1.159 + lastCC=0; 1.160 + reorderStart=limit; 1.161 + return TRUE; 1.162 +} 1.163 + 1.164 +void ReorderingBuffer::remove() { 1.165 + reorderStart=limit=start; 1.166 + remainingCapacity=str.getCapacity(); 1.167 + lastCC=0; 1.168 +} 1.169 + 1.170 +void ReorderingBuffer::removeSuffix(int32_t suffixLength) { 1.171 + if(suffixLength<(limit-start)) { 1.172 + limit-=suffixLength; 1.173 + remainingCapacity+=suffixLength; 1.174 + } else { 1.175 + limit=start; 1.176 + remainingCapacity=str.getCapacity(); 1.177 + } 1.178 + lastCC=0; 1.179 + reorderStart=limit; 1.180 +} 1.181 + 1.182 +UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { 1.183 + int32_t reorderStartIndex=(int32_t)(reorderStart-start); 1.184 + int32_t length=(int32_t)(limit-start); 1.185 + str.releaseBuffer(length); 1.186 + int32_t newCapacity=length+appendLength; 1.187 + int32_t 
doubleCapacity=2*str.getCapacity(); 1.188 + if(newCapacity<doubleCapacity) { 1.189 + newCapacity=doubleCapacity; 1.190 + } 1.191 + if(newCapacity<256) { 1.192 + newCapacity=256; 1.193 + } 1.194 + start=str.getBuffer(newCapacity); 1.195 + if(start==NULL) { 1.196 + // getBuffer() already did str.setToBogus() 1.197 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.198 + return FALSE; 1.199 + } 1.200 + reorderStart=start+reorderStartIndex; 1.201 + limit=start+length; 1.202 + remainingCapacity=str.getCapacity()-length; 1.203 + return TRUE; 1.204 +} 1.205 + 1.206 +void ReorderingBuffer::skipPrevious() { 1.207 + codePointLimit=codePointStart; 1.208 + UChar c=*--codePointStart; 1.209 + if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { 1.210 + --codePointStart; 1.211 + } 1.212 +} 1.213 + 1.214 +uint8_t ReorderingBuffer::previousCC() { 1.215 + codePointLimit=codePointStart; 1.216 + if(reorderStart>=codePointStart) { 1.217 + return 0; 1.218 + } 1.219 + UChar32 c=*--codePointStart; 1.220 + if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { 1.221 + return 0; 1.222 + } 1.223 + 1.224 + UChar c2; 1.225 + if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { 1.226 + --codePointStart; 1.227 + c=U16_GET_SUPPLEMENTARY(c2, c); 1.228 + } 1.229 + return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); 1.230 +} 1.231 + 1.232 +// Inserts c somewhere before the last character. 1.233 +// Requires 0<cc<lastCC which implies reorderStart<limit. 
// Walks backward from the end of the buffer, skipping the last character,
// until it finds a character whose combining class is not greater than cc,
// then shifts the following code units up and writes c into the gap.
// The caller must already have made sure that there is room for c.
void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
    for(setIterator(), skipPrevious(); previousCC()>cc;) {}
    // insert c at codePointLimit, after the character with prevCC<=cc
    UChar *q=limit;
    UChar *r=limit+=U16_LENGTH(c);
    do {
        *--r=*--q;
    } while(codePointLimit!=q);
    writeCodePoint(q, c);
    if(cc<=1) {
        reorderStart=r;
    }
}

// Normalizer2Impl --------------------------------------------------------- ***

// Data for the canonical iterator: a trie plus the start sets that it refers to.
// The member functions are defined elsewhere in this file.
struct CanonIterData : public UMemory {
    CanonIterData(UErrorCode &errorCode);
    ~CanonIterData();
    void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
    UTrie2 *trie;
    UVector canonStartSets;  // contains UnicodeSet *
};

// Releases the data memory, the normalization trie and the canonical iterator data.
Normalizer2Impl::~Normalizer2Impl() {
    udata_close(memory);
    utrie2_close(normTrie);
    delete fCanonIterData;
}

// Callback passed to udata_openChoice() by load(); context is the Normalizer2Impl.
// Accepts data with matching endianness and charset family,
// dataFormat "Nrm2" and formatVersion[0]==2.
// On acceptance, copies the data version into the Normalizer2Impl.
UBool U_CALLCONV
Normalizer2Impl::isAcceptable(void *context,
                              const char * /* type */, const char * /*name*/,
                              const UDataInfo *pInfo) {
    if(
        pInfo->size>=20 &&
        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
        pInfo->charsetFamily==U_CHARSET_FAMILY &&
        pInfo->dataFormat[0]==0x4e &&    /* dataFormat="Nrm2" */
        pInfo->dataFormat[1]==0x72 &&
        pInfo->dataFormat[2]==0x6d &&
        pInfo->dataFormat[3]==0x32 &&
        pInfo->formatVersion[0]==2
    ) {
        Normalizer2Impl *me=(Normalizer2Impl *)context;
        uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
        return TRUE;
    } else {
        return FALSE;
    }
}

// Loads the normalization data item "<name>.nrm" from the given package.
// Reads the thresholds from the index array, opens the serialized trie,
// sets the pointers into the extra data and smallFCD, and builds tccc180[].
// Does nothing if errorCode already indicates a failure on entry.
// Sets U_INVALID_FORMAT_ERROR if the index array is too short.
void
Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
    const int32_t *inIndexes=(const int32_t *)inBytes;
    // The trie follows the int32_t indexes, so its byte offset gives the number of indexes.
    int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
    if(indexesLength<=IX_MIN_MAYBE_YES) {
        errorCode=U_INVALID_FORMAT_ERROR;  // Not enough indexes.
        return;
    }

    minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
    minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];

    minYesNo=inIndexes[IX_MIN_YES_NO];
    minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
    minNoNo=inIndexes[IX_MIN_NO_NO];
    limitNoNo=inIndexes[IX_LIMIT_NO_NO];
    minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];

    // Each section runs from its own offset to the offset of the next section.
    int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
    int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
    normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
                                       inBytes+offset, nextOffset-offset, NULL,
                                       &errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }

    offset=nextOffset;
    nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
    maybeYesCompositions=(const uint16_t *)(inBytes+offset);
    extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);

    // smallFCD: new in formatVersion 2
    offset=nextOffset;
    smallFCD=inBytes+offset;

    // Build tccc180[].
    // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
    // Each smallFCD bit covers 0x20 code points; the loop advances c by 0x20
    // per bit in either branch.
    uint8_t bits=0;
    for(UChar c=0; c<0x180; bits>>=1) {
        if((c&0xff)==0) {
            bits=smallFCD[c>>8];  // one byte per 0x100 code points
        }
        if(bits&1) {
            for(int i=0; i<0x20; ++i, ++c) {
                tccc180[c]=(uint8_t)getFCD16FromNormData(c);
            }
        } else {
            uprv_memset(tccc180+c, 0, 0x20);
            c+=0x20;
        }
    }
}

// Returns the trailing combining class of the one code point in [cpStart, cpLimit[.
// The range is read as a single code unit if it is one unit long,
// otherwise as a surrogate pair.
uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
    UChar32 c;
    if(cpStart==(cpLimit-1)) {
        c=*cpStart;
    } else {
        c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
    }
    uint16_t prevNorm16=getNorm16(c);
    if(prevNorm16<=minYesNo) {
        return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
    } else {
        return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
    }
}

U_CDECL_BEGIN

// utrie2_enum() range callback: context is a USetAdder.
// Adds the start of each range to the set and always continues the enumeration.
static UBool U_CALLCONV
enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    /* add the start code point to the USet */
    const USetAdder *sa=(const USetAdder *)context;
    sa->add(sa->set, start);
    return TRUE;
}

// utrie2_enum() value callback: reduces each trie value to its
// CANON_NOT_SEGMENT_STARTER bit so that ranges are split only where that bit changes.
static uint32_t U_CALLCONV
segmentStarterMapper(const void * /*context*/, uint32_t value) {
    return value&CANON_NOT_SEGMENT_STARTER;
}

U_CDECL_END

// Adds to sa the start code points of all ranges with the same normalization
// trie value, plus the Hangul syllable boundaries listed below.
void
Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
    /* add the start code point of each same-value range of each trie */
    utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);

    /* add Hangul LV syllables and LV+1 because of skippables */
    for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
        sa->add(sa->set, c);
        sa->add(sa->set, c+1);
    }
    sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
}

// Adds to sa the range starts of the canonical iterator data trie.
// Adds nothing if ensureCanonIterData() returns FALSE.
void
Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
    /* add the start code point of each same-value range of the canonical iterator data trie */
    if(ensureCanonIterData(errorCode)) {
        // currently only used for the SEGMENT_STARTER property
        utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
    }
}

// Skips the prefix of the NUL-terminated string src that consists of
// non-NUL code units below minNeedDataCP.
// If buffer is not NULL, the prefix is appended to it.
// Returns a pointer to the first code unit after the prefix.
// The result of appendZeroCC() is not checked here; callers check errorCode.
const UChar *
Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
                                                UChar32 minNeedDataCP,
                                                ReorderingBuffer *buffer,
                                                UErrorCode &errorCode) const {
    // Make some effort to support NUL-terminated strings reasonably.
    // Take the part of the fast quick check loop that does not look up
    // data and check the first part of the string.
    // After this prefix, determine the string length to simplify the rest
    // of the code.
    const UChar *prevSrc=src;
    UChar c;
    while((c=*src++)<minNeedDataCP && c!=0) {}
    // Back out the last character for full processing.
    // Copy this prefix.
    if(--src!=prevSrc) {
        if(buffer!=NULL) {
            buffer->appendZeroCC(prevSrc, src, errorCode);
        }
    }
    return src;
}

// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/spanQuickCheckYes
//
// limit may be NULL for a NUL-terminated src.
// With a buffer: returns the position up to which src was processed, which is
// short of limit only if appending to the buffer failed.
// Without a buffer: returns limit if everything passed the quick check,
// otherwise the last boundary (prevBoundary) before the failing code point.
const UChar *
Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
                           ReorderingBuffer *buffer,
                           UErrorCode &errorCode) const {
    UChar32 minNoCP=minDecompNoCP;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for quick check
    const UChar *prevBoundary=src;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoCP ||
                isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // Assemble a supplementary code point if the surrogate is paired,
                // and look up its data again.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL) {
                if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                prevCC=0;
                prevBoundary=src;
            }
        }
        if(src==limit) {
            break;
        }

        // Check one above-minimum, relevant code point.
        src+=U16_LENGTH(c);
        if(buffer!=NULL) {
            if(!decompose(c, norm16, *buffer, errorCode)) {
                break;
            }
        } else {
            if(isDecompYes(norm16)) {
                uint8_t cc=getCCFromYesOrMaybe(norm16);
                if(prevCC<=cc || cc==0) {
                    prevCC=cc;
                    if(cc<=1) {
                        prevBoundary=src;
                    }
                    continue;
                }
            }
            return prevBoundary;  // "no" or cc out of order
        }
    }
    return src;
}

// Decompose a short piece of text which is likely to contain characters that
// fail the quick check loop and/or where the quick check loop's overhead
// is unlikely to be amortized.
// Called by the compose() and makeFCD() implementations.
// Returns FALSE as soon as decomposing one code point into the buffer fails.
UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
                                      ReorderingBuffer &buffer,
                                      UErrorCode &errorCode) const {
    while(src<limit) {
        UChar32 c;
        uint16_t norm16;
        UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
        if(!decompose(c, norm16, buffer, errorCode)) {
            return FALSE;
        }
    }
    return TRUE;
}

// Appends the decomposition of the single code point c, whose trie value is
// norm16, to the buffer. Returns the result of the buffer append call.
UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
                                 ReorderingBuffer &buffer,
                                 UErrorCode &errorCode) const {
    // Only loops for 1:1 algorithmic mappings.
    for(;;) {
        // get the decomposition and the lead and trail cc's
        if(isDecompYes(norm16)) {
            // c does not decompose
            return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
        } else if(isHangul(norm16)) {
            // Hangul syllable: decompose algorithmically
            UChar jamos[3];
            return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
            norm16=getNorm16(c);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            int32_t length=firstUnit&MAPPING_LENGTH_MASK;
            uint8_t leadCC, trailCC;
            trailCC=(uint8_t)(firstUnit>>8);
            if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
                // The optional ccc/lccc word precedes the first unit.
                leadCC=(uint8_t)(*(mapping-1)>>8);
            } else {
                leadCC=0;
            }
            return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
        }
    }
}

// Returns the decomposition of c, or NULL if c does not decompose.
// On a non-NULL return, length is set to the number of code units.
// The result points either into the caller's buffer (Hangul and algorithmic
// mappings) or into the extra data.
const UChar *
Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
    const UChar *decomp=NULL;
    uint16_t norm16;
    for(;;) {
        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
            // c does not decompose
            // decomp is NULL unless an algorithmic mapping was applied in an earlier iteration.
            return decomp;
        } else if(isHangul(norm16)) {
            // Hangul syllable: decompose algorithmically
            length=Hangul::decompose(c, buffer);
            return buffer;
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
            decomp=buffer;
            length=0;
            U16_APPEND_UNSAFE(buffer, length, c);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            length=*mapping&MAPPING_LENGTH_MASK;
            return (const UChar *)mapping+1;
        }
    }
}

// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
// so that a raw mapping fits that consists of one unit ("rm0")
// plus all but the first two code units of the normal mapping.
// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
//
// Returns the raw (non-recursive) decomposition of c, or NULL if c does not
// decompose. On a non-NULL return, length is set to the number of code units.
const UChar *
Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
    // We do not loop in this method because an algorithmic mapping itself
    // becomes a final result rather than having to be decomposed recursively.
    uint16_t norm16;
    if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
        // c does not decompose
        return NULL;
    } else if(isHangul(norm16)) {
        // Hangul syllable: decompose algorithmically
        Hangul::getRawDecomposition(c, buffer);
        length=2;
        return buffer;
    } else if(isDecompNoAlgorithmic(norm16)) {
        c=mapAlgorithmic(c, norm16);
        length=0;
        U16_APPEND_UNSAFE(buffer, length, c);
        return buffer;
    } else {
        // c decomposes, get everything from the variable-length extra data
        const uint16_t *mapping=getMapping(norm16);
        uint16_t firstUnit=*mapping;
        int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
        if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
            // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
            // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
            const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
            uint16_t rm0=*rawMapping;
            if(rm0<=MAPPING_LENGTH_MASK) {
                // rm0 is the length of a raw mapping that is stored before rm0.
                length=rm0;
                return (const UChar *)rawMapping-rm0;
            } else {
                // Copy the normal mapping and replace its first two code units with rm0.
                buffer[0]=(UChar)rm0;
                u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
                length=mLength-1;
                return buffer;
            }
        } else {
            length=mLength;
            return (const UChar *)mapping+1;
        }
    }
}

// Appends [src, limit[ to the buffer; limit may be NULL for a NUL-terminated src.
// First saves the buffer's reorderable suffix into safeMiddle.
// doDecompose: decompose src while appending.
// !doDecompose: append src as is, merging only its leading run of characters
// with nonzero combining class into the existing buffer contents.
void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
                                         UBool doDecompose,
                                         UnicodeString &safeMiddle,
                                         ReorderingBuffer &buffer,
                                         UErrorCode &errorCode) const {
    buffer.copyReorderableSuffixTo(safeMiddle);
    if(doDecompose) {
        decompose(src, limit, &buffer, errorCode);
        return;
    }
    // Just merge the strings at the boundary.
    ForwardUTrie2StringIterator iter(normTrie, src, limit);
    uint8_t firstCC, prevCC, cc;
    firstCC=prevCC=cc=getCC(iter.next16());
    // Find the first code point with cc==0; prevCC is the cc of the one before it.
    while(cc!=0) {
        prevCC=cc;
        cc=getCC(iter.next16());
    };
    if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
        limit=u_strchr(iter.codePointStart, 0);
    }

    if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
        buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
    }
}

// Note: hasDecompBoundary() could be implemented as aliases to
// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
// at the cost of building the FCD trie for a decomposition normalizer.
// Returns TRUE if there is a decomposition boundary before c (before!=FALSE)
// or after c (before==FALSE).
// Loops only to follow 1:1 algorithmic mappings.
UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
    for(;;) {
        if(c<minDecompNoCP) {
            return TRUE;
        }
        uint16_t norm16=getNorm16(c);
        if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
            return TRUE;
        } else if(norm16>MIN_NORMAL_MAYBE_YES) {
            return FALSE;  // ccc!=0
        } else if(isDecompNoAlgorithmic(norm16)) {
            c=mapAlgorithmic(c, norm16);
        } else {
            // c decomposes, get everything from the variable-length extra data
            const uint16_t *mapping=getMapping(norm16);
            uint16_t firstUnit=*mapping;
            if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                return FALSE;
            }
            if(!before) {
                // decomp after-boundary: same as hasFCDBoundaryAfter(),
                // fcd16<=1 || trailCC==0
                if(firstUnit>0x1ff) {
                    return FALSE;  // trailCC>1
                }
                if(firstUnit<=0xff) {
                    return TRUE;  // trailCC==0
                }
                // if(trailCC==1) test leadCC==0, same as checking for before-boundary
            }
            // TRUE if leadCC==0 (hasFCDBoundaryBefore())
            return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
        }
    }
}

/*
 * Finds the recomposition result for
 * a forward-combining "lead" character,
 * specified with a pointer to its compositions list,
 * and a backward-combining "trail" character.
 *
 * If the lead and trail characters combine, then this function returns
 * the following "compositeAndFwd" value:
 * Bits 21..1  composite character
 * Bit      0  set if the composite is a forward-combining starter
 * otherwise it returns -1.
 *
 * The compositions list has (trail, compositeAndFwd) pair entries,
 * encoded as either pairs or triples of 16-bit units.
 * The last entry has the high bit of its first unit set.
 *
 * The list is sorted by ascending trail characters (there are no duplicates).
 * A linear search is used.
 *
 * See normalizer2impl.h for a more detailed description
 * of the compositions list format.
 */
int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
    uint16_t key1, firstUnit;
    if(trail<COMP_1_TRAIL_LIMIT) {
        // trail character is 0..33FF
        // result entry may have 2 or 3 units
        key1=(uint16_t)(trail<<1);
        // Skip entries with smaller keys; an entry is 2 units long, or 3 if COMP_1_TRIPLE is set.
        while(key1>(firstUnit=*list)) {
            list+=2+(firstUnit&COMP_1_TRIPLE);
        }
        if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
            if(firstUnit&COMP_1_TRIPLE) {
                return ((int32_t)list[1]<<16)|list[2];
            } else {
                return list[1];
            }
        }
    } else {
        // trail character is 3400..10FFFF
        // result entry has 3 units
        key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
                        (((trail>>COMP_1_TRAIL_SHIFT))&
                          ~COMP_1_TRIPLE));
        uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
        uint16_t secondUnit;
        for(;;) {
            if(key1>(firstUnit=*list)) {
                list+=2+(firstUnit&COMP_1_TRIPLE);
            } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                // The first unit matches; compare the trail bits in the second unit.
                if(key2>(secondUnit=list[1])) {
                    if(firstUnit&COMP_1_LAST_TUPLE) {
                        break;
                    } else {
                        list+=3;
                    }
                } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
                    return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
                } else {
                    break;
                }
            } else {
                break;
            }
        }
    }
    return -1;
}

/**
  * @param list some character's compositions list
  * @param set recursively receives the composites from these compositions
  */
void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
    uint16_t firstUnit;
    int32_t compositeAndFwd;
    do {
        firstUnit=*list;
        if((firstUnit&COMP_1_TRIPLE)==0) {
            // pair entry
            compositeAndFwd=list[1];
            list+=2;
        } else {
            // triple entry
            compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
            list+=3;
        }
        UChar32 composite=compositeAndFwd>>1;
        if((compositeAndFwd&1)!=0) {
            // The composite itself combines forward: add its composites too.
            addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
        }
        set.add(composite);
    } while((firstUnit&COMP_1_LAST_TUPLE)==0);
}

/*
 * Recomposes the buffer text starting at recomposeStartIndex
 * (which is in NFD - decomposed and canonically ordered),
 * and truncates the buffer contents.
 *
 * Note that recomposition never lengthens the text:
 * Any character consists of either one or two code units;
 * a composition may contain at most one more code unit than the original starter,
 * while the combining mark that is removed has at least one code unit.
 */
void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                                UBool onlyContiguous) const {
    UChar *p=buffer.getStart()+recomposeStartIndex;
    UChar *limit=buffer.getLimit();
    if(p==limit) {
        return;
    }

    UChar *starter, *pRemove, *q, *r;
    const uint16_t *compositionsList;
    UChar32 c, compositeAndFwd;
    uint16_t norm16;
    uint8_t cc, prevCC;
    UBool starterIsSupplementary;

    // Some of the following variables are not used until we have a forward-combining starter
    // and are only initialized now to avoid compiler warnings.
    compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
    starter=NULL;
    starterIsSupplementary=FALSE;
    prevCC=0;

    for(;;) {
        UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
        cc=getCCFromYesOrMaybe(norm16);
        if( // this character combines backward and
            isMaybe(norm16) &&
            // we have seen a starter that combines forward and
            compositionsList!=NULL &&
            // the backward-combining character is not blocked
            (prevCC<cc || prevCC==0)
        ) {
            if(isJamoVT(norm16)) {
                // c is a Jamo V/T, see if we can compose it with the previous character.
                if(c<Hangul::JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
                    if(prev<Hangul::JAMO_L_COUNT) {
                        pRemove=p-1;
                        UChar syllable=(UChar)
                            (Hangul::HANGUL_BASE+
                             (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                             Hangul::JAMO_T_COUNT);
                        UChar t;
                        if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                            ++p;
                            syllable+=t;  // The next character was a Jamo T.
                        }
                        *starter=syllable;
                        // remove the Jamo V/T
                        q=pRemove;
                        r=p;
                        while(r<limit) {
                            *q++=*r++;
                        }
                        limit=q;
                        p=pRemove;
                    }
                }
                /*
                 * No "else" for Jamo T:
                 * Since the input is in NFD, there are no Hangul LV syllables that
                 * a Jamo T could combine with.
                 * All Jamo Ts are combined above when handling Jamo Vs.
                 */
                if(p==limit) {
                    break;
                }
                compositionsList=NULL;
                continue;
            } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
                // The starter and the combining mark (c) do combine.
                UChar32 composite=compositeAndFwd>>1;

                // Replace the starter with the composite, remove the combining mark.
                pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
                if(starterIsSupplementary) {
                    if(U_IS_SUPPLEMENTARY(composite)) {
                        // both are supplementary
                        starter[0]=U16_LEAD(composite);
                        starter[1]=U16_TRAIL(composite);
                    } else {
                        *starter=(UChar)composite;
                        // The composite is shorter than the starter,
                        // move the intermediate characters forward one.
                        starterIsSupplementary=FALSE;
                        q=starter+1;
                        r=q+1;
                        while(r<pRemove) {
                            *q++=*r++;
                        }
                        --pRemove;
                    }
                } else if(U_IS_SUPPLEMENTARY(composite)) {
                    // The composite is longer than the starter,
                    // move the intermediate characters back one.
                    starterIsSupplementary=TRUE;
                    ++starter;  // temporarily increment for the loop boundary
                    q=pRemove;
                    r=++pRemove;
                    while(starter<q) {
                        *--r=*--q;
                    }
                    *starter=U16_TRAIL(composite);
                    *--starter=U16_LEAD(composite);  // undo the temporary increment
                } else {
                    // both are on the BMP
                    *starter=(UChar)composite;
                }

                /* remove the combining mark by moving the following text over it */
                if(pRemove<p) {
                    q=pRemove;
                    r=p;
                    while(r<limit) {
                        *q++=*r++;
                    }
                    limit=q;
                    p=pRemove;
                }
                // Keep prevCC because we removed the combining mark.

                if(p==limit) {
                    break;
                }
                // Is the composite a starter that combines forward?
                if(compositeAndFwd&1) {
                    compositionsList=
                        getCompositionsListForComposite(getNorm16(composite));
                } else {
                    compositionsList=NULL;
                }

                // We combined; continue with looking for compositions.
                continue;
            }
        }

        // no combination this time
        prevCC=cc;
        if(p==limit) {
            break;
        }

        // If c did not combine, then check if it is a starter.
        if(cc==0) {
            // Found a new starter.
            if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
                // It may combine with something, prepare for it.
                if(U_IS_BMP(c)) {
                    starterIsSupplementary=FALSE;
                    starter=p-1;
                } else {
                    starterIsSupplementary=TRUE;
                    starter=p-2;
                }
            }
        } else if(onlyContiguous) {
            // FCC: no discontiguous compositions; any intervening character blocks.
            compositionsList=NULL;
        }
    }
    buffer.setReorderingLimit(limit);
}

// Returns the composite of the two code points a and b,
// or U_SENTINEL if they do not combine.
// Jamo L + Jamo V and Hangul LV + Jamo T are composed arithmetically;
// all other pairs are looked up in the compositions list of a.
UChar32
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
    const uint16_t *list;
    if(isInert(norm16)) {
        return U_SENTINEL;
    } else if(norm16<minYesNoMappingsOnly) {
        if(isJamoL(norm16)) {
            b-=Hangul::JAMO_V_BASE;
            if(0<=b && b<Hangul::JAMO_V_COUNT) {
                return
                    (Hangul::HANGUL_BASE+
                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
                     Hangul::JAMO_T_COUNT);
            } else {
                return U_SENTINEL;
            }
        } else if(isHangul(norm16)) {
            b-=Hangul::JAMO_T_BASE;
            if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
                return a+b;
            } else {
                return U_SENTINEL;
            }
        } else {
            // 'a' has a compositions list in extraData
            list=extraData+norm16;
            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
                list+=  // mapping pointer
                    1+  // +1 to skip the first unit with the mapping length
                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
            }
        }
    } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
        return U_SENTINEL;
    } else {
        list=maybeYesCompositions+norm16-minMaybeYes;
    }
    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
        return U_SENTINEL;
    }
    // combine() returns -1 if there is no composite; the arithmetic shift
    // keeps that as a negative result, while the fallback tests for it explicitly.
#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
    return combine(list, b)>>1;
#else
    int32_t compositeAndFwd=combine(list, b);
    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
#endif
}

// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
UBool
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     * Keeping track of prevBoundary saves us looking for a composition boundary
     * when we find a "no" or "maybe".
     *
     * When we back out from prevSrc back to prevBoundary,
     * then we also remove those same characters (which had been simply copied
     * or canonically-order-inserted) from the ReorderingBuffer.
1.1042 + * Therefore, at all times, the [prevBoundary..prevSrc[ source units 1.1043 + * must correspond 1:1 to destination units at the end of the destination buffer. 1.1044 + */ 1.1045 + const UChar *prevBoundary=src; 1.1046 + UChar32 minNoMaybeCP=minCompNoMaybeCP; 1.1047 + if(limit==NULL) { 1.1048 + src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, 1.1049 + doCompose ? &buffer : NULL, 1.1050 + errorCode); 1.1051 + if(U_FAILURE(errorCode)) { 1.1052 + return FALSE; 1.1053 + } 1.1054 + if(prevBoundary<src) { 1.1055 + // Set prevBoundary to the last character in the prefix. 1.1056 + prevBoundary=src-1; 1.1057 + } 1.1058 + limit=u_strchr(src, 0); 1.1059 + } 1.1060 + 1.1061 + const UChar *prevSrc; 1.1062 + UChar32 c=0; 1.1063 + uint16_t norm16=0; 1.1064 + 1.1065 + // only for isNormalized 1.1066 + uint8_t prevCC=0; 1.1067 + 1.1068 + for(;;) { 1.1069 + // count code units below the minimum or with irrelevant data for the quick check 1.1070 + for(prevSrc=src; src!=limit;) { 1.1071 + if( (c=*src)<minNoMaybeCP || 1.1072 + isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 1.1073 + ) { 1.1074 + ++src; 1.1075 + } else if(!U16_IS_SURROGATE(c)) { 1.1076 + break; 1.1077 + } else { 1.1078 + UChar c2; 1.1079 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1080 + if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1.1081 + c=U16_GET_SUPPLEMENTARY(c, c2); 1.1082 + } 1.1083 + } else /* trail surrogate */ { 1.1084 + if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1.1085 + --src; 1.1086 + c=U16_GET_SUPPLEMENTARY(c2, c); 1.1087 + } 1.1088 + } 1.1089 + if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1.1090 + src+=U16_LENGTH(c); 1.1091 + } else { 1.1092 + break; 1.1093 + } 1.1094 + } 1.1095 + } 1.1096 + // copy these code units all at once 1.1097 + if(src!=prevSrc) { 1.1098 + if(doCompose) { 1.1099 + if(!buffer.appendZeroCC(prevSrc, src, errorCode)) { 1.1100 + break; 1.1101 + } 1.1102 + } else { 1.1103 + prevCC=0; 1.1104 + } 1.1105 + if(src==limit) { 1.1106 + break; 1.1107 + } 
1.1108 + // Set prevBoundary to the last character in the quick check loop. 1.1109 + prevBoundary=src-1; 1.1110 + if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 1.1111 + U16_IS_LEAD(*(prevBoundary-1)) 1.1112 + ) { 1.1113 + --prevBoundary; 1.1114 + } 1.1115 + // The start of the current character (c). 1.1116 + prevSrc=src; 1.1117 + } else if(src==limit) { 1.1118 + break; 1.1119 + } 1.1120 + 1.1121 + src+=U16_LENGTH(c); 1.1122 + /* 1.1123 + * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1.1124 + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1.1125 + * or has ccc!=0. 1.1126 + * Check for Jamo V/T, then for regular characters. 1.1127 + * c is not a Hangul syllable or Jamo L because those have "yes" properties. 1.1128 + */ 1.1129 + if(isJamoVT(norm16) && prevBoundary!=prevSrc) { 1.1130 + UChar prev=*(prevSrc-1); 1.1131 + UBool needToDecompose=FALSE; 1.1132 + if(c<Hangul::JAMO_T_BASE) { 1.1133 + // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. 1.1134 + prev=(UChar)(prev-Hangul::JAMO_L_BASE); 1.1135 + if(prev<Hangul::JAMO_L_COUNT) { 1.1136 + if(!doCompose) { 1.1137 + return FALSE; 1.1138 + } 1.1139 + UChar syllable=(UChar) 1.1140 + (Hangul::HANGUL_BASE+ 1.1141 + (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* 1.1142 + Hangul::JAMO_T_COUNT); 1.1143 + UChar t; 1.1144 + if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { 1.1145 + ++src; 1.1146 + syllable+=t; // The next character was a Jamo T. 1.1147 + prevBoundary=src; 1.1148 + buffer.setLastChar(syllable); 1.1149 + continue; 1.1150 + } 1.1151 + // If we see L+V+x where x!=T then we drop to the slow path, 1.1152 + // decompose and recompose. 1.1153 + // This is to deal with NFKC finding normal L and V but a 1.1154 + // compatibility variant of a T. 
We need to either fully compose that 1.1155 + // combination here (which would complicate the code and may not work 1.1156 + // with strange custom data) or use the slow path -- or else our replacing 1.1157 + // two input characters (L+V) with one output character (LV syllable) 1.1158 + // would violate the invariant that [prevBoundary..prevSrc[ has the same 1.1159 + // length as what we appended to the buffer since prevBoundary. 1.1160 + needToDecompose=TRUE; 1.1161 + } 1.1162 + } else if(Hangul::isHangulWithoutJamoT(prev)) { 1.1163 + // c is a Jamo Trailing consonant, 1.1164 + // compose with previous Hangul LV that does not contain a Jamo T. 1.1165 + if(!doCompose) { 1.1166 + return FALSE; 1.1167 + } 1.1168 + buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE)); 1.1169 + prevBoundary=src; 1.1170 + continue; 1.1171 + } 1.1172 + if(!needToDecompose) { 1.1173 + // The Jamo V/T did not compose into a Hangul syllable. 1.1174 + if(doCompose) { 1.1175 + if(!buffer.appendBMP((UChar)c, 0, errorCode)) { 1.1176 + break; 1.1177 + } 1.1178 + } else { 1.1179 + prevCC=0; 1.1180 + } 1.1181 + continue; 1.1182 + } 1.1183 + } 1.1184 + /* 1.1185 + * Source buffer pointers: 1.1186 + * 1.1187 + * all done quick check current char not yet 1.1188 + * "yes" but (c) processed 1.1189 + * may combine 1.1190 + * forward 1.1191 + * [-------------[-------------[-------------[-------------[ 1.1192 + * | | | | | 1.1193 + * orig. src prevBoundary prevSrc src limit 1.1194 + * 1.1195 + * 1.1196 + * Destination buffer pointers inside the ReorderingBuffer: 1.1197 + * 1.1198 + * all done might take not filled yet 1.1199 + * characters for 1.1200 + * reordering 1.1201 + * [-------------[-------------[-------------[ 1.1202 + * | | | | 1.1203 + * start reorderStart limit | 1.1204 + * +remainingCap.+ 1.1205 + */ 1.1206 + if(norm16>=MIN_YES_YES_WITH_CC) { 1.1207 + uint8_t cc=(uint8_t)norm16; // cc!=0 1.1208 + if( onlyContiguous && // FCC 1.1209 + (doCompose ? 
buffer.getLastCC() : prevCC)==0 && 1.1210 + prevBoundary<prevSrc && 1.1211 + // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that 1.1212 + // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1.1213 + // passed the quick check "yes && ccc==0" test. 1.1214 + // Check whether the last character was a "yesYes" or a "yesNo". 1.1215 + // If a "yesNo", then we get its trailing ccc from its 1.1216 + // mapping and check for canonical order. 1.1217 + // All other cases are ok. 1.1218 + getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1.1219 + ) { 1.1220 + // Fails FCD test, need to decompose and contiguously recompose. 1.1221 + if(!doCompose) { 1.1222 + return FALSE; 1.1223 + } 1.1224 + } else if(doCompose) { 1.1225 + if(!buffer.append(c, cc, errorCode)) { 1.1226 + break; 1.1227 + } 1.1228 + continue; 1.1229 + } else if(prevCC<=cc) { 1.1230 + prevCC=cc; 1.1231 + continue; 1.1232 + } else { 1.1233 + return FALSE; 1.1234 + } 1.1235 + } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { 1.1236 + return FALSE; 1.1237 + } 1.1238 + 1.1239 + /* 1.1240 + * Find appropriate boundaries around this character, 1.1241 + * decompose the source text from between the boundaries, 1.1242 + * and recompose it. 1.1243 + * 1.1244 + * We may need to remove the last few characters from the ReorderingBuffer 1.1245 + * to account for source text that was copied or appended 1.1246 + * but needs to take part in the recomposition. 1.1247 + */ 1.1248 + 1.1249 + /* 1.1250 + * Find the last composition boundary in [prevBoundary..src[. 1.1251 + * It is either the decomposition of the current character (at prevSrc), 1.1252 + * or prevBoundary. 
1.1253 + */ 1.1254 + if(hasCompBoundaryBefore(c, norm16)) { 1.1255 + prevBoundary=prevSrc; 1.1256 + } else if(doCompose) { 1.1257 + buffer.removeSuffix((int32_t)(prevSrc-prevBoundary)); 1.1258 + } 1.1259 + 1.1260 + // Find the next composition boundary in [src..limit[ - 1.1261 + // modifies src to point to the next starter. 1.1262 + src=(UChar *)findNextCompBoundary(src, limit); 1.1263 + 1.1264 + // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. 1.1265 + int32_t recomposeStartIndex=buffer.length(); 1.1266 + if(!decomposeShort(prevBoundary, src, buffer, errorCode)) { 1.1267 + break; 1.1268 + } 1.1269 + recompose(buffer, recomposeStartIndex, onlyContiguous); 1.1270 + if(!doCompose) { 1.1271 + if(!buffer.equals(prevBoundary, src)) { 1.1272 + return FALSE; 1.1273 + } 1.1274 + buffer.remove(); 1.1275 + prevCC=0; 1.1276 + } 1.1277 + 1.1278 + // Move to the next starter. We never need to look back before this point again. 1.1279 + prevBoundary=src; 1.1280 + } 1.1281 + return TRUE; 1.1282 +} 1.1283 + 1.1284 +// Very similar to compose(): Make the same changes in both places if relevant. 1.1285 +// pQCResult==NULL: spanQuickCheckYes 1.1286 +// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) 1.1287 +const UChar * 1.1288 +Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, 1.1289 + UBool onlyContiguous, 1.1290 + UNormalizationCheckResult *pQCResult) const { 1.1291 + /* 1.1292 + * prevBoundary points to the last character before the current one 1.1293 + * that has a composition boundary before it with ccc==0 and quick check "yes". 1.1294 + */ 1.1295 + const UChar *prevBoundary=src; 1.1296 + UChar32 minNoMaybeCP=minCompNoMaybeCP; 1.1297 + if(limit==NULL) { 1.1298 + UErrorCode errorCode=U_ZERO_ERROR; 1.1299 + src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); 1.1300 + if(prevBoundary<src) { 1.1301 + // Set prevBoundary to the last character in the prefix. 
1.1302 + prevBoundary=src-1; 1.1303 + } 1.1304 + limit=u_strchr(src, 0); 1.1305 + } 1.1306 + 1.1307 + const UChar *prevSrc; 1.1308 + UChar32 c=0; 1.1309 + uint16_t norm16=0; 1.1310 + uint8_t prevCC=0; 1.1311 + 1.1312 + for(;;) { 1.1313 + // count code units below the minimum or with irrelevant data for the quick check 1.1314 + for(prevSrc=src;;) { 1.1315 + if(src==limit) { 1.1316 + return src; 1.1317 + } 1.1318 + if( (c=*src)<minNoMaybeCP || 1.1319 + isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) 1.1320 + ) { 1.1321 + ++src; 1.1322 + } else if(!U16_IS_SURROGATE(c)) { 1.1323 + break; 1.1324 + } else { 1.1325 + UChar c2; 1.1326 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1327 + if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1.1328 + c=U16_GET_SUPPLEMENTARY(c, c2); 1.1329 + } 1.1330 + } else /* trail surrogate */ { 1.1331 + if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1.1332 + --src; 1.1333 + c=U16_GET_SUPPLEMENTARY(c2, c); 1.1334 + } 1.1335 + } 1.1336 + if(isCompYesAndZeroCC(norm16=getNorm16(c))) { 1.1337 + src+=U16_LENGTH(c); 1.1338 + } else { 1.1339 + break; 1.1340 + } 1.1341 + } 1.1342 + } 1.1343 + if(src!=prevSrc) { 1.1344 + // Set prevBoundary to the last character in the quick check loop. 1.1345 + prevBoundary=src-1; 1.1346 + if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && 1.1347 + U16_IS_LEAD(*(prevBoundary-1)) 1.1348 + ) { 1.1349 + --prevBoundary; 1.1350 + } 1.1351 + prevCC=0; 1.1352 + // The start of the current character (c). 1.1353 + prevSrc=src; 1.1354 + } 1.1355 + 1.1356 + src+=U16_LENGTH(c); 1.1357 + /* 1.1358 + * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1.1359 + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) 1.1360 + * or has ccc!=0. 
1.1361 + */ 1.1362 + if(isMaybeOrNonZeroCC(norm16)) { 1.1363 + uint8_t cc=getCCFromYesOrMaybe(norm16); 1.1364 + if( onlyContiguous && // FCC 1.1365 + cc!=0 && 1.1366 + prevCC==0 && 1.1367 + prevBoundary<prevSrc && 1.1368 + // prevCC==0 && prevBoundary<prevSrc tell us that 1.1369 + // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) 1.1370 + // passed the quick check "yes && ccc==0" test. 1.1371 + // Check whether the last character was a "yesYes" or a "yesNo". 1.1372 + // If a "yesNo", then we get its trailing ccc from its 1.1373 + // mapping and check for canonical order. 1.1374 + // All other cases are ok. 1.1375 + getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc 1.1376 + ) { 1.1377 + // Fails FCD test. 1.1378 + } else if(prevCC<=cc || cc==0) { 1.1379 + prevCC=cc; 1.1380 + if(norm16<MIN_YES_YES_WITH_CC) { 1.1381 + if(pQCResult!=NULL) { 1.1382 + *pQCResult=UNORM_MAYBE; 1.1383 + } else { 1.1384 + return prevBoundary; 1.1385 + } 1.1386 + } 1.1387 + continue; 1.1388 + } 1.1389 + } 1.1390 + if(pQCResult!=NULL) { 1.1391 + *pQCResult=UNORM_NO; 1.1392 + } 1.1393 + return prevBoundary; 1.1394 + } 1.1395 +} 1.1396 + 1.1397 +void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, 1.1398 + UBool doCompose, 1.1399 + UBool onlyContiguous, 1.1400 + UnicodeString &safeMiddle, 1.1401 + ReorderingBuffer &buffer, 1.1402 + UErrorCode &errorCode) const { 1.1403 + if(!buffer.isEmpty()) { 1.1404 + const UChar *firstStarterInSrc=findNextCompBoundary(src, limit); 1.1405 + if(src!=firstStarterInSrc) { 1.1406 + const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), 1.1407 + buffer.getLimit()); 1.1408 + int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); 1.1409 + UnicodeString middle(lastStarterInDest, destSuffixLength); 1.1410 + buffer.removeSuffix(destSuffixLength); 1.1411 + safeMiddle=middle; 1.1412 + middle.append(src, (int32_t)(firstStarterInSrc-src)); 1.1413 + const UChar 
*middleStart=middle.getBuffer(); 1.1414 + compose(middleStart, middleStart+middle.length(), onlyContiguous, 1.1415 + TRUE, buffer, errorCode); 1.1416 + if(U_FAILURE(errorCode)) { 1.1417 + return; 1.1418 + } 1.1419 + src=firstStarterInSrc; 1.1420 + } 1.1421 + } 1.1422 + if(doCompose) { 1.1423 + compose(src, limit, onlyContiguous, TRUE, buffer, errorCode); 1.1424 + } else { 1.1425 + if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1.1426 + limit=u_strchr(src, 0); 1.1427 + } 1.1428 + buffer.appendZeroCC(src, limit, errorCode); 1.1429 + } 1.1430 +} 1.1431 + 1.1432 +/** 1.1433 + * Does c have a composition boundary before it? 1.1434 + * True if its decomposition begins with a character that has 1.1435 + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 1.1436 + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 1.1437 + * (isCompYesAndZeroCC()) so we need not decompose. 1.1438 + */ 1.1439 +UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { 1.1440 + for(;;) { 1.1441 + if(isCompYesAndZeroCC(norm16)) { 1.1442 + return TRUE; 1.1443 + } else if(isMaybeOrNonZeroCC(norm16)) { 1.1444 + return FALSE; 1.1445 + } else if(isDecompNoAlgorithmic(norm16)) { 1.1446 + c=mapAlgorithmic(c, norm16); 1.1447 + norm16=getNorm16(c); 1.1448 + } else { 1.1449 + // c decomposes, get everything from the variable-length extra data 1.1450 + const uint16_t *mapping=getMapping(norm16); 1.1451 + uint16_t firstUnit=*mapping; 1.1452 + if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1.1453 + return FALSE; 1.1454 + } 1.1455 + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) { 1.1456 + return FALSE; // non-zero leadCC 1.1457 + } 1.1458 + int32_t i=1; // skip over the firstUnit 1.1459 + UChar32 c; 1.1460 + U16_NEXT_UNSAFE(mapping, i, c); 1.1461 + return isCompYesAndZeroCC(getNorm16(c)); 1.1462 + } 1.1463 + } 1.1464 +} 1.1465 + 1.1466 +UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { 1.1467 + for(;;) 
{ 1.1468 + uint16_t norm16=getNorm16(c); 1.1469 + if(isInert(norm16)) { 1.1470 + return TRUE; 1.1471 + } else if(norm16<=minYesNo) { 1.1472 + // Hangul: norm16==minYesNo 1.1473 + // Hangul LVT has a boundary after it. 1.1474 + // Hangul LV and non-inert yesYes characters combine forward. 1.1475 + return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); 1.1476 + } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { 1.1477 + return FALSE; 1.1478 + } else if(isDecompNoAlgorithmic(norm16)) { 1.1479 + c=mapAlgorithmic(c, norm16); 1.1480 + } else { 1.1481 + // c decomposes, get everything from the variable-length extra data. 1.1482 + // If testInert, then c must be a yesNo character which has lccc=0, 1.1483 + // otherwise it could be a noNo. 1.1484 + const uint16_t *mapping=getMapping(norm16); 1.1485 + uint16_t firstUnit=*mapping; 1.1486 + // TRUE if 1.1487 + // not MAPPING_NO_COMP_BOUNDARY_AFTER 1.1488 + // (which is set if 1.1489 + // c is not deleted, and 1.1490 + // it and its decomposition do not combine forward, and it has a starter) 1.1491 + // and if FCC then trailCC<=1 1.1492 + return 1.1493 + (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && 1.1494 + (!onlyContiguous || firstUnit<=0x1ff); 1.1495 + } 1.1496 + } 1.1497 +} 1.1498 + 1.1499 +const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { 1.1500 + BackwardUTrie2StringIterator iter(normTrie, start, p); 1.1501 + uint16_t norm16; 1.1502 + do { 1.1503 + norm16=iter.previous16(); 1.1504 + } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1.1505 + // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, 1.1506 + // but that's probably not worth the extra cost. 
1.1507 + return iter.codePointStart; 1.1508 +} 1.1509 + 1.1510 +const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { 1.1511 + ForwardUTrie2StringIterator iter(normTrie, p, limit); 1.1512 + uint16_t norm16; 1.1513 + do { 1.1514 + norm16=iter.next16(); 1.1515 + } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); 1.1516 + return iter.codePointStart; 1.1517 +} 1.1518 + 1.1519 +// Note: normalizer2impl.cpp r30982 (2011-nov-27) 1.1520 +// still had getFCDTrie() which built and cached an FCD trie. 1.1521 +// That provided faster access to FCD data than getFCD16FromNormData() 1.1522 +// but required synchronization and consumed some 10kB of heap memory 1.1523 +// in any process that uses FCD (e.g., via collation). 1.1524 +// tccc180[] and smallFCD[] are intended to help with any loss of performance, 1.1525 +// at least for Latin & CJK. 1.1526 + 1.1527 +// Gets the FCD value from the regular normalization data. 1.1528 +uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { 1.1529 + // Only loops for 1:1 algorithmic mappings. 1.1530 + for(;;) { 1.1531 + uint16_t norm16=getNorm16(c); 1.1532 + if(norm16<=minYesNo) { 1.1533 + // no decomposition or Hangul syllable, all zeros 1.1534 + return 0; 1.1535 + } else if(norm16>=MIN_NORMAL_MAYBE_YES) { 1.1536 + // combining mark 1.1537 + norm16&=0xff; 1.1538 + return norm16|(norm16<<8); 1.1539 + } else if(norm16>=minMaybeYes) { 1.1540 + return 0; 1.1541 + } else if(isDecompNoAlgorithmic(norm16)) { 1.1542 + c=mapAlgorithmic(c, norm16); 1.1543 + } else { 1.1544 + // c decomposes, get everything from the variable-length extra data 1.1545 + const uint16_t *mapping=getMapping(norm16); 1.1546 + uint16_t firstUnit=*mapping; 1.1547 + if((firstUnit&MAPPING_LENGTH_MASK)==0) { 1.1548 + // A character that is deleted (maps to an empty string) must 1.1549 + // get the worst-case lccc and tccc values because arbitrary 1.1550 + // characters on both sides will become adjacent. 
1.1551 + return 0x1ff; 1.1552 + } else { 1.1553 + norm16=firstUnit>>8; // tccc 1.1554 + if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { 1.1555 + norm16|=*(mapping-1)&0xff00; // lccc 1.1556 + } 1.1557 + return norm16; 1.1558 + } 1.1559 + } 1.1560 + } 1.1561 +} 1.1562 + 1.1563 +// Dual functionality: 1.1564 +// buffer!=NULL: normalize 1.1565 +// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes 1.1566 +const UChar * 1.1567 +Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, 1.1568 + ReorderingBuffer *buffer, 1.1569 + UErrorCode &errorCode) const { 1.1570 + // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1.1571 + // Similar to the prevBoundary in the compose() implementation. 1.1572 + const UChar *prevBoundary=src; 1.1573 + int32_t prevFCD16=0; 1.1574 + if(limit==NULL) { 1.1575 + src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); 1.1576 + if(U_FAILURE(errorCode)) { 1.1577 + return src; 1.1578 + } 1.1579 + if(prevBoundary<src) { 1.1580 + prevBoundary=src; 1.1581 + // We know that the previous character's lccc==0. 1.1582 + // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1.1583 + prevFCD16=getFCD16(*(src-1)); 1.1584 + if(prevFCD16>1) { 1.1585 + --prevBoundary; 1.1586 + } 1.1587 + } 1.1588 + limit=u_strchr(src, 0); 1.1589 + } 1.1590 + 1.1591 + // Note: In this function we use buffer->appendZeroCC() because we track 1.1592 + // the lead and trail combining classes here, rather than leaving it to 1.1593 + // the ReorderingBuffer. 1.1594 + // The exception is the call to decomposeShort() which uses the buffer 1.1595 + // in the normal way. 
1.1596 + 1.1597 + const UChar *prevSrc; 1.1598 + UChar32 c=0; 1.1599 + uint16_t fcd16=0; 1.1600 + 1.1601 + for(;;) { 1.1602 + // count code units with lccc==0 1.1603 + for(prevSrc=src; src!=limit;) { 1.1604 + if((c=*src)<MIN_CCC_LCCC_CP) { 1.1605 + prevFCD16=~c; 1.1606 + ++src; 1.1607 + } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1.1608 + prevFCD16=0; 1.1609 + ++src; 1.1610 + } else { 1.1611 + if(U16_IS_SURROGATE(c)) { 1.1612 + UChar c2; 1.1613 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1614 + if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { 1.1615 + c=U16_GET_SUPPLEMENTARY(c, c2); 1.1616 + } 1.1617 + } else /* trail surrogate */ { 1.1618 + if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { 1.1619 + --src; 1.1620 + c=U16_GET_SUPPLEMENTARY(c2, c); 1.1621 + } 1.1622 + } 1.1623 + } 1.1624 + if((fcd16=getFCD16FromNormData(c))<=0xff) { 1.1625 + prevFCD16=fcd16; 1.1626 + src+=U16_LENGTH(c); 1.1627 + } else { 1.1628 + break; 1.1629 + } 1.1630 + } 1.1631 + } 1.1632 + // copy these code units all at once 1.1633 + if(src!=prevSrc) { 1.1634 + if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) { 1.1635 + break; 1.1636 + } 1.1637 + if(src==limit) { 1.1638 + break; 1.1639 + } 1.1640 + prevBoundary=src; 1.1641 + // We know that the previous character's lccc==0. 1.1642 + if(prevFCD16<0) { 1.1643 + // Fetching the fcd16 value was deferred for this below-U+0300 code point. 1.1644 + UChar32 prev=~prevFCD16; 1.1645 + prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); 1.1646 + if(prevFCD16>1) { 1.1647 + --prevBoundary; 1.1648 + } 1.1649 + } else { 1.1650 + const UChar *p=src-1; 1.1651 + if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { 1.1652 + --p; 1.1653 + // Need to fetch the previous character's FCD value because 1.1654 + // prevFCD16 was just for the trail surrogate code point. 1.1655 + prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])); 1.1656 + // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 
1.1657 + } 1.1658 + if(prevFCD16>1) { 1.1659 + prevBoundary=p; 1.1660 + } 1.1661 + } 1.1662 + // The start of the current character (c). 1.1663 + prevSrc=src; 1.1664 + } else if(src==limit) { 1.1665 + break; 1.1666 + } 1.1667 + 1.1668 + src+=U16_LENGTH(c); 1.1669 + // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1.1670 + // Check for proper order, and decompose locally if necessary. 1.1671 + if((prevFCD16&0xff)<=(fcd16>>8)) { 1.1672 + // proper order: prev tccc <= current lccc 1.1673 + if((fcd16&0xff)<=1) { 1.1674 + prevBoundary=src; 1.1675 + } 1.1676 + if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { 1.1677 + break; 1.1678 + } 1.1679 + prevFCD16=fcd16; 1.1680 + continue; 1.1681 + } else if(buffer==NULL) { 1.1682 + return prevBoundary; // quick check "no" 1.1683 + } else { 1.1684 + /* 1.1685 + * Back out the part of the source that we copied or appended 1.1686 + * already but is now going to be decomposed. 1.1687 + * prevSrc is set to after what was copied/appended. 1.1688 + */ 1.1689 + buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); 1.1690 + /* 1.1691 + * Find the part of the source that needs to be decomposed, 1.1692 + * up to the next safe boundary. 1.1693 + */ 1.1694 + src=findNextFCDBoundary(src, limit); 1.1695 + /* 1.1696 + * The source text does not fulfill the conditions for FCD. 1.1697 + * Decompose and reorder a limited piece of the text. 
1.1698 + */ 1.1699 + if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { 1.1700 + break; 1.1701 + } 1.1702 + prevBoundary=src; 1.1703 + prevFCD16=0; 1.1704 + } 1.1705 + } 1.1706 + return src; 1.1707 +} 1.1708 + 1.1709 +void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, 1.1710 + UBool doMakeFCD, 1.1711 + UnicodeString &safeMiddle, 1.1712 + ReorderingBuffer &buffer, 1.1713 + UErrorCode &errorCode) const { 1.1714 + if(!buffer.isEmpty()) { 1.1715 + const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); 1.1716 + if(src!=firstBoundaryInSrc) { 1.1717 + const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), 1.1718 + buffer.getLimit()); 1.1719 + int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); 1.1720 + UnicodeString middle(lastBoundaryInDest, destSuffixLength); 1.1721 + buffer.removeSuffix(destSuffixLength); 1.1722 + safeMiddle=middle; 1.1723 + middle.append(src, (int32_t)(firstBoundaryInSrc-src)); 1.1724 + const UChar *middleStart=middle.getBuffer(); 1.1725 + makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); 1.1726 + if(U_FAILURE(errorCode)) { 1.1727 + return; 1.1728 + } 1.1729 + src=firstBoundaryInSrc; 1.1730 + } 1.1731 + } 1.1732 + if(doMakeFCD) { 1.1733 + makeFCD(src, limit, &buffer, errorCode); 1.1734 + } else { 1.1735 + if(limit==NULL) { // appendZeroCC() needs limit!=NULL 1.1736 + limit=u_strchr(src, 0); 1.1737 + } 1.1738 + buffer.appendZeroCC(src, limit, errorCode); 1.1739 + } 1.1740 +} 1.1741 + 1.1742 +const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { 1.1743 + while(start<p && previousFCD16(start, p)>0xff) {} 1.1744 + return p; 1.1745 +} 1.1746 + 1.1747 +const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { 1.1748 + while(p<limit) { 1.1749 + const UChar *codePointStart=p; 1.1750 + if(nextFCD16(p, limit)<=0xff) { 1.1751 + return codePointStart; 1.1752 + } 1.1753 + } 
1.1754 + return p; 1.1755 +} 1.1756 + 1.1757 +// CanonicalIterator data -------------------------------------------------- *** 1.1758 + 1.1759 +CanonIterData::CanonIterData(UErrorCode &errorCode) : 1.1760 + trie(utrie2_open(0, 0, &errorCode)), 1.1761 + canonStartSets(uprv_deleteUObject, NULL, errorCode) {} 1.1762 + 1.1763 +CanonIterData::~CanonIterData() { 1.1764 + utrie2_close(trie); 1.1765 +} 1.1766 + 1.1767 +void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { 1.1768 + uint32_t canonValue=utrie2_get32(trie, decompLead); 1.1769 + if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 1.1770 + // origin is the first character whose decomposition starts with 1.1771 + // the character for which we are setting the value. 1.1772 + utrie2_set32(trie, decompLead, canonValue|origin, &errorCode); 1.1773 + } else { 1.1774 + // origin is not the first character, or it is U+0000. 1.1775 + UnicodeSet *set; 1.1776 + if((canonValue&CANON_HAS_SET)==0) { 1.1777 + set=new UnicodeSet; 1.1778 + if(set==NULL) { 1.1779 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.1780 + return; 1.1781 + } 1.1782 + UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); 1.1783 + canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); 1.1784 + utrie2_set32(trie, decompLead, canonValue, &errorCode); 1.1785 + canonStartSets.addElement(set, errorCode); 1.1786 + if(firstOrigin!=0) { 1.1787 + set->add(firstOrigin); 1.1788 + } 1.1789 + } else { 1.1790 + set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; 1.1791 + } 1.1792 + set->add(origin); 1.1793 + } 1.1794 +} 1.1795 + 1.1796 +U_CDECL_BEGIN 1.1797 + 1.1798 +// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. 
1.1799 +// context: the Normalizer2Impl 1.1800 +static UBool U_CALLCONV 1.1801 +enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1.1802 + UErrorCode errorCode = U_ZERO_ERROR; 1.1803 + if (value != 0) { 1.1804 + Normalizer2Impl *impl = (Normalizer2Impl *)context; 1.1805 + impl->makeCanonIterDataFromNorm16( 1.1806 + start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); 1.1807 + } 1.1808 + return U_SUCCESS(errorCode); 1.1809 +} 1.1810 + 1.1811 + 1.1812 + 1.1813 +// UInitOnce instantiation function for CanonIterData 1.1814 + 1.1815 +static void U_CALLCONV 1.1816 +initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { 1.1817 + U_ASSERT(impl->fCanonIterData == NULL); 1.1818 + impl->fCanonIterData = new CanonIterData(errorCode); 1.1819 + if (impl->fCanonIterData == NULL) { 1.1820 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.1821 + } 1.1822 + if (U_SUCCESS(errorCode)) { 1.1823 + utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); 1.1824 + utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); 1.1825 + } 1.1826 + if (U_FAILURE(errorCode)) { 1.1827 + delete impl->fCanonIterData; 1.1828 + impl->fCanonIterData = NULL; 1.1829 + } 1.1830 +} 1.1831 + 1.1832 +U_CDECL_END 1.1833 + 1.1834 +void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1.1835 + CanonIterData &newData, 1.1836 + UErrorCode &errorCode) const { 1.1837 + if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { 1.1838 + // Inert, or 2-way mapping (including Hangul syllable). 1.1839 + // We do not write a canonStartSet for any yesNo character. 1.1840 + // Composites from 2-way mappings are added at runtime from the 1.1841 + // starter's compositions list, and the other characters in 1.1842 + // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 1.1843 + // "maybe" characters. 
1.1844 + return; 1.1845 + } 1.1846 + for(UChar32 c=start; c<=end; ++c) { 1.1847 + uint32_t oldValue=utrie2_get32(newData.trie, c); 1.1848 + uint32_t newValue=oldValue; 1.1849 + if(norm16>=minMaybeYes) { 1.1850 + // not a segment starter if it occurs in a decomposition or has cc!=0 1.1851 + newValue|=CANON_NOT_SEGMENT_STARTER; 1.1852 + if(norm16<MIN_NORMAL_MAYBE_YES) { 1.1853 + newValue|=CANON_HAS_COMPOSITIONS; 1.1854 + } 1.1855 + } else if(norm16<minYesNo) { 1.1856 + newValue|=CANON_HAS_COMPOSITIONS; 1.1857 + } else { 1.1858 + // c has a one-way decomposition 1.1859 + UChar32 c2=c; 1.1860 + uint16_t norm16_2=norm16; 1.1861 + while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { 1.1862 + c2=mapAlgorithmic(c2, norm16_2); 1.1863 + norm16_2=getNorm16(c2); 1.1864 + } 1.1865 + if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { 1.1866 + // c decomposes, get everything from the variable-length extra data 1.1867 + const uint16_t *mapping=getMapping(norm16_2); 1.1868 + uint16_t firstUnit=*mapping; 1.1869 + int32_t length=firstUnit&MAPPING_LENGTH_MASK; 1.1870 + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1.1871 + if(c==c2 && (*(mapping-1)&0xff)!=0) { 1.1872 + newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 1.1873 + } 1.1874 + } 1.1875 + // Skip empty mappings (no characters in the decomposition). 1.1876 + if(length!=0) { 1.1877 + ++mapping; // skip over the firstUnit 1.1878 + // add c to first code point's start set 1.1879 + int32_t i=0; 1.1880 + U16_NEXT_UNSAFE(mapping, i, c2); 1.1881 + newData.addToStartSet(c, c2, errorCode); 1.1882 + // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 1.1883 + // one-way mapping. A 2-way mapping is possible here after 1.1884 + // intermediate algorithmic mapping. 
1.1885 + if(norm16_2>=minNoNo) { 1.1886 + while(i<length) { 1.1887 + U16_NEXT_UNSAFE(mapping, i, c2); 1.1888 + uint32_t c2Value=utrie2_get32(newData.trie, c2); 1.1889 + if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 1.1890 + utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER, 1.1891 + &errorCode); 1.1892 + } 1.1893 + } 1.1894 + } 1.1895 + } 1.1896 + } else { 1.1897 + // c decomposed to c2 algorithmically; c has cc==0 1.1898 + newData.addToStartSet(c, c2, errorCode); 1.1899 + } 1.1900 + } 1.1901 + if(newValue!=oldValue) { 1.1902 + utrie2_set32(newData.trie, c, newValue, &errorCode); 1.1903 + } 1.1904 + } 1.1905 +} 1.1906 + 1.1907 +UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { 1.1908 + // Logically const: Synchronized instantiation. 1.1909 + Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); 1.1910 + umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); 1.1911 + return U_SUCCESS(errorCode); 1.1912 +} 1.1913 + 1.1914 +int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { 1.1915 + return (int32_t)utrie2_get32(fCanonIterData->trie, c); 1.1916 +} 1.1917 + 1.1918 +const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { 1.1919 + return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; 1.1920 +} 1.1921 + 1.1922 +UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { 1.1923 + return getCanonValue(c)>=0; 1.1924 +} 1.1925 + 1.1926 +UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { 1.1927 + int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; 1.1928 + if(canonValue==0) { 1.1929 + return FALSE; 1.1930 + } 1.1931 + set.clear(); 1.1932 + int32_t value=canonValue&CANON_VALUE_MASK; 1.1933 + if((canonValue&CANON_HAS_SET)!=0) { 1.1934 + set.addAll(getCanonStartSet(value)); 1.1935 + } else if(value!=0) { 1.1936 + set.add(value); 1.1937 + } 1.1938 + if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 1.1939 + uint16_t norm16=getNorm16(c); 1.1940 + 
if(norm16==JAMO_L) { 1.1941 + UChar32 syllable= 1.1942 + (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); 1.1943 + set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); 1.1944 + } else { 1.1945 + addComposites(getCompositionsList(norm16), set); 1.1946 + } 1.1947 + } 1.1948 + return TRUE; 1.1949 +} 1.1950 + 1.1951 +U_NAMESPACE_END 1.1952 + 1.1953 +// Normalizer2 data swapping ----------------------------------------------- *** 1.1954 + 1.1955 +U_NAMESPACE_USE 1.1956 + 1.1957 +U_CAPI int32_t U_EXPORT2 1.1958 +unorm2_swap(const UDataSwapper *ds, 1.1959 + const void *inData, int32_t length, void *outData, 1.1960 + UErrorCode *pErrorCode) { 1.1961 + const UDataInfo *pInfo; 1.1962 + int32_t headerSize; 1.1963 + 1.1964 + const uint8_t *inBytes; 1.1965 + uint8_t *outBytes; 1.1966 + 1.1967 + const int32_t *inIndexes; 1.1968 + int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; 1.1969 + 1.1970 + int32_t i, offset, nextOffset, size; 1.1971 + 1.1972 + /* udata_swapDataHeader checks the arguments */ 1.1973 + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); 1.1974 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.1975 + return 0; 1.1976 + } 1.1977 + 1.1978 + /* check data format and format version */ 1.1979 + pInfo=(const UDataInfo *)((const char *)inData+4); 1.1980 + if(!( 1.1981 + pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ 1.1982 + pInfo->dataFormat[1]==0x72 && 1.1983 + pInfo->dataFormat[2]==0x6d && 1.1984 + pInfo->dataFormat[3]==0x32 && 1.1985 + (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) 1.1986 + )) { 1.1987 + udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", 1.1988 + pInfo->dataFormat[0], pInfo->dataFormat[1], 1.1989 + pInfo->dataFormat[2], pInfo->dataFormat[3], 1.1990 + pInfo->formatVersion[0]); 1.1991 + *pErrorCode=U_UNSUPPORTED_ERROR; 1.1992 + return 0; 1.1993 + } 1.1994 + 1.1995 + inBytes=(const uint8_t 
*)inData+headerSize; 1.1996 + outBytes=(uint8_t *)outData+headerSize; 1.1997 + 1.1998 + inIndexes=(const int32_t *)inBytes; 1.1999 + 1.2000 + if(length>=0) { 1.2001 + length-=headerSize; 1.2002 + if(length<(int32_t)sizeof(indexes)) { 1.2003 + udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", 1.2004 + length); 1.2005 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.2006 + return 0; 1.2007 + } 1.2008 + } 1.2009 + 1.2010 + /* read the first few indexes */ 1.2011 + for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { 1.2012 + indexes[i]=udata_readInt32(ds, inIndexes[i]); 1.2013 + } 1.2014 + 1.2015 + /* get the total length of the data */ 1.2016 + size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 1.2017 + 1.2018 + if(length>=0) { 1.2019 + if(length<size) { 1.2020 + udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", 1.2021 + length); 1.2022 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.2023 + return 0; 1.2024 + } 1.2025 + 1.2026 + /* copy the data for inaccessible bytes */ 1.2027 + if(inBytes!=outBytes) { 1.2028 + uprv_memcpy(outBytes, inBytes, size); 1.2029 + } 1.2030 + 1.2031 + offset=0; 1.2032 + 1.2033 + /* swap the int32_t indexes[] */ 1.2034 + nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; 1.2035 + ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); 1.2036 + offset=nextOffset; 1.2037 + 1.2038 + /* swap the UTrie2 */ 1.2039 + nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; 1.2040 + utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 1.2041 + offset=nextOffset; 1.2042 + 1.2043 + /* swap the uint16_t extraData[] */ 1.2044 + nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; 1.2045 + ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); 1.2046 + offset=nextOffset; 1.2047 + 1.2048 + /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ 1.2049 + 
nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; 1.2050 + offset=nextOffset; 1.2051 + 1.2052 + U_ASSERT(offset==size); 1.2053 + } 1.2054 + 1.2055 + return headerSize+size; 1.2056 +} 1.2057 + 1.2058 +#endif // !UCONFIG_NO_NORMALIZATION