intl/icu/source/common/normalizer2impl.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2009-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: normalizer2impl.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2009nov22
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18
michael@0 19 #if !UCONFIG_NO_NORMALIZATION
michael@0 20
michael@0 21 #include "unicode/normalizer2.h"
michael@0 22 #include "unicode/udata.h"
michael@0 23 #include "unicode/ustring.h"
michael@0 24 #include "unicode/utf16.h"
michael@0 25 #include "cmemory.h"
michael@0 26 #include "mutex.h"
michael@0 27 #include "normalizer2impl.h"
michael@0 28 #include "putilimp.h"
michael@0 29 #include "uassert.h"
michael@0 30 #include "uset_imp.h"
michael@0 31 #include "utrie2.h"
michael@0 32 #include "uvector.h"
michael@0 33
michael@0 34 U_NAMESPACE_BEGIN
michael@0 35
michael@0 36 // ReorderingBuffer -------------------------------------------------------- ***
michael@0 37
michael@0 38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
michael@0 39 int32_t length=str.length();
michael@0 40 start=str.getBuffer(destCapacity);
michael@0 41 if(start==NULL) {
michael@0 42 // getBuffer() already did str.setToBogus()
michael@0 43 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 44 return FALSE;
michael@0 45 }
michael@0 46 limit=start+length;
michael@0 47 remainingCapacity=str.getCapacity()-length;
michael@0 48 reorderStart=start;
michael@0 49 if(start==limit) {
michael@0 50 lastCC=0;
michael@0 51 } else {
michael@0 52 setIterator();
michael@0 53 lastCC=previousCC();
michael@0 54 // Set reorderStart after the last code point with cc<=1 if there is one.
michael@0 55 if(lastCC>1) {
michael@0 56 while(previousCC()>1) {}
michael@0 57 }
michael@0 58 reorderStart=codePointLimit;
michael@0 59 }
michael@0 60 return TRUE;
michael@0 61 }
michael@0 62
michael@0 63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
michael@0 64 int32_t length=(int32_t)(limit-start);
michael@0 65 return
michael@0 66 length==(int32_t)(otherLimit-otherStart) &&
michael@0 67 0==u_memcmp(start, otherStart, length);
michael@0 68 }
michael@0 69
michael@0 70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
michael@0 71 if(remainingCapacity<2 && !resize(2, errorCode)) {
michael@0 72 return FALSE;
michael@0 73 }
michael@0 74 if(lastCC<=cc || cc==0) {
michael@0 75 limit[0]=U16_LEAD(c);
michael@0 76 limit[1]=U16_TRAIL(c);
michael@0 77 limit+=2;
michael@0 78 lastCC=cc;
michael@0 79 if(cc<=1) {
michael@0 80 reorderStart=limit;
michael@0 81 }
michael@0 82 } else {
michael@0 83 insert(c, cc);
michael@0 84 }
michael@0 85 remainingCapacity-=2;
michael@0 86 return TRUE;
michael@0 87 }
michael@0 88
michael@0 89 UBool ReorderingBuffer::append(const UChar *s, int32_t length,
michael@0 90 uint8_t leadCC, uint8_t trailCC,
michael@0 91 UErrorCode &errorCode) {
michael@0 92 if(length==0) {
michael@0 93 return TRUE;
michael@0 94 }
michael@0 95 if(remainingCapacity<length && !resize(length, errorCode)) {
michael@0 96 return FALSE;
michael@0 97 }
michael@0 98 remainingCapacity-=length;
michael@0 99 if(lastCC<=leadCC || leadCC==0) {
michael@0 100 if(trailCC<=1) {
michael@0 101 reorderStart=limit+length;
michael@0 102 } else if(leadCC<=1) {
michael@0 103 reorderStart=limit+1; // Ok if not a code point boundary.
michael@0 104 }
michael@0 105 const UChar *sLimit=s+length;
michael@0 106 do { *limit++=*s++; } while(s!=sLimit);
michael@0 107 lastCC=trailCC;
michael@0 108 } else {
michael@0 109 int32_t i=0;
michael@0 110 UChar32 c;
michael@0 111 U16_NEXT(s, i, length, c);
michael@0 112 insert(c, leadCC); // insert first code point
michael@0 113 while(i<length) {
michael@0 114 U16_NEXT(s, i, length, c);
michael@0 115 if(i<length) {
michael@0 116 // s must be in NFD, otherwise we need to use getCC().
michael@0 117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
michael@0 118 } else {
michael@0 119 leadCC=trailCC;
michael@0 120 }
michael@0 121 append(c, leadCC, errorCode);
michael@0 122 }
michael@0 123 }
michael@0 124 return TRUE;
michael@0 125 }
michael@0 126
michael@0 127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
michael@0 128 int32_t cpLength=U16_LENGTH(c);
michael@0 129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
michael@0 130 return FALSE;
michael@0 131 }
michael@0 132 remainingCapacity-=cpLength;
michael@0 133 if(cpLength==1) {
michael@0 134 *limit++=(UChar)c;
michael@0 135 } else {
michael@0 136 limit[0]=U16_LEAD(c);
michael@0 137 limit[1]=U16_TRAIL(c);
michael@0 138 limit+=2;
michael@0 139 }
michael@0 140 lastCC=0;
michael@0 141 reorderStart=limit;
michael@0 142 return TRUE;
michael@0 143 }
michael@0 144
michael@0 145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
michael@0 146 if(s==sLimit) {
michael@0 147 return TRUE;
michael@0 148 }
michael@0 149 int32_t length=(int32_t)(sLimit-s);
michael@0 150 if(remainingCapacity<length && !resize(length, errorCode)) {
michael@0 151 return FALSE;
michael@0 152 }
michael@0 153 u_memcpy(limit, s, length);
michael@0 154 limit+=length;
michael@0 155 remainingCapacity-=length;
michael@0 156 lastCC=0;
michael@0 157 reorderStart=limit;
michael@0 158 return TRUE;
michael@0 159 }
michael@0 160
michael@0 161 void ReorderingBuffer::remove() {
michael@0 162 reorderStart=limit=start;
michael@0 163 remainingCapacity=str.getCapacity();
michael@0 164 lastCC=0;
michael@0 165 }
michael@0 166
michael@0 167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
michael@0 168 if(suffixLength<(limit-start)) {
michael@0 169 limit-=suffixLength;
michael@0 170 remainingCapacity+=suffixLength;
michael@0 171 } else {
michael@0 172 limit=start;
michael@0 173 remainingCapacity=str.getCapacity();
michael@0 174 }
michael@0 175 lastCC=0;
michael@0 176 reorderStart=limit;
michael@0 177 }
michael@0 178
michael@0 179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
michael@0 180 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
michael@0 181 int32_t length=(int32_t)(limit-start);
michael@0 182 str.releaseBuffer(length);
michael@0 183 int32_t newCapacity=length+appendLength;
michael@0 184 int32_t doubleCapacity=2*str.getCapacity();
michael@0 185 if(newCapacity<doubleCapacity) {
michael@0 186 newCapacity=doubleCapacity;
michael@0 187 }
michael@0 188 if(newCapacity<256) {
michael@0 189 newCapacity=256;
michael@0 190 }
michael@0 191 start=str.getBuffer(newCapacity);
michael@0 192 if(start==NULL) {
michael@0 193 // getBuffer() already did str.setToBogus()
michael@0 194 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 195 return FALSE;
michael@0 196 }
michael@0 197 reorderStart=start+reorderStartIndex;
michael@0 198 limit=start+length;
michael@0 199 remainingCapacity=str.getCapacity()-length;
michael@0 200 return TRUE;
michael@0 201 }
michael@0 202
michael@0 203 void ReorderingBuffer::skipPrevious() {
michael@0 204 codePointLimit=codePointStart;
michael@0 205 UChar c=*--codePointStart;
michael@0 206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
michael@0 207 --codePointStart;
michael@0 208 }
michael@0 209 }
michael@0 210
michael@0 211 uint8_t ReorderingBuffer::previousCC() {
michael@0 212 codePointLimit=codePointStart;
michael@0 213 if(reorderStart>=codePointStart) {
michael@0 214 return 0;
michael@0 215 }
michael@0 216 UChar32 c=*--codePointStart;
michael@0 217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
michael@0 218 return 0;
michael@0 219 }
michael@0 220
michael@0 221 UChar c2;
michael@0 222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
michael@0 223 --codePointStart;
michael@0 224 c=U16_GET_SUPPLEMENTARY(c2, c);
michael@0 225 }
michael@0 226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
michael@0 227 }
michael@0 228
michael@0 229 // Inserts c somewhere before the last character.
michael@0 230 // Requires 0<cc<lastCC which implies reorderStart<limit.
michael@0 231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
michael@0 232 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
michael@0 233 // insert c at codePointLimit, after the character with prevCC<=cc
michael@0 234 UChar *q=limit;
michael@0 235 UChar *r=limit+=U16_LENGTH(c);
michael@0 236 do {
michael@0 237 *--r=*--q;
michael@0 238 } while(codePointLimit!=q);
michael@0 239 writeCodePoint(q, c);
michael@0 240 if(cc<=1) {
michael@0 241 reorderStart=r;
michael@0 242 }
michael@0 243 }
michael@0 244
michael@0 245 // Normalizer2Impl --------------------------------------------------------- ***
michael@0 246
michael@0 247 struct CanonIterData : public UMemory {
michael@0 248 CanonIterData(UErrorCode &errorCode);
michael@0 249 ~CanonIterData();
michael@0 250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
michael@0 251 UTrie2 *trie;
michael@0 252 UVector canonStartSets; // contains UnicodeSet *
michael@0 253 };
michael@0 254
michael@0 255 Normalizer2Impl::~Normalizer2Impl() {
michael@0 256 udata_close(memory);
michael@0 257 utrie2_close(normTrie);
michael@0 258 delete fCanonIterData;
michael@0 259 }
michael@0 260
michael@0 261 UBool U_CALLCONV
michael@0 262 Normalizer2Impl::isAcceptable(void *context,
michael@0 263 const char * /* type */, const char * /*name*/,
michael@0 264 const UDataInfo *pInfo) {
michael@0 265 if(
michael@0 266 pInfo->size>=20 &&
michael@0 267 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
michael@0 268 pInfo->charsetFamily==U_CHARSET_FAMILY &&
michael@0 269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
michael@0 270 pInfo->dataFormat[1]==0x72 &&
michael@0 271 pInfo->dataFormat[2]==0x6d &&
michael@0 272 pInfo->dataFormat[3]==0x32 &&
michael@0 273 pInfo->formatVersion[0]==2
michael@0 274 ) {
michael@0 275 Normalizer2Impl *me=(Normalizer2Impl *)context;
michael@0 276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
michael@0 277 return TRUE;
michael@0 278 } else {
michael@0 279 return FALSE;
michael@0 280 }
michael@0 281 }
michael@0 282
michael@0 283 void
michael@0 284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
michael@0 285 if(U_FAILURE(errorCode)) {
michael@0 286 return;
michael@0 287 }
michael@0 288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
michael@0 289 if(U_FAILURE(errorCode)) {
michael@0 290 return;
michael@0 291 }
michael@0 292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
michael@0 293 const int32_t *inIndexes=(const int32_t *)inBytes;
michael@0 294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
michael@0 295 if(indexesLength<=IX_MIN_MAYBE_YES) {
michael@0 296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
michael@0 297 return;
michael@0 298 }
michael@0 299
michael@0 300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
michael@0 301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
michael@0 302
michael@0 303 minYesNo=inIndexes[IX_MIN_YES_NO];
michael@0 304 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
michael@0 305 minNoNo=inIndexes[IX_MIN_NO_NO];
michael@0 306 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
michael@0 307 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
michael@0 308
michael@0 309 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
michael@0 310 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
michael@0 311 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
michael@0 312 inBytes+offset, nextOffset-offset, NULL,
michael@0 313 &errorCode);
michael@0 314 if(U_FAILURE(errorCode)) {
michael@0 315 return;
michael@0 316 }
michael@0 317
michael@0 318 offset=nextOffset;
michael@0 319 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
michael@0 320 maybeYesCompositions=(const uint16_t *)(inBytes+offset);
michael@0 321 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
michael@0 322
michael@0 323 // smallFCD: new in formatVersion 2
michael@0 324 offset=nextOffset;
michael@0 325 smallFCD=inBytes+offset;
michael@0 326
michael@0 327 // Build tccc180[].
michael@0 328 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
michael@0 329 uint8_t bits=0;
michael@0 330 for(UChar c=0; c<0x180; bits>>=1) {
michael@0 331 if((c&0xff)==0) {
michael@0 332 bits=smallFCD[c>>8]; // one byte per 0x100 code points
michael@0 333 }
michael@0 334 if(bits&1) {
michael@0 335 for(int i=0; i<0x20; ++i, ++c) {
michael@0 336 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
michael@0 337 }
michael@0 338 } else {
michael@0 339 uprv_memset(tccc180+c, 0, 0x20);
michael@0 340 c+=0x20;
michael@0 341 }
michael@0 342 }
michael@0 343 }
michael@0 344
michael@0 345 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
michael@0 346 UChar32 c;
michael@0 347 if(cpStart==(cpLimit-1)) {
michael@0 348 c=*cpStart;
michael@0 349 } else {
michael@0 350 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
michael@0 351 }
michael@0 352 uint16_t prevNorm16=getNorm16(c);
michael@0 353 if(prevNorm16<=minYesNo) {
michael@0 354 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
michael@0 355 } else {
michael@0 356 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
michael@0 357 }
michael@0 358 }
michael@0 359
michael@0 360 U_CDECL_BEGIN
michael@0 361
michael@0 362 static UBool U_CALLCONV
michael@0 363 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
michael@0 364 /* add the start code point to the USet */
michael@0 365 const USetAdder *sa=(const USetAdder *)context;
michael@0 366 sa->add(sa->set, start);
michael@0 367 return TRUE;
michael@0 368 }
michael@0 369
michael@0 370 static uint32_t U_CALLCONV
michael@0 371 segmentStarterMapper(const void * /*context*/, uint32_t value) {
michael@0 372 return value&CANON_NOT_SEGMENT_STARTER;
michael@0 373 }
michael@0 374
michael@0 375 U_CDECL_END
michael@0 376
michael@0 377 void
michael@0 378 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
michael@0 379 /* add the start code point of each same-value range of each trie */
michael@0 380 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
michael@0 381
michael@0 382 /* add Hangul LV syllables and LV+1 because of skippables */
michael@0 383 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
michael@0 384 sa->add(sa->set, c);
michael@0 385 sa->add(sa->set, c+1);
michael@0 386 }
michael@0 387 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
michael@0 388 }
michael@0 389
michael@0 390 void
michael@0 391 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
michael@0 392 /* add the start code point of each same-value range of the canonical iterator data trie */
michael@0 393 if(ensureCanonIterData(errorCode)) {
michael@0 394 // currently only used for the SEGMENT_STARTER property
michael@0 395 utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
michael@0 396 }
michael@0 397 }
michael@0 398
michael@0 399 const UChar *
michael@0 400 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
michael@0 401 UChar32 minNeedDataCP,
michael@0 402 ReorderingBuffer *buffer,
michael@0 403 UErrorCode &errorCode) const {
michael@0 404 // Make some effort to support NUL-terminated strings reasonably.
michael@0 405 // Take the part of the fast quick check loop that does not look up
michael@0 406 // data and check the first part of the string.
michael@0 407 // After this prefix, determine the string length to simplify the rest
michael@0 408 // of the code.
michael@0 409 const UChar *prevSrc=src;
michael@0 410 UChar c;
michael@0 411 while((c=*src++)<minNeedDataCP && c!=0) {}
michael@0 412 // Back out the last character for full processing.
michael@0 413 // Copy this prefix.
michael@0 414 if(--src!=prevSrc) {
michael@0 415 if(buffer!=NULL) {
michael@0 416 buffer->appendZeroCC(prevSrc, src, errorCode);
michael@0 417 }
michael@0 418 }
michael@0 419 return src;
michael@0 420 }
michael@0 421
michael@0 422 // Dual functionality:
michael@0 423 // buffer!=NULL: normalize
michael@0 424 // buffer==NULL: isNormalized/spanQuickCheckYes
michael@0 425 const UChar *
michael@0 426 Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
michael@0 427 ReorderingBuffer *buffer,
michael@0 428 UErrorCode &errorCode) const {
michael@0 429 UChar32 minNoCP=minDecompNoCP;
michael@0 430 if(limit==NULL) {
michael@0 431 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
michael@0 432 if(U_FAILURE(errorCode)) {
michael@0 433 return src;
michael@0 434 }
michael@0 435 limit=u_strchr(src, 0);
michael@0 436 }
michael@0 437
michael@0 438 const UChar *prevSrc;
michael@0 439 UChar32 c=0;
michael@0 440 uint16_t norm16=0;
michael@0 441
michael@0 442 // only for quick check
michael@0 443 const UChar *prevBoundary=src;
michael@0 444 uint8_t prevCC=0;
michael@0 445
michael@0 446 for(;;) {
michael@0 447 // count code units below the minimum or with irrelevant data for the quick check
michael@0 448 for(prevSrc=src; src!=limit;) {
michael@0 449 if( (c=*src)<minNoCP ||
michael@0 450 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
michael@0 451 ) {
michael@0 452 ++src;
michael@0 453 } else if(!U16_IS_SURROGATE(c)) {
michael@0 454 break;
michael@0 455 } else {
michael@0 456 UChar c2;
michael@0 457 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 458 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
michael@0 459 c=U16_GET_SUPPLEMENTARY(c, c2);
michael@0 460 }
michael@0 461 } else /* trail surrogate */ {
michael@0 462 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
michael@0 463 --src;
michael@0 464 c=U16_GET_SUPPLEMENTARY(c2, c);
michael@0 465 }
michael@0 466 }
michael@0 467 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
michael@0 468 src+=U16_LENGTH(c);
michael@0 469 } else {
michael@0 470 break;
michael@0 471 }
michael@0 472 }
michael@0 473 }
michael@0 474 // copy these code units all at once
michael@0 475 if(src!=prevSrc) {
michael@0 476 if(buffer!=NULL) {
michael@0 477 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
michael@0 478 break;
michael@0 479 }
michael@0 480 } else {
michael@0 481 prevCC=0;
michael@0 482 prevBoundary=src;
michael@0 483 }
michael@0 484 }
michael@0 485 if(src==limit) {
michael@0 486 break;
michael@0 487 }
michael@0 488
michael@0 489 // Check one above-minimum, relevant code point.
michael@0 490 src+=U16_LENGTH(c);
michael@0 491 if(buffer!=NULL) {
michael@0 492 if(!decompose(c, norm16, *buffer, errorCode)) {
michael@0 493 break;
michael@0 494 }
michael@0 495 } else {
michael@0 496 if(isDecompYes(norm16)) {
michael@0 497 uint8_t cc=getCCFromYesOrMaybe(norm16);
michael@0 498 if(prevCC<=cc || cc==0) {
michael@0 499 prevCC=cc;
michael@0 500 if(cc<=1) {
michael@0 501 prevBoundary=src;
michael@0 502 }
michael@0 503 continue;
michael@0 504 }
michael@0 505 }
michael@0 506 return prevBoundary; // "no" or cc out of order
michael@0 507 }
michael@0 508 }
michael@0 509 return src;
michael@0 510 }
michael@0 511
michael@0 512 // Decompose a short piece of text which is likely to contain characters that
michael@0 513 // fail the quick check loop and/or where the quick check loop's overhead
michael@0 514 // is unlikely to be amortized.
michael@0 515 // Called by the compose() and makeFCD() implementations.
michael@0 516 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
michael@0 517 ReorderingBuffer &buffer,
michael@0 518 UErrorCode &errorCode) const {
michael@0 519 while(src<limit) {
michael@0 520 UChar32 c;
michael@0 521 uint16_t norm16;
michael@0 522 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
michael@0 523 if(!decompose(c, norm16, buffer, errorCode)) {
michael@0 524 return FALSE;
michael@0 525 }
michael@0 526 }
michael@0 527 return TRUE;
michael@0 528 }
michael@0 529
michael@0 530 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
michael@0 531 ReorderingBuffer &buffer,
michael@0 532 UErrorCode &errorCode) const {
michael@0 533 // Only loops for 1:1 algorithmic mappings.
michael@0 534 for(;;) {
michael@0 535 // get the decomposition and the lead and trail cc's
michael@0 536 if(isDecompYes(norm16)) {
michael@0 537 // c does not decompose
michael@0 538 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
michael@0 539 } else if(isHangul(norm16)) {
michael@0 540 // Hangul syllable: decompose algorithmically
michael@0 541 UChar jamos[3];
michael@0 542 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
michael@0 543 } else if(isDecompNoAlgorithmic(norm16)) {
michael@0 544 c=mapAlgorithmic(c, norm16);
michael@0 545 norm16=getNorm16(c);
michael@0 546 } else {
michael@0 547 // c decomposes, get everything from the variable-length extra data
michael@0 548 const uint16_t *mapping=getMapping(norm16);
michael@0 549 uint16_t firstUnit=*mapping;
michael@0 550 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
michael@0 551 uint8_t leadCC, trailCC;
michael@0 552 trailCC=(uint8_t)(firstUnit>>8);
michael@0 553 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
michael@0 554 leadCC=(uint8_t)(*(mapping-1)>>8);
michael@0 555 } else {
michael@0 556 leadCC=0;
michael@0 557 }
michael@0 558 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
michael@0 559 }
michael@0 560 }
michael@0 561 }
michael@0 562
michael@0 563 const UChar *
michael@0 564 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
michael@0 565 const UChar *decomp=NULL;
michael@0 566 uint16_t norm16;
michael@0 567 for(;;) {
michael@0 568 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
michael@0 569 // c does not decompose
michael@0 570 return decomp;
michael@0 571 } else if(isHangul(norm16)) {
michael@0 572 // Hangul syllable: decompose algorithmically
michael@0 573 length=Hangul::decompose(c, buffer);
michael@0 574 return buffer;
michael@0 575 } else if(isDecompNoAlgorithmic(norm16)) {
michael@0 576 c=mapAlgorithmic(c, norm16);
michael@0 577 decomp=buffer;
michael@0 578 length=0;
michael@0 579 U16_APPEND_UNSAFE(buffer, length, c);
michael@0 580 } else {
michael@0 581 // c decomposes, get everything from the variable-length extra data
michael@0 582 const uint16_t *mapping=getMapping(norm16);
michael@0 583 length=*mapping&MAPPING_LENGTH_MASK;
michael@0 584 return (const UChar *)mapping+1;
michael@0 585 }
michael@0 586 }
michael@0 587 }
michael@0 588
michael@0 589 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
michael@0 590 // so that a raw mapping fits that consists of one unit ("rm0")
michael@0 591 // plus all but the first two code units of the normal mapping.
michael@0 592 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
michael@0 593 const UChar *
michael@0 594 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
michael@0 595 // We do not loop in this method because an algorithmic mapping itself
michael@0 596 // becomes a final result rather than having to be decomposed recursively.
michael@0 597 uint16_t norm16;
michael@0 598 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
michael@0 599 // c does not decompose
michael@0 600 return NULL;
michael@0 601 } else if(isHangul(norm16)) {
michael@0 602 // Hangul syllable: decompose algorithmically
michael@0 603 Hangul::getRawDecomposition(c, buffer);
michael@0 604 length=2;
michael@0 605 return buffer;
michael@0 606 } else if(isDecompNoAlgorithmic(norm16)) {
michael@0 607 c=mapAlgorithmic(c, norm16);
michael@0 608 length=0;
michael@0 609 U16_APPEND_UNSAFE(buffer, length, c);
michael@0 610 return buffer;
michael@0 611 } else {
michael@0 612 // c decomposes, get everything from the variable-length extra data
michael@0 613 const uint16_t *mapping=getMapping(norm16);
michael@0 614 uint16_t firstUnit=*mapping;
michael@0 615 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
michael@0 616 if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
michael@0 617 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
michael@0 618 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
michael@0 619 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
michael@0 620 uint16_t rm0=*rawMapping;
michael@0 621 if(rm0<=MAPPING_LENGTH_MASK) {
michael@0 622 length=rm0;
michael@0 623 return (const UChar *)rawMapping-rm0;
michael@0 624 } else {
michael@0 625 // Copy the normal mapping and replace its first two code units with rm0.
michael@0 626 buffer[0]=(UChar)rm0;
michael@0 627 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
michael@0 628 length=mLength-1;
michael@0 629 return buffer;
michael@0 630 }
michael@0 631 } else {
michael@0 632 length=mLength;
michael@0 633 return (const UChar *)mapping+1;
michael@0 634 }
michael@0 635 }
michael@0 636 }
michael@0 637
michael@0 638 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
michael@0 639 UBool doDecompose,
michael@0 640 UnicodeString &safeMiddle,
michael@0 641 ReorderingBuffer &buffer,
michael@0 642 UErrorCode &errorCode) const {
michael@0 643 buffer.copyReorderableSuffixTo(safeMiddle);
michael@0 644 if(doDecompose) {
michael@0 645 decompose(src, limit, &buffer, errorCode);
michael@0 646 return;
michael@0 647 }
michael@0 648 // Just merge the strings at the boundary.
michael@0 649 ForwardUTrie2StringIterator iter(normTrie, src, limit);
michael@0 650 uint8_t firstCC, prevCC, cc;
michael@0 651 firstCC=prevCC=cc=getCC(iter.next16());
michael@0 652 while(cc!=0) {
michael@0 653 prevCC=cc;
michael@0 654 cc=getCC(iter.next16());
michael@0 655 };
michael@0 656 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
michael@0 657 limit=u_strchr(iter.codePointStart, 0);
michael@0 658 }
michael@0 659
michael@0 660 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
michael@0 661 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
michael@0 662 }
michael@0 663 }
michael@0 664
michael@0 665 // Note: hasDecompBoundary() could be implemented as aliases to
michael@0 666 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
michael@0 667 // at the cost of building the FCD trie for a decomposition normalizer.
michael@0 668 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
michael@0 669 for(;;) {
michael@0 670 if(c<minDecompNoCP) {
michael@0 671 return TRUE;
michael@0 672 }
michael@0 673 uint16_t norm16=getNorm16(c);
michael@0 674 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
michael@0 675 return TRUE;
michael@0 676 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
michael@0 677 return FALSE; // ccc!=0
michael@0 678 } else if(isDecompNoAlgorithmic(norm16)) {
michael@0 679 c=mapAlgorithmic(c, norm16);
michael@0 680 } else {
michael@0 681 // c decomposes, get everything from the variable-length extra data
michael@0 682 const uint16_t *mapping=getMapping(norm16);
michael@0 683 uint16_t firstUnit=*mapping;
michael@0 684 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
michael@0 685 return FALSE;
michael@0 686 }
michael@0 687 if(!before) {
michael@0 688 // decomp after-boundary: same as hasFCDBoundaryAfter(),
michael@0 689 // fcd16<=1 || trailCC==0
michael@0 690 if(firstUnit>0x1ff) {
michael@0 691 return FALSE; // trailCC>1
michael@0 692 }
michael@0 693 if(firstUnit<=0xff) {
michael@0 694 return TRUE; // trailCC==0
michael@0 695 }
michael@0 696 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
michael@0 697 }
michael@0 698 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
michael@0 699 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
michael@0 700 }
michael@0 701 }
michael@0 702 }
michael@0 703
michael@0 704 /*
michael@0 705 * Finds the recomposition result for
michael@0 706 * a forward-combining "lead" character,
michael@0 707 * specified with a pointer to its compositions list,
michael@0 708 * and a backward-combining "trail" character.
michael@0 709 *
michael@0 710 * If the lead and trail characters combine, then this function returns
michael@0 711 * the following "compositeAndFwd" value:
michael@0 712 * Bits 21..1 composite character
michael@0 713 * Bit 0 set if the composite is a forward-combining starter
michael@0 714 * otherwise it returns -1.
michael@0 715 *
michael@0 716 * The compositions list has (trail, compositeAndFwd) pair entries,
michael@0 717 * encoded as either pairs or triples of 16-bit units.
michael@0 718 * The last entry has the high bit of its first unit set.
michael@0 719 *
michael@0 720 * The list is sorted by ascending trail characters (there are no duplicates).
michael@0 721 * A linear search is used.
michael@0 722 *
michael@0 723 * See normalizer2impl.h for a more detailed description
michael@0 724 * of the compositions list format.
michael@0 725 */
michael@0 726 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
michael@0 727 uint16_t key1, firstUnit;
michael@0 728 if(trail<COMP_1_TRAIL_LIMIT) {
michael@0 729 // trail character is 0..33FF
michael@0 730 // result entry may have 2 or 3 units
michael@0 731 key1=(uint16_t)(trail<<1);
michael@0 732 while(key1>(firstUnit=*list)) {
michael@0 733 list+=2+(firstUnit&COMP_1_TRIPLE);
michael@0 734 }
michael@0 735 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
michael@0 736 if(firstUnit&COMP_1_TRIPLE) {
michael@0 737 return ((int32_t)list[1]<<16)|list[2];
michael@0 738 } else {
michael@0 739 return list[1];
michael@0 740 }
michael@0 741 }
michael@0 742 } else {
michael@0 743 // trail character is 3400..10FFFF
michael@0 744 // result entry has 3 units
michael@0 745 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
michael@0 746 (((trail>>COMP_1_TRAIL_SHIFT))&
michael@0 747 ~COMP_1_TRIPLE));
michael@0 748 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
michael@0 749 uint16_t secondUnit;
michael@0 750 for(;;) {
michael@0 751 if(key1>(firstUnit=*list)) {
michael@0 752 list+=2+(firstUnit&COMP_1_TRIPLE);
michael@0 753 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
michael@0 754 if(key2>(secondUnit=list[1])) {
michael@0 755 if(firstUnit&COMP_1_LAST_TUPLE) {
michael@0 756 break;
michael@0 757 } else {
michael@0 758 list+=3;
michael@0 759 }
michael@0 760 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
michael@0 761 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
michael@0 762 } else {
michael@0 763 break;
michael@0 764 }
michael@0 765 } else {
michael@0 766 break;
michael@0 767 }
michael@0 768 }
michael@0 769 }
michael@0 770 return -1;
michael@0 771 }
michael@0 772
michael@0 773 /**
michael@0 774 * @param list some character's compositions list
michael@0 775 * @param set recursively receives the composites from these compositions
michael@0 776 */
michael@0 777 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
michael@0 778 uint16_t firstUnit;
michael@0 779 int32_t compositeAndFwd;
michael@0 780 do {
michael@0 781 firstUnit=*list;
michael@0 782 if((firstUnit&COMP_1_TRIPLE)==0) {
michael@0 783 compositeAndFwd=list[1];
michael@0 784 list+=2;
michael@0 785 } else {
michael@0 786 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
michael@0 787 list+=3;
michael@0 788 }
michael@0 789 UChar32 composite=compositeAndFwd>>1;
michael@0 790 if((compositeAndFwd&1)!=0) {
michael@0 791 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
michael@0 792 }
michael@0 793 set.add(composite);
michael@0 794 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
michael@0 795 }
michael@0 796
michael@0 797 /*
michael@0 798 * Recomposes the buffer text starting at recomposeStartIndex
michael@0 799 * (which is in NFD - decomposed and canonically ordered),
michael@0 800 * and truncates the buffer contents.
michael@0 801 *
michael@0 802 * Note that recomposition never lengthens the text:
michael@0 803 * Any character consists of either one or two code units;
michael@0 804 * a composition may contain at most one more code unit than the original starter,
michael@0 805 * while the combining mark that is removed has at least one code unit.
michael@0 806 */
// Parameters:
//   buffer              - edited in place; only the code units from
//                         getStart()+recomposeStartIndex up to getLimit() are read and rewritten.
//                         The new end of text is reported with setReorderingLimit().
//   recomposeStartIndex - code unit offset from buffer.getStart() where recomposition begins.
//   onlyContiguous      - if TRUE, a character with cc!=0 that did not combine clears
//                         compositionsList, so later marks cannot combine with the earlier starter.
// Returns early, without calling setReorderingLimit(), when the range is empty.
michael@0 807 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
michael@0 808 UBool onlyContiguous) const {
michael@0 809 UChar *p=buffer.getStart()+recomposeStartIndex;
michael@0 810 UChar *limit=buffer.getLimit();
michael@0 811 if(p==limit) {
michael@0 812 return;
michael@0 813 }
michael@0 814
// starter: start of the most recent forward-combining starter (valid while compositionsList!=NULL).
// pRemove: start of the code units to delete after a composition; q/r: copy destination/source.
michael@0 815 UChar *starter, *pRemove, *q, *r;
michael@0 816 const uint16_t *compositionsList;
michael@0 817 UChar32 c, compositeAndFwd;
michael@0 818 uint16_t norm16;
michael@0 819 uint8_t cc, prevCC;
michael@0 820 UBool starterIsSupplementary;
michael@0 821
michael@0 822 // Some of the following variables are not used until we have a forward-combining starter
michael@0 823 // and are only initialized now to avoid compiler warnings.
michael@0 824 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
michael@0 825 starter=NULL;
michael@0 826 starterIsSupplementary=FALSE;
michael@0 827 prevCC=0;
michael@0 828
michael@0 829 for(;;) {
// Fetch the next code point c and its norm16; afterwards p is the limit of c
// (the code below recovers the start of c as p-1 or p-U16_LENGTH(c)).
michael@0 830 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
michael@0 831 cc=getCCFromYesOrMaybe(norm16);
michael@0 832 if( // this character combines backward and
michael@0 833 isMaybe(norm16) &&
michael@0 834 // we have seen a starter that combines forward and
michael@0 835 compositionsList!=NULL &&
michael@0 836 // the backward-combining character is not blocked
michael@0 837 (prevCC<cc || prevCC==0)
michael@0 838 ) {
michael@0 839 if(isJamoVT(norm16)) {
michael@0 840 // c is a Jamo V/T, see if we can compose it with the previous character.
michael@0 841 if(c<Hangul::JAMO_T_BASE) {
michael@0 842 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
// The UChar cast makes any starter below JAMO_L_BASE wrap to a large value,
// so the single "<JAMO_L_COUNT" test checks both ends of the Jamo L range.
michael@0 843 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
michael@0 844 if(prev<Hangul::JAMO_L_COUNT) {
// The Jamo V occupies exactly one code unit, ending at p.
michael@0 845 pRemove=p-1;
michael@0 846 UChar syllable=(UChar)
michael@0 847 (Hangul::HANGUL_BASE+
michael@0 848 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
michael@0 849 Hangul::JAMO_T_COUNT);
michael@0 850 UChar t;
michael@0 851 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
michael@0 852 ++p;
michael@0 853 syllable+=t; // The next character was a Jamo T.
michael@0 854 }
// Overwrite the Jamo L in place with the composed syllable.
michael@0 855 *starter=syllable;
michael@0 856 // remove the Jamo V/T
// Shift the text from p onward down to pRemove, shrink limit, and resume at pRemove.
michael@0 857 q=pRemove;
michael@0 858 r=p;
michael@0 859 while(r<limit) {
michael@0 860 *q++=*r++;
michael@0 861 }
michael@0 862 limit=q;
michael@0 863 p=pRemove;
michael@0 864 }
michael@0 865 }
michael@0 866 /*
michael@0 867 * No "else" for Jamo T:
michael@0 868 * Since the input is in NFD, there are no Hangul LV syllables that
michael@0 869 * a Jamo T could combine with.
michael@0 870 * All Jamo Ts are combined above when handling Jamo Vs.
michael@0 871 */
michael@0 872 if(p==limit) {
michael@0 873 break;
michael@0 874 }
// Clear the forward-combining indicator before reading the next character.
michael@0 875 compositionsList=NULL;
michael@0 876 continue;
michael@0 877 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
michael@0 878 // The starter and the combining mark (c) do combine.
// compositeAndFwd holds (composite<<1)|fwd; the low bit is tested further below.
michael@0 879 UChar32 composite=compositeAndFwd>>1;
michael@0 880
michael@0 881 // Replace the starter with the composite, remove the combining mark.
michael@0 882 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
michael@0 883 if(starterIsSupplementary) {
michael@0 884 if(U_IS_SUPPLEMENTARY(composite)) {
michael@0 885 // both are supplementary
michael@0 886 starter[0]=U16_LEAD(composite);
michael@0 887 starter[1]=U16_TRAIL(composite);
michael@0 888 } else {
michael@0 889 *starter=(UChar)composite;
michael@0 890 // The composite is shorter than the starter,
michael@0 891 // move the intermediate characters forward one.
michael@0 892 starterIsSupplementary=FALSE;
michael@0 893 q=starter+1;
michael@0 894 r=q+1;
michael@0 895 while(r<pRemove) {
michael@0 896 *q++=*r++;
michael@0 897 }
// The freed code unit now sits directly before the mark; include it in the span to delete.
michael@0 898 --pRemove;
michael@0 899 }
michael@0 900 } else if(U_IS_SUPPLEMENTARY(composite)) {
michael@0 901 // The composite is longer than the starter,
michael@0 902 // move the intermediate characters back one.
michael@0 903 starterIsSupplementary=TRUE;
michael@0 904 ++starter; // temporarily increment for the loop boundary
// The first code unit of the mark is reused for the shifted text, so the span to delete
// shrinks by one and may become empty (hence the pRemove<p test below).
michael@0 905 q=pRemove;
michael@0 906 r=++pRemove;
michael@0 907 while(starter<q) {
michael@0 908 *--r=*--q;
michael@0 909 }
michael@0 910 *starter=U16_TRAIL(composite);
michael@0 911 *--starter=U16_LEAD(composite); // undo the temporary increment
michael@0 912 } else {
michael@0 913 // both are on the BMP
michael@0 914 *starter=(UChar)composite;
michael@0 915 }
michael@0 916
michael@0 917 /* remove the combining mark by moving the following text over it */
michael@0 918 if(pRemove<p) {
michael@0 919 q=pRemove;
michael@0 920 r=p;
michael@0 921 while(r<limit) {
michael@0 922 *q++=*r++;
michael@0 923 }
michael@0 924 limit=q;
michael@0 925 p=pRemove;
michael@0 926 }
michael@0 927 // Keep prevCC because we removed the combining mark.
michael@0 928
michael@0 929 if(p==limit) {
michael@0 930 break;
michael@0 931 }
michael@0 932 // Is the composite a starter that combines forward?
michael@0 933 if(compositeAndFwd&1) {
michael@0 934 compositionsList=
michael@0 935 getCompositionsListForComposite(getNorm16(composite));
michael@0 936 } else {
michael@0 937 compositionsList=NULL;
michael@0 938 }
michael@0 939
michael@0 940 // We combined; continue with looking for compositions.
michael@0 941 continue;
michael@0 942 }
michael@0 943 }
michael@0 944
michael@0 945 // no combination this time
michael@0 946 prevCC=cc;
michael@0 947 if(p==limit) {
michael@0 948 break;
michael@0 949 }
michael@0 950
michael@0 951 // If c did not combine, then check if it is a starter.
michael@0 952 if(cc==0) {
michael@0 953 // Found a new starter.
michael@0 954 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
michael@0 955 // It may combine with something, prepare for it.
// p is the limit of c, so the starter begins one or two code units before it.
michael@0 956 if(U_IS_BMP(c)) {
michael@0 957 starterIsSupplementary=FALSE;
michael@0 958 starter=p-1;
michael@0 959 } else {
michael@0 960 starterIsSupplementary=TRUE;
michael@0 961 starter=p-2;
michael@0 962 }
michael@0 963 }
michael@0 964 } else if(onlyContiguous) {
michael@0 965 // FCC: no discontiguous compositions; any intervening character blocks.
michael@0 966 compositionsList=NULL;
michael@0 967 }
michael@0 968 }
// Publish the (possibly reduced) end of the recomposed text to the buffer.
michael@0 969 buffer.setReorderingLimit(limit);
michael@0 970 }
michael@0 971
michael@0 972 UChar32
michael@0 973 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
michael@0 974 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
michael@0 975 const uint16_t *list;
michael@0 976 if(isInert(norm16)) {
michael@0 977 return U_SENTINEL;
michael@0 978 } else if(norm16<minYesNoMappingsOnly) {
michael@0 979 if(isJamoL(norm16)) {
michael@0 980 b-=Hangul::JAMO_V_BASE;
michael@0 981 if(0<=b && b<Hangul::JAMO_V_COUNT) {
michael@0 982 return
michael@0 983 (Hangul::HANGUL_BASE+
michael@0 984 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
michael@0 985 Hangul::JAMO_T_COUNT);
michael@0 986 } else {
michael@0 987 return U_SENTINEL;
michael@0 988 }
michael@0 989 } else if(isHangul(norm16)) {
michael@0 990 b-=Hangul::JAMO_T_BASE;
michael@0 991 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
michael@0 992 return a+b;
michael@0 993 } else {
michael@0 994 return U_SENTINEL;
michael@0 995 }
michael@0 996 } else {
michael@0 997 // 'a' has a compositions list in extraData
michael@0 998 list=extraData+norm16;
michael@0 999 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
michael@0 1000 list+= // mapping pointer
michael@0 1001 1+ // +1 to skip the first unit with the mapping lenth
michael@0 1002 (*list&MAPPING_LENGTH_MASK); // + mapping length
michael@0 1003 }
michael@0 1004 }
michael@0 1005 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
michael@0 1006 return U_SENTINEL;
michael@0 1007 } else {
michael@0 1008 list=maybeYesCompositions+norm16-minMaybeYes;
michael@0 1009 }
michael@0 1010 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
michael@0 1011 return U_SENTINEL;
michael@0 1012 }
michael@0 1013 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
michael@0 1014 return combine(list, b)>>1;
michael@0 1015 #else
michael@0 1016 int32_t compositeAndFwd=combine(list, b);
michael@0 1017 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
michael@0 1018 #endif
michael@0 1019 }
michael@0 1020
michael@0 1021 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
michael@0 1022 // doCompose: normalize
michael@0 1023 // !doCompose: isNormalized (buffer must be empty and initialized)
// Parameters:
//   src, limit     - input text [src, limit[; limit==NULL means src is NUL-terminated
//                    (the limit is then located with u_strchr(src, 0)).
//   onlyContiguous - enables the extra "FCC" check marked below and is passed on to recompose().
//   doCompose      - TRUE: write the composed text to buffer;
//                    FALSE: only test, returning FALSE at the first evidence that the
//                    text is not normalized.
//   buffer         - receives the output when doCompose; when !doCompose it is still used as
//                    scratch space on the slow path and emptied again with remove().
//   errorCode      - passed through to the buffer/helper calls; tested with U_FAILURE() only
//                    after copyLowPrefixFromNulTerminated().
// Returns FALSE if copyLowPrefixFromNulTerminated() failed or a !doCompose check failed,
// otherwise TRUE. NOTE(review): the main loop also ends with TRUE when a buffer append or
// decomposeShort() returns false, so callers presumably must inspect errorCode too -- confirm.
michael@0 1024 UBool
michael@0 1025 Normalizer2Impl::compose(const UChar *src, const UChar *limit,
michael@0 1026 UBool onlyContiguous,
michael@0 1027 UBool doCompose,
michael@0 1028 ReorderingBuffer &buffer,
michael@0 1029 UErrorCode &errorCode) const {
michael@0 1030 /*
michael@0 1031 * prevBoundary points to the last character before the current one
michael@0 1032 * that has a composition boundary before it with ccc==0 and quick check "yes".
michael@0 1033 * Keeping track of prevBoundary saves us looking for a composition boundary
michael@0 1034 * when we find a "no" or "maybe".
michael@0 1035 *
michael@0 1036 * When we back out from prevSrc back to prevBoundary,
michael@0 1037 * then we also remove those same characters (which had been simply copied
michael@0 1038 * or canonically-order-inserted) from the ReorderingBuffer.
michael@0 1039 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
michael@0 1040 * must correspond 1:1 to destination units at the end of the destination buffer.
michael@0 1041 */
michael@0 1042 const UChar *prevBoundary=src;
michael@0 1043 UChar32 minNoMaybeCP=minCompNoMaybeCP;
michael@0 1044 if(limit==NULL) {
// NUL-terminated input: let the helper handle the leading run of code units below
// minNoMaybeCP (copying it to the buffer only when composing), then find the real limit.
michael@0 1045 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
michael@0 1046 doCompose ? &buffer : NULL,
michael@0 1047 errorCode);
michael@0 1048 if(U_FAILURE(errorCode)) {
michael@0 1049 return FALSE;
michael@0 1050 }
michael@0 1051 if(prevBoundary<src) {
michael@0 1052 // Set prevBoundary to the last character in the prefix.
michael@0 1053 prevBoundary=src-1;
michael@0 1054 }
michael@0 1055 limit=u_strchr(src, 0);
michael@0 1056 }
michael@0 1057
michael@0 1058 const UChar *prevSrc;
michael@0 1059 UChar32 c=0;
michael@0 1060 uint16_t norm16=0;
michael@0 1061
michael@0 1062 // only for isNormalized
michael@0 1063 uint8_t prevCC=0;
michael@0 1064
michael@0 1065 for(;;) {
michael@0 1066 // count code units below the minimum or with irrelevant data for the quick check
michael@0 1067 for(prevSrc=src; src!=limit;) {
michael@0 1068 if( (c=*src)<minNoMaybeCP ||
michael@0 1069 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
michael@0 1070 ) {
michael@0 1071 ++src;
michael@0 1072 } else if(!U16_IS_SURROGATE(c)) {
michael@0 1073 break;
michael@0 1074 } else {
// c is a surrogate code unit: assemble the supplementary code point if it is paired
// (stepping back over an already-skipped lead surrogate if necessary) and look it up again.
michael@0 1075 UChar c2;
michael@0 1076 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1077 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
michael@0 1078 c=U16_GET_SUPPLEMENTARY(c, c2);
michael@0 1079 }
michael@0 1080 } else /* trail surrogate */ {
michael@0 1081 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
michael@0 1082 --src;
michael@0 1083 c=U16_GET_SUPPLEMENTARY(c2, c);
michael@0 1084 }
michael@0 1085 }
michael@0 1086 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
michael@0 1087 src+=U16_LENGTH(c);
michael@0 1088 } else {
michael@0 1089 break;
michael@0 1090 }
michael@0 1091 }
michael@0 1092 }
michael@0 1093 // copy these code units all at once
michael@0 1094 if(src!=prevSrc) {
michael@0 1095 if(doCompose) {
michael@0 1096 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
michael@0 1097 break;
michael@0 1098 }
michael@0 1099 } else {
michael@0 1100 prevCC=0;
michael@0 1101 }
michael@0 1102 if(src==limit) {
michael@0 1103 break;
michael@0 1104 }
michael@0 1105 // Set prevBoundary to the last character in the quick check loop.
michael@0 1106 prevBoundary=src-1;
michael@0 1107 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
michael@0 1108 U16_IS_LEAD(*(prevBoundary-1))
michael@0 1109 ) {
michael@0 1110 --prevBoundary;
michael@0 1111 }
michael@0 1112 // The start of the current character (c).
michael@0 1113 prevSrc=src;
michael@0 1114 } else if(src==limit) {
michael@0 1115 break;
michael@0 1116 }
michael@0 1117
// Step over the current character c; prevSrc keeps pointing at its start.
michael@0 1118 src+=U16_LENGTH(c);
michael@0 1119 /*
michael@0 1120 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
michael@0 1121 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
michael@0 1122 * or has ccc!=0.
michael@0 1123 * Check for Jamo V/T, then for regular characters.
michael@0 1124 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
michael@0 1125 */
// The previous code unit is only examined when there is text between prevBoundary and prevSrc.
michael@0 1126 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
michael@0 1127 UChar prev=*(prevSrc-1);
michael@0 1128 UBool needToDecompose=FALSE;
michael@0 1129 if(c<Hangul::JAMO_T_BASE) {
michael@0 1130 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
michael@0 1131 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
michael@0 1132 if(prev<Hangul::JAMO_L_COUNT) {
michael@0 1133 if(!doCompose) {
michael@0 1134 return FALSE;
michael@0 1135 }
michael@0 1136 UChar syllable=(UChar)
michael@0 1137 (Hangul::HANGUL_BASE+
michael@0 1138 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
michael@0 1139 Hangul::JAMO_T_COUNT);
michael@0 1140 UChar t;
michael@0 1141 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
michael@0 1142 ++src;
michael@0 1143 syllable+=t; // The next character was a Jamo T.
michael@0 1144 prevBoundary=src;
// Presumably overwrites the Jamo L that was copied to the end of the buffer earlier -- confirm
// against ReorderingBuffer::setLastChar().
michael@0 1145 buffer.setLastChar(syllable);
michael@0 1146 continue;
michael@0 1147 }
michael@0 1148 // If we see L+V+x where x!=T then we drop to the slow path,
michael@0 1149 // decompose and recompose.
michael@0 1150 // This is to deal with NFKC finding normal L and V but a
michael@0 1151 // compatibility variant of a T. We need to either fully compose that
michael@0 1152 // combination here (which would complicate the code and may not work
michael@0 1153 // with strange custom data) or use the slow path -- or else our replacing
michael@0 1154 // two input characters (L+V) with one output character (LV syllable)
michael@0 1155 // would violate the invariant that [prevBoundary..prevSrc[ has the same
michael@0 1156 // length as what we appended to the buffer since prevBoundary.
michael@0 1157 needToDecompose=TRUE;
michael@0 1158 }
michael@0 1159 } else if(Hangul::isHangulWithoutJamoT(prev)) {
michael@0 1160 // c is a Jamo Trailing consonant,
michael@0 1161 // compose with previous Hangul LV that does not contain a Jamo T.
michael@0 1162 if(!doCompose) {
michael@0 1163 return FALSE;
michael@0 1164 }
michael@0 1165 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
michael@0 1166 prevBoundary=src;
michael@0 1167 continue;
michael@0 1168 }
michael@0 1169 if(!needToDecompose) {
michael@0 1170 // The Jamo V/T did not compose into a Hangul syllable.
michael@0 1171 if(doCompose) {
michael@0 1172 if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
michael@0 1173 break;
michael@0 1174 }
michael@0 1175 } else {
michael@0 1176 prevCC=0;
michael@0 1177 }
michael@0 1178 continue;
michael@0 1179 }
michael@0 1180 }
michael@0 1181 /*
michael@0 1182 * Source buffer pointers:
michael@0 1183 *
michael@0 1184 * all done quick check current char not yet
michael@0 1185 * "yes" but (c) processed
michael@0 1186 * may combine
michael@0 1187 * forward
michael@0 1188 * [-------------[-------------[-------------[-------------[
michael@0 1189 * | | | | |
michael@0 1190 * orig. src prevBoundary prevSrc src limit
michael@0 1191 *
michael@0 1192 *
michael@0 1193 * Destination buffer pointers inside the ReorderingBuffer:
michael@0 1194 *
michael@0 1195 * all done might take not filled yet
michael@0 1196 * characters for
michael@0 1197 * reordering
michael@0 1198 * [-------------[-------------[-------------[
michael@0 1199 * | | | |
michael@0 1200 * start reorderStart limit |
michael@0 1201 * +remainingCap.+
michael@0 1202 */
michael@0 1203 if(norm16>=MIN_YES_YES_WITH_CC) {
michael@0 1204 uint8_t cc=(uint8_t)norm16; // cc!=0
michael@0 1205 if( onlyContiguous && // FCC
michael@0 1206 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
michael@0 1207 prevBoundary<prevSrc &&
michael@0 1208 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
michael@0 1209 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
michael@0 1210 // passed the quick check "yes && ccc==0" test.
michael@0 1211 // Check whether the last character was a "yesYes" or a "yesNo".
michael@0 1212 // If a "yesNo", then we get its trailing ccc from its
michael@0 1213 // mapping and check for canonical order.
michael@0 1214 // All other cases are ok.
michael@0 1215 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
michael@0 1216 ) {
michael@0 1217 // Fails FCD test, need to decompose and contiguously recompose.
michael@0 1218 if(!doCompose) {
michael@0 1219 return FALSE;
michael@0 1220 }
michael@0 1221 } else if(doCompose) {
michael@0 1222 if(!buffer.append(c, cc, errorCode)) {
michael@0 1223 break;
michael@0 1224 }
michael@0 1225 continue;
michael@0 1226 } else if(prevCC<=cc) {
// isNormalized mode: combining classes are still in non-decreasing order.
michael@0 1227 prevCC=cc;
michael@0 1228 continue;
michael@0 1229 } else {
michael@0 1230 return FALSE;
michael@0 1231 }
michael@0 1232 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
// isNormalized mode: c is neither "maybe" nor has ccc!=0, which by the comment above
// leaves only a "noNo" character with a mapping, so the text is not normalized.
michael@0 1233 return FALSE;
michael@0 1234 }
michael@0 1235
michael@0 1236 /*
michael@0 1237 * Find appropriate boundaries around this character,
michael@0 1238 * decompose the source text from between the boundaries,
michael@0 1239 * and recompose it.
michael@0 1240 *
michael@0 1241 * We may need to remove the last few characters from the ReorderingBuffer
michael@0 1242 * to account for source text that was copied or appended
michael@0 1243 * but needs to take part in the recomposition.
michael@0 1244 */
michael@0 1245
michael@0 1246 /*
michael@0 1247 * Find the last composition boundary in [prevBoundary..src[.
michael@0 1248 * It is either the decomposition of the current character (at prevSrc),
michael@0 1249 * or prevBoundary.
michael@0 1250 */
michael@0 1251 if(hasCompBoundaryBefore(c, norm16)) {
michael@0 1252 prevBoundary=prevSrc;
michael@0 1253 } else if(doCompose) {
// Take back the output that corresponds 1:1 to the source units [prevBoundary..prevSrc[.
michael@0 1254 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
michael@0 1255 }
michael@0 1256
michael@0 1257 // Find the next composition boundary in [src..limit[ -
michael@0 1258 // modifies src to point to the next starter.
michael@0 1259 src=(UChar *)findNextCompBoundary(src, limit);
michael@0 1260
michael@0 1261 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
michael@0 1262 int32_t recomposeStartIndex=buffer.length();
michael@0 1263 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
michael@0 1264 break;
michael@0 1265 }
michael@0 1266 recompose(buffer, recomposeStartIndex, onlyContiguous);
michael@0 1267 if(!doCompose) {
// isNormalized mode: the recomposed text must equal the source range;
// afterwards the scratch buffer is emptied for the next slow-path round.
michael@0 1268 if(!buffer.equals(prevBoundary, src)) {
michael@0 1269 return FALSE;
michael@0 1270 }
michael@0 1271 buffer.remove();
michael@0 1272 prevCC=0;
michael@0 1273 }
michael@0 1274
michael@0 1275 // Move to the next starter. We never need to look back before this point again.
michael@0 1276 prevBoundary=src;
michael@0 1277 }
michael@0 1278 return TRUE;
michael@0 1279 }
michael@0 1280
michael@0 1281 // Very similar to compose(): Make the same changes in both places if relevant.
michael@0 1282 // pQCResult==NULL: spanQuickCheckYes
michael@0 1283 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
// Parameters:
//   src, limit     - input text [src, limit[; limit==NULL means src is NUL-terminated
//                    (the limit is then located with u_strchr(src, 0)).
//   onlyContiguous - enables the "FCC" check marked below.
//   pQCResult      - NULL: stop at the first character that is not a plain quick check "yes";
//                    non-NULL: a "maybe" character sets *pQCResult=UNORM_MAYBE and scanning
//                    continues, a failure sets *pQCResult=UNORM_NO and scanning stops.
// Returns the limit if the scan reached the end of the text, otherwise prevBoundary
// as of the point where scanning stopped.
michael@0 1284 const UChar *
michael@0 1285 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
michael@0 1286 UBool onlyContiguous,
michael@0 1287 UNormalizationCheckResult *pQCResult) const {
michael@0 1288 /*
michael@0 1289 * prevBoundary points to the last character before the current one
michael@0 1290 * that has a composition boundary before it with ccc==0 and quick check "yes".
michael@0 1291 */
michael@0 1292 const UChar *prevBoundary=src;
michael@0 1293 UChar32 minNoMaybeCP=minCompNoMaybeCP;
michael@0 1294 if(limit==NULL) {
// errorCode only satisfies the helper's signature; its value is not inspected here.
// No buffer is passed (NULL), so the helper is used only for the returned pointer.
michael@0 1295 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 1296 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
michael@0 1297 if(prevBoundary<src) {
michael@0 1298 // Set prevBoundary to the last character in the prefix.
michael@0 1299 prevBoundary=src-1;
michael@0 1300 }
michael@0 1301 limit=u_strchr(src, 0);
michael@0 1302 }
michael@0 1303
michael@0 1304 const UChar *prevSrc;
michael@0 1305 UChar32 c=0;
michael@0 1306 uint16_t norm16=0;
michael@0 1307 uint8_t prevCC=0;
michael@0 1308
michael@0 1309 for(;;) {
michael@0 1310 // count code units below the minimum or with irrelevant data for the quick check
michael@0 1311 for(prevSrc=src;;) {
michael@0 1312 if(src==limit) {
// Reached the end without finding a failing character: the function exits here.
michael@0 1313 return src;
michael@0 1314 }
michael@0 1315 if( (c=*src)<minNoMaybeCP ||
michael@0 1316 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
michael@0 1317 ) {
michael@0 1318 ++src;
michael@0 1319 } else if(!U16_IS_SURROGATE(c)) {
michael@0 1320 break;
michael@0 1321 } else {
// c is a surrogate code unit: assemble the supplementary code point if it is paired
// (stepping back over an already-skipped lead surrogate if necessary) and look it up again.
michael@0 1322 UChar c2;
michael@0 1323 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1324 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
michael@0 1325 c=U16_GET_SUPPLEMENTARY(c, c2);
michael@0 1326 }
michael@0 1327 } else /* trail surrogate */ {
michael@0 1328 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
michael@0 1329 --src;
michael@0 1330 c=U16_GET_SUPPLEMENTARY(c2, c);
michael@0 1331 }
michael@0 1332 }
michael@0 1333 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
michael@0 1334 src+=U16_LENGTH(c);
michael@0 1335 } else {
michael@0 1336 break;
michael@0 1337 }
michael@0 1338 }
michael@0 1339 }
michael@0 1340 if(src!=prevSrc) {
michael@0 1341 // Set prevBoundary to the last character in the quick check loop.
michael@0 1342 prevBoundary=src-1;
michael@0 1343 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
michael@0 1344 U16_IS_LEAD(*(prevBoundary-1))
michael@0 1345 ) {
michael@0 1346 --prevBoundary;
michael@0 1347 }
michael@0 1348 prevCC=0;
michael@0 1349 // The start of the current character (c).
michael@0 1350 prevSrc=src;
michael@0 1351 }
michael@0 1352
// Step over the current character c; prevSrc keeps pointing at its start.
michael@0 1353 src+=U16_LENGTH(c);
michael@0 1354 /*
michael@0 1355 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
michael@0 1356 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
michael@0 1357 * or has ccc!=0.
michael@0 1358 */
michael@0 1359 if(isMaybeOrNonZeroCC(norm16)) {
michael@0 1360 uint8_t cc=getCCFromYesOrMaybe(norm16);
michael@0 1361 if( onlyContiguous && // FCC
michael@0 1362 cc!=0 &&
michael@0 1363 prevCC==0 &&
michael@0 1364 prevBoundary<prevSrc &&
michael@0 1365 // prevCC==0 && prevBoundary<prevSrc tell us that
michael@0 1366 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
michael@0 1367 // passed the quick check "yes && ccc==0" test.
michael@0 1368 // Check whether the last character was a "yesYes" or a "yesNo".
michael@0 1369 // If a "yesNo", then we get its trailing ccc from its
michael@0 1370 // mapping and check for canonical order.
michael@0 1371 // All other cases are ok.
michael@0 1372 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
michael@0 1373 ) {
michael@0 1374 // Fails FCD test.
// Nothing to do in this branch: control falls through to the UNORM_NO handling below.
michael@0 1375 } else if(prevCC<=cc || cc==0) {
michael@0 1376 prevCC=cc;
michael@0 1377 if(norm16<MIN_YES_YES_WITH_CC) {
// Below MIN_YES_YES_WITH_CC this is treated as a "maybe": either record it and keep
// scanning (quickCheck) or stop at the last boundary (spanQuickCheckYes).
michael@0 1378 if(pQCResult!=NULL) {
michael@0 1379 *pQCResult=UNORM_MAYBE;
michael@0 1380 } else {
michael@0 1381 return prevBoundary;
michael@0 1382 }
michael@0 1383 }
michael@0 1384 continue;
michael@0 1385 }
michael@0 1386 }
// Reached for a character that is not maybe/ccc!=0, for a failed FCC check,
// and for combining classes out of order (prevCC>cc with cc!=0).
michael@0 1387 if(pQCResult!=NULL) {
michael@0 1388 *pQCResult=UNORM_NO;
michael@0 1389 }
michael@0 1390 return prevBoundary;
michael@0 1391 }
michael@0 1392 }
michael@0 1393
michael@0 1394 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
michael@0 1395 UBool doCompose,
michael@0 1396 UBool onlyContiguous,
michael@0 1397 UnicodeString &safeMiddle,
michael@0 1398 ReorderingBuffer &buffer,
michael@0 1399 UErrorCode &errorCode) const {
michael@0 1400 if(!buffer.isEmpty()) {
michael@0 1401 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
michael@0 1402 if(src!=firstStarterInSrc) {
michael@0 1403 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
michael@0 1404 buffer.getLimit());
michael@0 1405 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
michael@0 1406 UnicodeString middle(lastStarterInDest, destSuffixLength);
michael@0 1407 buffer.removeSuffix(destSuffixLength);
michael@0 1408 safeMiddle=middle;
michael@0 1409 middle.append(src, (int32_t)(firstStarterInSrc-src));
michael@0 1410 const UChar *middleStart=middle.getBuffer();
michael@0 1411 compose(middleStart, middleStart+middle.length(), onlyContiguous,
michael@0 1412 TRUE, buffer, errorCode);
michael@0 1413 if(U_FAILURE(errorCode)) {
michael@0 1414 return;
michael@0 1415 }
michael@0 1416 src=firstStarterInSrc;
michael@0 1417 }
michael@0 1418 }
michael@0 1419 if(doCompose) {
michael@0 1420 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
michael@0 1421 } else {
michael@0 1422 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
michael@0 1423 limit=u_strchr(src, 0);
michael@0 1424 }
michael@0 1425 buffer.appendZeroCC(src, limit, errorCode);
michael@0 1426 }
michael@0 1427 }
michael@0 1428
michael@0 1429 /**
michael@0 1430 * Does c have a composition boundary before it?
michael@0 1431 * True if its decomposition begins with a character that has
michael@0 1432 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
michael@0 1433 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
michael@0 1434 * (isCompYesAndZeroCC()) so we need not decompose.
michael@0 1435 */
michael@0 1436 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
michael@0 1437 for(;;) {
michael@0 1438 if(isCompYesAndZeroCC(norm16)) {
michael@0 1439 return TRUE;
michael@0 1440 } else if(isMaybeOrNonZeroCC(norm16)) {
michael@0 1441 return FALSE;
michael@0 1442 } else if(isDecompNoAlgorithmic(norm16)) {
michael@0 1443 c=mapAlgorithmic(c, norm16);
michael@0 1444 norm16=getNorm16(c);
michael@0 1445 } else {
michael@0 1446 // c decomposes, get everything from the variable-length extra data
michael@0 1447 const uint16_t *mapping=getMapping(norm16);
michael@0 1448 uint16_t firstUnit=*mapping;
michael@0 1449 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
michael@0 1450 return FALSE;
michael@0 1451 }
michael@0 1452 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
michael@0 1453 return FALSE; // non-zero leadCC
michael@0 1454 }
michael@0 1455 int32_t i=1; // skip over the firstUnit
michael@0 1456 UChar32 c;
michael@0 1457 U16_NEXT_UNSAFE(mapping, i, c);
michael@0 1458 return isCompYesAndZeroCC(getNorm16(c));
michael@0 1459 }
michael@0 1460 }
michael@0 1461 }
michael@0 1462
michael@0 1463 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
michael@0 1464 for(;;) {
michael@0 1465 uint16_t norm16=getNorm16(c);
michael@0 1466 if(isInert(norm16)) {
michael@0 1467 return TRUE;
michael@0 1468 } else if(norm16<=minYesNo) {
michael@0 1469 // Hangul: norm16==minYesNo
michael@0 1470 // Hangul LVT has a boundary after it.
michael@0 1471 // Hangul LV and non-inert yesYes characters combine forward.
michael@0 1472 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
michael@0 1473 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
michael@0 1474 return FALSE;
michael@0 1475 } else if(isDecompNoAlgorithmic(norm16)) {
michael@0 1476 c=mapAlgorithmic(c, norm16);
michael@0 1477 } else {
michael@0 1478 // c decomposes, get everything from the variable-length extra data.
michael@0 1479 // If testInert, then c must be a yesNo character which has lccc=0,
michael@0 1480 // otherwise it could be a noNo.
michael@0 1481 const uint16_t *mapping=getMapping(norm16);
michael@0 1482 uint16_t firstUnit=*mapping;
michael@0 1483 // TRUE if
michael@0 1484 // not MAPPING_NO_COMP_BOUNDARY_AFTER
michael@0 1485 // (which is set if
michael@0 1486 // c is not deleted, and
michael@0 1487 // it and its decomposition do not combine forward, and it has a starter)
michael@0 1488 // and if FCC then trailCC<=1
michael@0 1489 return
michael@0 1490 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
michael@0 1491 (!onlyContiguous || firstUnit<=0x1ff);
michael@0 1492 }
michael@0 1493 }
michael@0 1494 }
michael@0 1495
michael@0 1496 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
michael@0 1497 BackwardUTrie2StringIterator iter(normTrie, start, p);
michael@0 1498 uint16_t norm16;
michael@0 1499 do {
michael@0 1500 norm16=iter.previous16();
michael@0 1501 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
michael@0 1502 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
michael@0 1503 // but that's probably not worth the extra cost.
michael@0 1504 return iter.codePointStart;
michael@0 1505 }
michael@0 1506
michael@0 1507 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
michael@0 1508 ForwardUTrie2StringIterator iter(normTrie, p, limit);
michael@0 1509 uint16_t norm16;
michael@0 1510 do {
michael@0 1511 norm16=iter.next16();
michael@0 1512 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
michael@0 1513 return iter.codePointStart;
michael@0 1514 }
michael@0 1515
michael@0 1516 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
michael@0 1517 // still had getFCDTrie() which built and cached an FCD trie.
michael@0 1518 // That provided faster access to FCD data than getFCD16FromNormData()
michael@0 1519 // but required synchronization and consumed some 10kB of heap memory
michael@0 1520 // in any process that uses FCD (e.g., via collation).
michael@0 1521 // tccc180[] and smallFCD[] are intended to help with any loss of performance,
michael@0 1522 // at least for Latin & CJK.
michael@0 1523
michael@0 1524 // Gets the FCD value from the regular normalization data.
michael@0 1525 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
michael@0 1526 // Only loops for 1:1 algorithmic mappings.
michael@0 1527 for(;;) {
michael@0 1528 uint16_t norm16=getNorm16(c);
michael@0 1529 if(norm16<=minYesNo) {
michael@0 1530 // no decomposition or Hangul syllable, all zeros
michael@0 1531 return 0;
michael@0 1532 } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
michael@0 1533 // combining mark
michael@0 1534 norm16&=0xff;
michael@0 1535 return norm16|(norm16<<8);
michael@0 1536 } else if(norm16>=minMaybeYes) {
michael@0 1537 return 0;
michael@0 1538 } else if(isDecompNoAlgorithmic(norm16)) {
michael@0 1539 c=mapAlgorithmic(c, norm16);
michael@0 1540 } else {
michael@0 1541 // c decomposes, get everything from the variable-length extra data
michael@0 1542 const uint16_t *mapping=getMapping(norm16);
michael@0 1543 uint16_t firstUnit=*mapping;
michael@0 1544 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
michael@0 1545 // A character that is deleted (maps to an empty string) must
michael@0 1546 // get the worst-case lccc and tccc values because arbitrary
michael@0 1547 // characters on both sides will become adjacent.
michael@0 1548 return 0x1ff;
michael@0 1549 } else {
michael@0 1550 norm16=firstUnit>>8; // tccc
michael@0 1551 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
michael@0 1552 norm16|=*(mapping-1)&0xff00; // lccc
michael@0 1553 }
michael@0 1554 return norm16;
michael@0 1555 }
michael@0 1556 }
michael@0 1557 }
michael@0 1558 }
michael@0 1559
// Scans the text at src for FCD violations (a previous character's tccc that
// is greater than the current character's lccc).
//   src       start of the input
//   limit     end of the input, or NULL if the input is NUL-terminated
//   buffer    destination for the FCD form, or NULL to only check the input
//   errorCode set by the helpers called here; on failure the loop stops early
// Returns:
//   - buffer==NULL and a violation was found: the last FCD-safe boundary
//     before the violation (prevBoundary).
//   - otherwise: src, which is the limit when the whole input was processed,
//     or the position reached when a helper reported failure.
michael@0 1560 // Dual functionality:
michael@0 1561 // buffer!=NULL: normalize
michael@0 1562 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
michael@0 1563 const UChar *
michael@0 1564 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
michael@0 1565 ReorderingBuffer *buffer,
michael@0 1566 UErrorCode &errorCode) const {
michael@0 1567 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
michael@0 1568 // Similar to the prevBoundary in the compose() implementation.
michael@0 1569 const UChar *prevBoundary=src;
// prevFCD16 holds the FCD value of the previous character. A negative value
// is ~c for a code unit below MIN_CCC_LCCC_CP whose lookup has been deferred.
michael@0 1570 int32_t prevFCD16=0;
// NUL-terminated input: let copyLowPrefixFromNulTerminated() handle the leading
// run of code units below MIN_CCC_LCCC_CP, then find the real limit.
michael@0 1571 if(limit==NULL) {
michael@0 1572 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
michael@0 1573 if(U_FAILURE(errorCode)) {
michael@0 1574 return src;
michael@0 1575 }
michael@0 1576 if(prevBoundary<src) {
michael@0 1577 prevBoundary=src;
michael@0 1578 // We know that the previous character's lccc==0.
michael@0 1579 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
michael@0 1580 prevFCD16=getFCD16(*(src-1));
michael@0 1581 if(prevFCD16>1) {
michael@0 1582 --prevBoundary;
michael@0 1583 }
michael@0 1584 }
michael@0 1585 limit=u_strchr(src, 0);
michael@0 1586 }
michael@0 1587
michael@0 1588 // Note: In this function we use buffer->appendZeroCC() because we track
michael@0 1589 // the lead and trail combining classes here, rather than leaving it to
michael@0 1590 // the ReorderingBuffer.
michael@0 1591 // The exception is the call to decomposeShort() which uses the buffer
michael@0 1592 // in the normal way.
michael@0 1593
michael@0 1594 const UChar *prevSrc;
michael@0 1595 UChar32 c=0;
michael@0 1596 uint16_t fcd16=0;
michael@0 1597
michael@0 1598 for(;;) {
michael@0 1599 // count code units with lccc==0
michael@0 1600 for(prevSrc=src; src!=limit;) {
michael@0 1601 if((c=*src)<MIN_CCC_LCCC_CP) {
// Remember the code unit as a negative value; its FCD value is only
// looked up later if it turns out to be needed (prevFCD16<0 branch below).
michael@0 1602 prevFCD16=~c;
michael@0 1603 ++src;
michael@0 1604 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
michael@0 1605 prevFCD16=0;
michael@0 1606 ++src;
michael@0 1607 } else {
// Assemble a supplementary code point from a surrogate pair when possible.
// For a trail surrogate preceded by its lead, back up src to the lead.
michael@0 1608 if(U16_IS_SURROGATE(c)) {
michael@0 1609 UChar c2;
michael@0 1610 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1611 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
michael@0 1612 c=U16_GET_SUPPLEMENTARY(c, c2);
michael@0 1613 }
michael@0 1614 } else /* trail surrogate */ {
michael@0 1615 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
michael@0 1616 --src;
michael@0 1617 c=U16_GET_SUPPLEMENTARY(c2, c);
michael@0 1618 }
michael@0 1619 }
michael@0 1620 }
// fcd16<=0xff: the upper byte (lccc) is zero, so keep scanning.
// Otherwise leave the fast loop with c and fcd16 set for the check below.
michael@0 1621 if((fcd16=getFCD16FromNormData(c))<=0xff) {
michael@0 1622 prevFCD16=fcd16;
michael@0 1623 src+=U16_LENGTH(c);
michael@0 1624 } else {
michael@0 1625 break;
michael@0 1626 }
michael@0 1627 }
michael@0 1628 }
michael@0 1629 // copy these code units all at once
michael@0 1630 if(src!=prevSrc) {
// appendZeroCC() returning false ends processing; src is returned as is.
michael@0 1631 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
michael@0 1632 break;
michael@0 1633 }
michael@0 1634 if(src==limit) {
michael@0 1635 break;
michael@0 1636 }
michael@0 1637 prevBoundary=src;
michael@0 1638 // We know that the previous character's lccc==0.
michael@0 1639 if(prevFCD16<0) {
michael@0 1640 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
michael@0 1641 UChar32 prev=~prevFCD16;
michael@0 1642 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
michael@0 1643 if(prevFCD16>1) {
michael@0 1644 --prevBoundary;
michael@0 1645 }
michael@0 1646 } else {
michael@0 1647 const UChar *p=src-1;
michael@0 1648 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
michael@0 1649 --p;
michael@0 1650 // Need to fetch the previous character's FCD value because
michael@0 1651 // prevFCD16 was just for the trail surrogate code point.
michael@0 1652 prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
michael@0 1653 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
michael@0 1654 }
michael@0 1655 if(prevFCD16>1) {
michael@0 1656 prevBoundary=p;
michael@0 1657 }
michael@0 1658 }
michael@0 1659 // The start of the current character (c).
michael@0 1660 prevSrc=src;
michael@0 1661 } else if(src==limit) {
michael@0 1662 break;
michael@0 1663 }
michael@0 1664
// Step over the current character; [prevSrc..src[ now covers exactly c.
michael@0 1665 src+=U16_LENGTH(c);
michael@0 1666 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
michael@0 1667 // Check for proper order, and decompose locally if necessary.
michael@0 1668 if((prevFCD16&0xff)<=(fcd16>>8)) {
michael@0 1669 // proper order: prev tccc <= current lccc
michael@0 1670 if((fcd16&0xff)<=1) {
michael@0 1671 prevBoundary=src;
michael@0 1672 }
michael@0 1673 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
michael@0 1674 break;
michael@0 1675 }
michael@0 1676 prevFCD16=fcd16;
michael@0 1677 continue;
michael@0 1678 } else if(buffer==NULL) {
michael@0 1679 return prevBoundary; // quick check "no"
michael@0 1680 } else {
michael@0 1681 /*
michael@0 1682 * Back out the part of the source that we copied or appended
michael@0 1683 * already but is now going to be decomposed.
michael@0 1684 * prevSrc is set to after what was copied/appended.
michael@0 1685 */
michael@0 1686 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
michael@0 1687 /*
michael@0 1688 * Find the part of the source that needs to be decomposed,
michael@0 1689 * up to the next safe boundary.
michael@0 1690 */
michael@0 1691 src=findNextFCDBoundary(src, limit);
michael@0 1692 /*
michael@0 1693 * The source text does not fulfill the conditions for FCD.
michael@0 1694 * Decompose and reorder a limited piece of the text.
michael@0 1695 */
michael@0 1696 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
michael@0 1697 break;
michael@0 1698 }
// The decomposed piece ends at a boundary, so restart tracking from there.
michael@0 1699 prevBoundary=src;
michael@0 1700 prevFCD16=0;
michael@0 1701 }
michael@0 1702 }
michael@0 1703 return src;
michael@0 1704 }
michael@0 1705
michael@0 1706 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
michael@0 1707 UBool doMakeFCD,
michael@0 1708 UnicodeString &safeMiddle,
michael@0 1709 ReorderingBuffer &buffer,
michael@0 1710 UErrorCode &errorCode) const {
michael@0 1711 if(!buffer.isEmpty()) {
michael@0 1712 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
michael@0 1713 if(src!=firstBoundaryInSrc) {
michael@0 1714 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
michael@0 1715 buffer.getLimit());
michael@0 1716 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
michael@0 1717 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
michael@0 1718 buffer.removeSuffix(destSuffixLength);
michael@0 1719 safeMiddle=middle;
michael@0 1720 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
michael@0 1721 const UChar *middleStart=middle.getBuffer();
michael@0 1722 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
michael@0 1723 if(U_FAILURE(errorCode)) {
michael@0 1724 return;
michael@0 1725 }
michael@0 1726 src=firstBoundaryInSrc;
michael@0 1727 }
michael@0 1728 }
michael@0 1729 if(doMakeFCD) {
michael@0 1730 makeFCD(src, limit, &buffer, errorCode);
michael@0 1731 } else {
michael@0 1732 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
michael@0 1733 limit=u_strchr(src, 0);
michael@0 1734 }
michael@0 1735 buffer.appendZeroCC(src, limit, errorCode);
michael@0 1736 }
michael@0 1737 }
michael@0 1738
michael@0 1739 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
michael@0 1740 while(start<p && previousFCD16(start, p)>0xff) {}
michael@0 1741 return p;
michael@0 1742 }
michael@0 1743
michael@0 1744 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
michael@0 1745 while(p<limit) {
michael@0 1746 const UChar *codePointStart=p;
michael@0 1747 if(nextFCD16(p, limit)<=0xff) {
michael@0 1748 return codePointStart;
michael@0 1749 }
michael@0 1750 }
michael@0 1751 return p;
michael@0 1752 }
michael@0 1753
michael@0 1754 // CanonicalIterator data -------------------------------------------------- ***
michael@0 1755
michael@0 1756 CanonIterData::CanonIterData(UErrorCode &errorCode) :
michael@0 1757 trie(utrie2_open(0, 0, &errorCode)),
michael@0 1758 canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
michael@0 1759
michael@0 1760 CanonIterData::~CanonIterData() {
michael@0 1761 utrie2_close(trie);
michael@0 1762 }
michael@0 1763
michael@0 1764 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
michael@0 1765 uint32_t canonValue=utrie2_get32(trie, decompLead);
michael@0 1766 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
michael@0 1767 // origin is the first character whose decomposition starts with
michael@0 1768 // the character for which we are setting the value.
michael@0 1769 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
michael@0 1770 } else {
michael@0 1771 // origin is not the first character, or it is U+0000.
michael@0 1772 UnicodeSet *set;
michael@0 1773 if((canonValue&CANON_HAS_SET)==0) {
michael@0 1774 set=new UnicodeSet;
michael@0 1775 if(set==NULL) {
michael@0 1776 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 1777 return;
michael@0 1778 }
michael@0 1779 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
michael@0 1780 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
michael@0 1781 utrie2_set32(trie, decompLead, canonValue, &errorCode);
michael@0 1782 canonStartSets.addElement(set, errorCode);
michael@0 1783 if(firstOrigin!=0) {
michael@0 1784 set->add(firstOrigin);
michael@0 1785 }
michael@0 1786 } else {
michael@0 1787 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
michael@0 1788 }
michael@0 1789 set->add(origin);
michael@0 1790 }
michael@0 1791 }
michael@0 1792
michael@0 1793 U_CDECL_BEGIN
michael@0 1794
michael@0 1795 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
michael@0 1796 // context: the Normalizer2Impl
michael@0 1797 static UBool U_CALLCONV
michael@0 1798 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
michael@0 1799 UErrorCode errorCode = U_ZERO_ERROR;
michael@0 1800 if (value != 0) {
michael@0 1801 Normalizer2Impl *impl = (Normalizer2Impl *)context;
michael@0 1802 impl->makeCanonIterDataFromNorm16(
michael@0 1803 start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);
michael@0 1804 }
michael@0 1805 return U_SUCCESS(errorCode);
michael@0 1806 }
michael@0 1807
michael@0 1808
michael@0 1809
michael@0 1810 // UInitOnce instantiation function for CanonIterData
michael@0 1811
michael@0 1812 static void U_CALLCONV
michael@0 1813 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
michael@0 1814 U_ASSERT(impl->fCanonIterData == NULL);
michael@0 1815 impl->fCanonIterData = new CanonIterData(errorCode);
michael@0 1816 if (impl->fCanonIterData == NULL) {
michael@0 1817 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 1818 }
michael@0 1819 if (U_SUCCESS(errorCode)) {
michael@0 1820 utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);
michael@0 1821 utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
michael@0 1822 }
michael@0 1823 if (U_FAILURE(errorCode)) {
michael@0 1824 delete impl->fCanonIterData;
michael@0 1825 impl->fCanonIterData = NULL;
michael@0 1826 }
michael@0 1827 }
michael@0 1828
michael@0 1829 U_CDECL_END
michael@0 1830
michael@0 1831 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
michael@0 1832 CanonIterData &newData,
michael@0 1833 UErrorCode &errorCode) const {
michael@0 1834 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
michael@0 1835 // Inert, or 2-way mapping (including Hangul syllable).
michael@0 1836 // We do not write a canonStartSet for any yesNo character.
michael@0 1837 // Composites from 2-way mappings are added at runtime from the
michael@0 1838 // starter's compositions list, and the other characters in
michael@0 1839 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
michael@0 1840 // "maybe" characters.
michael@0 1841 return;
michael@0 1842 }
michael@0 1843 for(UChar32 c=start; c<=end; ++c) {
michael@0 1844 uint32_t oldValue=utrie2_get32(newData.trie, c);
michael@0 1845 uint32_t newValue=oldValue;
michael@0 1846 if(norm16>=minMaybeYes) {
michael@0 1847 // not a segment starter if it occurs in a decomposition or has cc!=0
michael@0 1848 newValue|=CANON_NOT_SEGMENT_STARTER;
michael@0 1849 if(norm16<MIN_NORMAL_MAYBE_YES) {
michael@0 1850 newValue|=CANON_HAS_COMPOSITIONS;
michael@0 1851 }
michael@0 1852 } else if(norm16<minYesNo) {
michael@0 1853 newValue|=CANON_HAS_COMPOSITIONS;
michael@0 1854 } else {
michael@0 1855 // c has a one-way decomposition
michael@0 1856 UChar32 c2=c;
michael@0 1857 uint16_t norm16_2=norm16;
michael@0 1858 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
michael@0 1859 c2=mapAlgorithmic(c2, norm16_2);
michael@0 1860 norm16_2=getNorm16(c2);
michael@0 1861 }
michael@0 1862 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
michael@0 1863 // c decomposes, get everything from the variable-length extra data
michael@0 1864 const uint16_t *mapping=getMapping(norm16_2);
michael@0 1865 uint16_t firstUnit=*mapping;
michael@0 1866 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
michael@0 1867 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
michael@0 1868 if(c==c2 && (*(mapping-1)&0xff)!=0) {
michael@0 1869 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
michael@0 1870 }
michael@0 1871 }
michael@0 1872 // Skip empty mappings (no characters in the decomposition).
michael@0 1873 if(length!=0) {
michael@0 1874 ++mapping; // skip over the firstUnit
michael@0 1875 // add c to first code point's start set
michael@0 1876 int32_t i=0;
michael@0 1877 U16_NEXT_UNSAFE(mapping, i, c2);
michael@0 1878 newData.addToStartSet(c, c2, errorCode);
michael@0 1879 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
michael@0 1880 // one-way mapping. A 2-way mapping is possible here after
michael@0 1881 // intermediate algorithmic mapping.
michael@0 1882 if(norm16_2>=minNoNo) {
michael@0 1883 while(i<length) {
michael@0 1884 U16_NEXT_UNSAFE(mapping, i, c2);
michael@0 1885 uint32_t c2Value=utrie2_get32(newData.trie, c2);
michael@0 1886 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
michael@0 1887 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
michael@0 1888 &errorCode);
michael@0 1889 }
michael@0 1890 }
michael@0 1891 }
michael@0 1892 }
michael@0 1893 } else {
michael@0 1894 // c decomposed to c2 algorithmically; c has cc==0
michael@0 1895 newData.addToStartSet(c, c2, errorCode);
michael@0 1896 }
michael@0 1897 }
michael@0 1898 if(newValue!=oldValue) {
michael@0 1899 utrie2_set32(newData.trie, c, newValue, &errorCode);
michael@0 1900 }
michael@0 1901 }
michael@0 1902 }
michael@0 1903
michael@0 1904 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
michael@0 1905 // Logically const: Synchronized instantiation.
michael@0 1906 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
michael@0 1907 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);
michael@0 1908 return U_SUCCESS(errorCode);
michael@0 1909 }
michael@0 1910
michael@0 1911 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
michael@0 1912 return (int32_t)utrie2_get32(fCanonIterData->trie, c);
michael@0 1913 }
michael@0 1914
michael@0 1915 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
michael@0 1916 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];
michael@0 1917 }
michael@0 1918
michael@0 1919 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
michael@0 1920 return getCanonValue(c)>=0;
michael@0 1921 }
michael@0 1922
michael@0 1923 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
michael@0 1924 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
michael@0 1925 if(canonValue==0) {
michael@0 1926 return FALSE;
michael@0 1927 }
michael@0 1928 set.clear();
michael@0 1929 int32_t value=canonValue&CANON_VALUE_MASK;
michael@0 1930 if((canonValue&CANON_HAS_SET)!=0) {
michael@0 1931 set.addAll(getCanonStartSet(value));
michael@0 1932 } else if(value!=0) {
michael@0 1933 set.add(value);
michael@0 1934 }
michael@0 1935 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
michael@0 1936 uint16_t norm16=getNorm16(c);
michael@0 1937 if(norm16==JAMO_L) {
michael@0 1938 UChar32 syllable=
michael@0 1939 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
michael@0 1940 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
michael@0 1941 } else {
michael@0 1942 addComposites(getCompositionsList(norm16), set);
michael@0 1943 }
michael@0 1944 }
michael@0 1945 return TRUE;
michael@0 1946 }
michael@0 1947
michael@0 1948 U_NAMESPACE_END
michael@0 1949
michael@0 1950 // Normalizer2 data swapping ----------------------------------------------- ***
michael@0 1951
michael@0 1952 U_NAMESPACE_USE
michael@0 1953
michael@0 1954 U_CAPI int32_t U_EXPORT2
michael@0 1955 unorm2_swap(const UDataSwapper *ds,
michael@0 1956 const void *inData, int32_t length, void *outData,
michael@0 1957 UErrorCode *pErrorCode) {
michael@0 1958 const UDataInfo *pInfo;
michael@0 1959 int32_t headerSize;
michael@0 1960
michael@0 1961 const uint8_t *inBytes;
michael@0 1962 uint8_t *outBytes;
michael@0 1963
michael@0 1964 const int32_t *inIndexes;
michael@0 1965 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
michael@0 1966
michael@0 1967 int32_t i, offset, nextOffset, size;
michael@0 1968
michael@0 1969 /* udata_swapDataHeader checks the arguments */
michael@0 1970 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
michael@0 1971 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 1972 return 0;
michael@0 1973 }
michael@0 1974
michael@0 1975 /* check data format and format version */
michael@0 1976 pInfo=(const UDataInfo *)((const char *)inData+4);
michael@0 1977 if(!(
michael@0 1978 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
michael@0 1979 pInfo->dataFormat[1]==0x72 &&
michael@0 1980 pInfo->dataFormat[2]==0x6d &&
michael@0 1981 pInfo->dataFormat[3]==0x32 &&
michael@0 1982 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
michael@0 1983 )) {
michael@0 1984 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
michael@0 1985 pInfo->dataFormat[0], pInfo->dataFormat[1],
michael@0 1986 pInfo->dataFormat[2], pInfo->dataFormat[3],
michael@0 1987 pInfo->formatVersion[0]);
michael@0 1988 *pErrorCode=U_UNSUPPORTED_ERROR;
michael@0 1989 return 0;
michael@0 1990 }
michael@0 1991
michael@0 1992 inBytes=(const uint8_t *)inData+headerSize;
michael@0 1993 outBytes=(uint8_t *)outData+headerSize;
michael@0 1994
michael@0 1995 inIndexes=(const int32_t *)inBytes;
michael@0 1996
michael@0 1997 if(length>=0) {
michael@0 1998 length-=headerSize;
michael@0 1999 if(length<(int32_t)sizeof(indexes)) {
michael@0 2000 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
michael@0 2001 length);
michael@0 2002 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 2003 return 0;
michael@0 2004 }
michael@0 2005 }
michael@0 2006
michael@0 2007 /* read the first few indexes */
michael@0 2008 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
michael@0 2009 indexes[i]=udata_readInt32(ds, inIndexes[i]);
michael@0 2010 }
michael@0 2011
michael@0 2012 /* get the total length of the data */
michael@0 2013 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
michael@0 2014
michael@0 2015 if(length>=0) {
michael@0 2016 if(length<size) {
michael@0 2017 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
michael@0 2018 length);
michael@0 2019 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 2020 return 0;
michael@0 2021 }
michael@0 2022
michael@0 2023 /* copy the data for inaccessible bytes */
michael@0 2024 if(inBytes!=outBytes) {
michael@0 2025 uprv_memcpy(outBytes, inBytes, size);
michael@0 2026 }
michael@0 2027
michael@0 2028 offset=0;
michael@0 2029
michael@0 2030 /* swap the int32_t indexes[] */
michael@0 2031 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
michael@0 2032 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
michael@0 2033 offset=nextOffset;
michael@0 2034
michael@0 2035 /* swap the UTrie2 */
michael@0 2036 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
michael@0 2037 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
michael@0 2038 offset=nextOffset;
michael@0 2039
michael@0 2040 /* swap the uint16_t extraData[] */
michael@0 2041 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
michael@0 2042 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
michael@0 2043 offset=nextOffset;
michael@0 2044
michael@0 2045 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
michael@0 2046 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
michael@0 2047 offset=nextOffset;
michael@0 2048
michael@0 2049 U_ASSERT(offset==size);
michael@0 2050 }
michael@0 2051
michael@0 2052 return headerSize+size;
michael@0 2053 }
michael@0 2054
michael@0 2055 #endif // !UCONFIG_NO_NORMALIZATION

mercurial