|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2009-2013, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: normalizer2impl.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2009nov22 |
|
14 * created by: Markus W. Scherer |
|
15 */ |
|
16 |
|
17 #include "unicode/utypes.h" |
|
18 |
|
19 #if !UCONFIG_NO_NORMALIZATION |
|
20 |
|
21 #include "unicode/normalizer2.h" |
|
22 #include "unicode/udata.h" |
|
23 #include "unicode/ustring.h" |
|
24 #include "unicode/utf16.h" |
|
25 #include "cmemory.h" |
|
26 #include "mutex.h" |
|
27 #include "normalizer2impl.h" |
|
28 #include "putilimp.h" |
|
29 #include "uassert.h" |
|
30 #include "uset_imp.h" |
|
31 #include "utrie2.h" |
|
32 #include "uvector.h" |
|
33 |
|
34 U_NAMESPACE_BEGIN |
|
35 |
|
36 // ReorderingBuffer -------------------------------------------------------- *** |
|
37 |
|
38 UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { |
|
39 int32_t length=str.length(); |
|
40 start=str.getBuffer(destCapacity); |
|
41 if(start==NULL) { |
|
42 // getBuffer() already did str.setToBogus() |
|
43 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
44 return FALSE; |
|
45 } |
|
46 limit=start+length; |
|
47 remainingCapacity=str.getCapacity()-length; |
|
48 reorderStart=start; |
|
49 if(start==limit) { |
|
50 lastCC=0; |
|
51 } else { |
|
52 setIterator(); |
|
53 lastCC=previousCC(); |
|
54 // Set reorderStart after the last code point with cc<=1 if there is one. |
|
55 if(lastCC>1) { |
|
56 while(previousCC()>1) {} |
|
57 } |
|
58 reorderStart=codePointLimit; |
|
59 } |
|
60 return TRUE; |
|
61 } |
|
62 |
|
63 UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { |
|
64 int32_t length=(int32_t)(limit-start); |
|
65 return |
|
66 length==(int32_t)(otherLimit-otherStart) && |
|
67 0==u_memcmp(start, otherStart, length); |
|
68 } |
|
69 |
|
70 UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { |
|
71 if(remainingCapacity<2 && !resize(2, errorCode)) { |
|
72 return FALSE; |
|
73 } |
|
74 if(lastCC<=cc || cc==0) { |
|
75 limit[0]=U16_LEAD(c); |
|
76 limit[1]=U16_TRAIL(c); |
|
77 limit+=2; |
|
78 lastCC=cc; |
|
79 if(cc<=1) { |
|
80 reorderStart=limit; |
|
81 } |
|
82 } else { |
|
83 insert(c, cc); |
|
84 } |
|
85 remainingCapacity-=2; |
|
86 return TRUE; |
|
87 } |
|
88 |
|
89 UBool ReorderingBuffer::append(const UChar *s, int32_t length, |
|
90 uint8_t leadCC, uint8_t trailCC, |
|
91 UErrorCode &errorCode) { |
|
92 if(length==0) { |
|
93 return TRUE; |
|
94 } |
|
95 if(remainingCapacity<length && !resize(length, errorCode)) { |
|
96 return FALSE; |
|
97 } |
|
98 remainingCapacity-=length; |
|
99 if(lastCC<=leadCC || leadCC==0) { |
|
100 if(trailCC<=1) { |
|
101 reorderStart=limit+length; |
|
102 } else if(leadCC<=1) { |
|
103 reorderStart=limit+1; // Ok if not a code point boundary. |
|
104 } |
|
105 const UChar *sLimit=s+length; |
|
106 do { *limit++=*s++; } while(s!=sLimit); |
|
107 lastCC=trailCC; |
|
108 } else { |
|
109 int32_t i=0; |
|
110 UChar32 c; |
|
111 U16_NEXT(s, i, length, c); |
|
112 insert(c, leadCC); // insert first code point |
|
113 while(i<length) { |
|
114 U16_NEXT(s, i, length, c); |
|
115 if(i<length) { |
|
116 // s must be in NFD, otherwise we need to use getCC(). |
|
117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); |
|
118 } else { |
|
119 leadCC=trailCC; |
|
120 } |
|
121 append(c, leadCC, errorCode); |
|
122 } |
|
123 } |
|
124 return TRUE; |
|
125 } |
|
126 |
|
127 UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { |
|
128 int32_t cpLength=U16_LENGTH(c); |
|
129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { |
|
130 return FALSE; |
|
131 } |
|
132 remainingCapacity-=cpLength; |
|
133 if(cpLength==1) { |
|
134 *limit++=(UChar)c; |
|
135 } else { |
|
136 limit[0]=U16_LEAD(c); |
|
137 limit[1]=U16_TRAIL(c); |
|
138 limit+=2; |
|
139 } |
|
140 lastCC=0; |
|
141 reorderStart=limit; |
|
142 return TRUE; |
|
143 } |
|
144 |
|
145 UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { |
|
146 if(s==sLimit) { |
|
147 return TRUE; |
|
148 } |
|
149 int32_t length=(int32_t)(sLimit-s); |
|
150 if(remainingCapacity<length && !resize(length, errorCode)) { |
|
151 return FALSE; |
|
152 } |
|
153 u_memcpy(limit, s, length); |
|
154 limit+=length; |
|
155 remainingCapacity-=length; |
|
156 lastCC=0; |
|
157 reorderStart=limit; |
|
158 return TRUE; |
|
159 } |
|
160 |
|
161 void ReorderingBuffer::remove() { |
|
162 reorderStart=limit=start; |
|
163 remainingCapacity=str.getCapacity(); |
|
164 lastCC=0; |
|
165 } |
|
166 |
|
167 void ReorderingBuffer::removeSuffix(int32_t suffixLength) { |
|
168 if(suffixLength<(limit-start)) { |
|
169 limit-=suffixLength; |
|
170 remainingCapacity+=suffixLength; |
|
171 } else { |
|
172 limit=start; |
|
173 remainingCapacity=str.getCapacity(); |
|
174 } |
|
175 lastCC=0; |
|
176 reorderStart=limit; |
|
177 } |
|
178 |
|
179 UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { |
|
180 int32_t reorderStartIndex=(int32_t)(reorderStart-start); |
|
181 int32_t length=(int32_t)(limit-start); |
|
182 str.releaseBuffer(length); |
|
183 int32_t newCapacity=length+appendLength; |
|
184 int32_t doubleCapacity=2*str.getCapacity(); |
|
185 if(newCapacity<doubleCapacity) { |
|
186 newCapacity=doubleCapacity; |
|
187 } |
|
188 if(newCapacity<256) { |
|
189 newCapacity=256; |
|
190 } |
|
191 start=str.getBuffer(newCapacity); |
|
192 if(start==NULL) { |
|
193 // getBuffer() already did str.setToBogus() |
|
194 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
195 return FALSE; |
|
196 } |
|
197 reorderStart=start+reorderStartIndex; |
|
198 limit=start+length; |
|
199 remainingCapacity=str.getCapacity()-length; |
|
200 return TRUE; |
|
201 } |
|
202 |
|
203 void ReorderingBuffer::skipPrevious() { |
|
204 codePointLimit=codePointStart; |
|
205 UChar c=*--codePointStart; |
|
206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { |
|
207 --codePointStart; |
|
208 } |
|
209 } |
|
210 |
|
211 uint8_t ReorderingBuffer::previousCC() { |
|
212 codePointLimit=codePointStart; |
|
213 if(reorderStart>=codePointStart) { |
|
214 return 0; |
|
215 } |
|
216 UChar32 c=*--codePointStart; |
|
217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) { |
|
218 return 0; |
|
219 } |
|
220 |
|
221 UChar c2; |
|
222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { |
|
223 --codePointStart; |
|
224 c=U16_GET_SUPPLEMENTARY(c2, c); |
|
225 } |
|
226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c)); |
|
227 } |
|
228 |
|
229 // Inserts c somewhere before the last character. |
|
230 // Requires 0<cc<lastCC which implies reorderStart<limit. |
|
231 void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { |
|
232 for(setIterator(), skipPrevious(); previousCC()>cc;) {} |
|
233 // insert c at codePointLimit, after the character with prevCC<=cc |
|
234 UChar *q=limit; |
|
235 UChar *r=limit+=U16_LENGTH(c); |
|
236 do { |
|
237 *--r=*--q; |
|
238 } while(codePointLimit!=q); |
|
239 writeCodePoint(q, c); |
|
240 if(cc<=1) { |
|
241 reorderStart=r; |
|
242 } |
|
243 } |
|
244 |
|
245 // Normalizer2Impl --------------------------------------------------------- *** |
|
246 |
|
247 struct CanonIterData : public UMemory { |
|
248 CanonIterData(UErrorCode &errorCode); |
|
249 ~CanonIterData(); |
|
250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode); |
|
251 UTrie2 *trie; |
|
252 UVector canonStartSets; // contains UnicodeSet * |
|
253 }; |
|
254 |
|
255 Normalizer2Impl::~Normalizer2Impl() { |
|
256 udata_close(memory); |
|
257 utrie2_close(normTrie); |
|
258 delete fCanonIterData; |
|
259 } |
|
260 |
|
261 UBool U_CALLCONV |
|
262 Normalizer2Impl::isAcceptable(void *context, |
|
263 const char * /* type */, const char * /*name*/, |
|
264 const UDataInfo *pInfo) { |
|
265 if( |
|
266 pInfo->size>=20 && |
|
267 pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
|
268 pInfo->charsetFamily==U_CHARSET_FAMILY && |
|
269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ |
|
270 pInfo->dataFormat[1]==0x72 && |
|
271 pInfo->dataFormat[2]==0x6d && |
|
272 pInfo->dataFormat[3]==0x32 && |
|
273 pInfo->formatVersion[0]==2 |
|
274 ) { |
|
275 Normalizer2Impl *me=(Normalizer2Impl *)context; |
|
276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); |
|
277 return TRUE; |
|
278 } else { |
|
279 return FALSE; |
|
280 } |
|
281 } |
|
282 |
|
283 void |
|
284 Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { |
|
285 if(U_FAILURE(errorCode)) { |
|
286 return; |
|
287 } |
|
288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); |
|
289 if(U_FAILURE(errorCode)) { |
|
290 return; |
|
291 } |
|
292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); |
|
293 const int32_t *inIndexes=(const int32_t *)inBytes; |
|
294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; |
|
295 if(indexesLength<=IX_MIN_MAYBE_YES) { |
|
296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. |
|
297 return; |
|
298 } |
|
299 |
|
300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; |
|
301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; |
|
302 |
|
303 minYesNo=inIndexes[IX_MIN_YES_NO]; |
|
304 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; |
|
305 minNoNo=inIndexes[IX_MIN_NO_NO]; |
|
306 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; |
|
307 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; |
|
308 |
|
309 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; |
|
310 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; |
|
311 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
|
312 inBytes+offset, nextOffset-offset, NULL, |
|
313 &errorCode); |
|
314 if(U_FAILURE(errorCode)) { |
|
315 return; |
|
316 } |
|
317 |
|
318 offset=nextOffset; |
|
319 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; |
|
320 maybeYesCompositions=(const uint16_t *)(inBytes+offset); |
|
321 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); |
|
322 |
|
323 // smallFCD: new in formatVersion 2 |
|
324 offset=nextOffset; |
|
325 smallFCD=inBytes+offset; |
|
326 |
|
327 // Build tccc180[]. |
|
328 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. |
|
329 uint8_t bits=0; |
|
330 for(UChar c=0; c<0x180; bits>>=1) { |
|
331 if((c&0xff)==0) { |
|
332 bits=smallFCD[c>>8]; // one byte per 0x100 code points |
|
333 } |
|
334 if(bits&1) { |
|
335 for(int i=0; i<0x20; ++i, ++c) { |
|
336 tccc180[c]=(uint8_t)getFCD16FromNormData(c); |
|
337 } |
|
338 } else { |
|
339 uprv_memset(tccc180+c, 0, 0x20); |
|
340 c+=0x20; |
|
341 } |
|
342 } |
|
343 } |
|
344 |
|
345 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const { |
|
346 UChar32 c; |
|
347 if(cpStart==(cpLimit-1)) { |
|
348 c=*cpStart; |
|
349 } else { |
|
350 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]); |
|
351 } |
|
352 uint16_t prevNorm16=getNorm16(c); |
|
353 if(prevNorm16<=minYesNo) { |
|
354 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 |
|
355 } else { |
|
356 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo |
|
357 } |
|
358 } |
|
359 |
|
360 U_CDECL_BEGIN |
|
361 |
|
362 static UBool U_CALLCONV |
|
363 enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { |
|
364 /* add the start code point to the USet */ |
|
365 const USetAdder *sa=(const USetAdder *)context; |
|
366 sa->add(sa->set, start); |
|
367 return TRUE; |
|
368 } |
|
369 |
|
370 static uint32_t U_CALLCONV |
|
371 segmentStarterMapper(const void * /*context*/, uint32_t value) { |
|
372 return value&CANON_NOT_SEGMENT_STARTER; |
|
373 } |
|
374 |
|
375 U_CDECL_END |
|
376 |
|
377 void |
|
378 Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { |
|
379 /* add the start code point of each same-value range of each trie */ |
|
380 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); |
|
381 |
|
382 /* add Hangul LV syllables and LV+1 because of skippables */ |
|
383 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { |
|
384 sa->add(sa->set, c); |
|
385 sa->add(sa->set, c+1); |
|
386 } |
|
387 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ |
|
388 } |
|
389 |
|
390 void |
|
391 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { |
|
392 /* add the start code point of each same-value range of the canonical iterator data trie */ |
|
393 if(ensureCanonIterData(errorCode)) { |
|
394 // currently only used for the SEGMENT_STARTER property |
|
395 utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa); |
|
396 } |
|
397 } |
|
398 |
|
399 const UChar * |
|
400 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, |
|
401 UChar32 minNeedDataCP, |
|
402 ReorderingBuffer *buffer, |
|
403 UErrorCode &errorCode) const { |
|
404 // Make some effort to support NUL-terminated strings reasonably. |
|
405 // Take the part of the fast quick check loop that does not look up |
|
406 // data and check the first part of the string. |
|
407 // After this prefix, determine the string length to simplify the rest |
|
408 // of the code. |
|
409 const UChar *prevSrc=src; |
|
410 UChar c; |
|
411 while((c=*src++)<minNeedDataCP && c!=0) {} |
|
412 // Back out the last character for full processing. |
|
413 // Copy this prefix. |
|
414 if(--src!=prevSrc) { |
|
415 if(buffer!=NULL) { |
|
416 buffer->appendZeroCC(prevSrc, src, errorCode); |
|
417 } |
|
418 } |
|
419 return src; |
|
420 } |
|
421 |
|
422 // Dual functionality: |
|
423 // buffer!=NULL: normalize |
|
424 // buffer==NULL: isNormalized/spanQuickCheckYes |
|
425 const UChar * |
|
426 Normalizer2Impl::decompose(const UChar *src, const UChar *limit, |
|
427 ReorderingBuffer *buffer, |
|
428 UErrorCode &errorCode) const { |
|
429 UChar32 minNoCP=minDecompNoCP; |
|
430 if(limit==NULL) { |
|
431 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode); |
|
432 if(U_FAILURE(errorCode)) { |
|
433 return src; |
|
434 } |
|
435 limit=u_strchr(src, 0); |
|
436 } |
|
437 |
|
438 const UChar *prevSrc; |
|
439 UChar32 c=0; |
|
440 uint16_t norm16=0; |
|
441 |
|
442 // only for quick check |
|
443 const UChar *prevBoundary=src; |
|
444 uint8_t prevCC=0; |
|
445 |
|
446 for(;;) { |
|
447 // count code units below the minimum or with irrelevant data for the quick check |
|
448 for(prevSrc=src; src!=limit;) { |
|
449 if( (c=*src)<minNoCP || |
|
450 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) |
|
451 ) { |
|
452 ++src; |
|
453 } else if(!U16_IS_SURROGATE(c)) { |
|
454 break; |
|
455 } else { |
|
456 UChar c2; |
|
457 if(U16_IS_SURROGATE_LEAD(c)) { |
|
458 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { |
|
459 c=U16_GET_SUPPLEMENTARY(c, c2); |
|
460 } |
|
461 } else /* trail surrogate */ { |
|
462 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { |
|
463 --src; |
|
464 c=U16_GET_SUPPLEMENTARY(c2, c); |
|
465 } |
|
466 } |
|
467 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { |
|
468 src+=U16_LENGTH(c); |
|
469 } else { |
|
470 break; |
|
471 } |
|
472 } |
|
473 } |
|
474 // copy these code units all at once |
|
475 if(src!=prevSrc) { |
|
476 if(buffer!=NULL) { |
|
477 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { |
|
478 break; |
|
479 } |
|
480 } else { |
|
481 prevCC=0; |
|
482 prevBoundary=src; |
|
483 } |
|
484 } |
|
485 if(src==limit) { |
|
486 break; |
|
487 } |
|
488 |
|
489 // Check one above-minimum, relevant code point. |
|
490 src+=U16_LENGTH(c); |
|
491 if(buffer!=NULL) { |
|
492 if(!decompose(c, norm16, *buffer, errorCode)) { |
|
493 break; |
|
494 } |
|
495 } else { |
|
496 if(isDecompYes(norm16)) { |
|
497 uint8_t cc=getCCFromYesOrMaybe(norm16); |
|
498 if(prevCC<=cc || cc==0) { |
|
499 prevCC=cc; |
|
500 if(cc<=1) { |
|
501 prevBoundary=src; |
|
502 } |
|
503 continue; |
|
504 } |
|
505 } |
|
506 return prevBoundary; // "no" or cc out of order |
|
507 } |
|
508 } |
|
509 return src; |
|
510 } |
|
511 |
|
512 // Decompose a short piece of text which is likely to contain characters that |
|
513 // fail the quick check loop and/or where the quick check loop's overhead |
|
514 // is unlikely to be amortized. |
|
515 // Called by the compose() and makeFCD() implementations. |
|
516 UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit, |
|
517 ReorderingBuffer &buffer, |
|
518 UErrorCode &errorCode) const { |
|
519 while(src<limit) { |
|
520 UChar32 c; |
|
521 uint16_t norm16; |
|
522 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16); |
|
523 if(!decompose(c, norm16, buffer, errorCode)) { |
|
524 return FALSE; |
|
525 } |
|
526 } |
|
527 return TRUE; |
|
528 } |
|
529 |
|
530 UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, |
|
531 ReorderingBuffer &buffer, |
|
532 UErrorCode &errorCode) const { |
|
533 // Only loops for 1:1 algorithmic mappings. |
|
534 for(;;) { |
|
535 // get the decomposition and the lead and trail cc's |
|
536 if(isDecompYes(norm16)) { |
|
537 // c does not decompose |
|
538 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); |
|
539 } else if(isHangul(norm16)) { |
|
540 // Hangul syllable: decompose algorithmically |
|
541 UChar jamos[3]; |
|
542 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); |
|
543 } else if(isDecompNoAlgorithmic(norm16)) { |
|
544 c=mapAlgorithmic(c, norm16); |
|
545 norm16=getNorm16(c); |
|
546 } else { |
|
547 // c decomposes, get everything from the variable-length extra data |
|
548 const uint16_t *mapping=getMapping(norm16); |
|
549 uint16_t firstUnit=*mapping; |
|
550 int32_t length=firstUnit&MAPPING_LENGTH_MASK; |
|
551 uint8_t leadCC, trailCC; |
|
552 trailCC=(uint8_t)(firstUnit>>8); |
|
553 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { |
|
554 leadCC=(uint8_t)(*(mapping-1)>>8); |
|
555 } else { |
|
556 leadCC=0; |
|
557 } |
|
558 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode); |
|
559 } |
|
560 } |
|
561 } |
|
562 |
|
563 const UChar * |
|
564 Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { |
|
565 const UChar *decomp=NULL; |
|
566 uint16_t norm16; |
|
567 for(;;) { |
|
568 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { |
|
569 // c does not decompose |
|
570 return decomp; |
|
571 } else if(isHangul(norm16)) { |
|
572 // Hangul syllable: decompose algorithmically |
|
573 length=Hangul::decompose(c, buffer); |
|
574 return buffer; |
|
575 } else if(isDecompNoAlgorithmic(norm16)) { |
|
576 c=mapAlgorithmic(c, norm16); |
|
577 decomp=buffer; |
|
578 length=0; |
|
579 U16_APPEND_UNSAFE(buffer, length, c); |
|
580 } else { |
|
581 // c decomposes, get everything from the variable-length extra data |
|
582 const uint16_t *mapping=getMapping(norm16); |
|
583 length=*mapping&MAPPING_LENGTH_MASK; |
|
584 return (const UChar *)mapping+1; |
|
585 } |
|
586 } |
|
587 } |
|
588 |
|
589 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 |
|
590 // so that a raw mapping fits that consists of one unit ("rm0") |
|
591 // plus all but the first two code units of the normal mapping. |
|
592 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK. |
|
593 const UChar * |
|
594 Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { |
|
595 // We do not loop in this method because an algorithmic mapping itself |
|
596 // becomes a final result rather than having to be decomposed recursively. |
|
597 uint16_t norm16; |
|
598 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { |
|
599 // c does not decompose |
|
600 return NULL; |
|
601 } else if(isHangul(norm16)) { |
|
602 // Hangul syllable: decompose algorithmically |
|
603 Hangul::getRawDecomposition(c, buffer); |
|
604 length=2; |
|
605 return buffer; |
|
606 } else if(isDecompNoAlgorithmic(norm16)) { |
|
607 c=mapAlgorithmic(c, norm16); |
|
608 length=0; |
|
609 U16_APPEND_UNSAFE(buffer, length, c); |
|
610 return buffer; |
|
611 } else { |
|
612 // c decomposes, get everything from the variable-length extra data |
|
613 const uint16_t *mapping=getMapping(norm16); |
|
614 uint16_t firstUnit=*mapping; |
|
615 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping |
|
616 if(firstUnit&MAPPING_HAS_RAW_MAPPING) { |
|
617 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. |
|
618 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD |
|
619 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1; |
|
620 uint16_t rm0=*rawMapping; |
|
621 if(rm0<=MAPPING_LENGTH_MASK) { |
|
622 length=rm0; |
|
623 return (const UChar *)rawMapping-rm0; |
|
624 } else { |
|
625 // Copy the normal mapping and replace its first two code units with rm0. |
|
626 buffer[0]=(UChar)rm0; |
|
627 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2); |
|
628 length=mLength-1; |
|
629 return buffer; |
|
630 } |
|
631 } else { |
|
632 length=mLength; |
|
633 return (const UChar *)mapping+1; |
|
634 } |
|
635 } |
|
636 } |
|
637 |
|
638 void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit, |
|
639 UBool doDecompose, |
|
640 UnicodeString &safeMiddle, |
|
641 ReorderingBuffer &buffer, |
|
642 UErrorCode &errorCode) const { |
|
643 buffer.copyReorderableSuffixTo(safeMiddle); |
|
644 if(doDecompose) { |
|
645 decompose(src, limit, &buffer, errorCode); |
|
646 return; |
|
647 } |
|
648 // Just merge the strings at the boundary. |
|
649 ForwardUTrie2StringIterator iter(normTrie, src, limit); |
|
650 uint8_t firstCC, prevCC, cc; |
|
651 firstCC=prevCC=cc=getCC(iter.next16()); |
|
652 while(cc!=0) { |
|
653 prevCC=cc; |
|
654 cc=getCC(iter.next16()); |
|
655 }; |
|
656 if(limit==NULL) { // appendZeroCC() needs limit!=NULL |
|
657 limit=u_strchr(iter.codePointStart, 0); |
|
658 } |
|
659 |
|
660 if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) { |
|
661 buffer.appendZeroCC(iter.codePointStart, limit, errorCode); |
|
662 } |
|
663 } |
|
664 |
|
665 // Note: hasDecompBoundary() could be implemented as aliases to |
|
666 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() |
|
667 // at the cost of building the FCD trie for a decomposition normalizer. |
|
668 UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const { |
|
669 for(;;) { |
|
670 if(c<minDecompNoCP) { |
|
671 return TRUE; |
|
672 } |
|
673 uint16_t norm16=getNorm16(c); |
|
674 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { |
|
675 return TRUE; |
|
676 } else if(norm16>MIN_NORMAL_MAYBE_YES) { |
|
677 return FALSE; // ccc!=0 |
|
678 } else if(isDecompNoAlgorithmic(norm16)) { |
|
679 c=mapAlgorithmic(c, norm16); |
|
680 } else { |
|
681 // c decomposes, get everything from the variable-length extra data |
|
682 const uint16_t *mapping=getMapping(norm16); |
|
683 uint16_t firstUnit=*mapping; |
|
684 if((firstUnit&MAPPING_LENGTH_MASK)==0) { |
|
685 return FALSE; |
|
686 } |
|
687 if(!before) { |
|
688 // decomp after-boundary: same as hasFCDBoundaryAfter(), |
|
689 // fcd16<=1 || trailCC==0 |
|
690 if(firstUnit>0x1ff) { |
|
691 return FALSE; // trailCC>1 |
|
692 } |
|
693 if(firstUnit<=0xff) { |
|
694 return TRUE; // trailCC==0 |
|
695 } |
|
696 // if(trailCC==1) test leadCC==0, same as checking for before-boundary |
|
697 } |
|
698 // TRUE if leadCC==0 (hasFCDBoundaryBefore()) |
|
699 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; |
|
700 } |
|
701 } |
|
702 } |
|
703 |
|
704 /* |
|
705 * Finds the recomposition result for |
|
706 * a forward-combining "lead" character, |
|
707 * specified with a pointer to its compositions list, |
|
708 * and a backward-combining "trail" character. |
|
709 * |
|
710 * If the lead and trail characters combine, then this function returns |
|
711 * the following "compositeAndFwd" value: |
|
712 * Bits 21..1 composite character |
|
713 * Bit 0 set if the composite is a forward-combining starter |
|
714 * otherwise it returns -1. |
|
715 * |
|
716 * The compositions list has (trail, compositeAndFwd) pair entries, |
|
717 * encoded as either pairs or triples of 16-bit units. |
|
718 * The last entry has the high bit of its first unit set. |
|
719 * |
|
720 * The list is sorted by ascending trail characters (there are no duplicates). |
|
721 * A linear search is used. |
|
722 * |
|
723 * See normalizer2impl.h for a more detailed description |
|
724 * of the compositions list format. |
|
725 */ |
|
726 int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { |
|
727 uint16_t key1, firstUnit; |
|
728 if(trail<COMP_1_TRAIL_LIMIT) { |
|
729 // trail character is 0..33FF |
|
730 // result entry may have 2 or 3 units |
|
731 key1=(uint16_t)(trail<<1); |
|
732 while(key1>(firstUnit=*list)) { |
|
733 list+=2+(firstUnit&COMP_1_TRIPLE); |
|
734 } |
|
735 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { |
|
736 if(firstUnit&COMP_1_TRIPLE) { |
|
737 return ((int32_t)list[1]<<16)|list[2]; |
|
738 } else { |
|
739 return list[1]; |
|
740 } |
|
741 } |
|
742 } else { |
|
743 // trail character is 3400..10FFFF |
|
744 // result entry has 3 units |
|
745 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ |
|
746 (((trail>>COMP_1_TRAIL_SHIFT))& |
|
747 ~COMP_1_TRIPLE)); |
|
748 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); |
|
749 uint16_t secondUnit; |
|
750 for(;;) { |
|
751 if(key1>(firstUnit=*list)) { |
|
752 list+=2+(firstUnit&COMP_1_TRIPLE); |
|
753 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { |
|
754 if(key2>(secondUnit=list[1])) { |
|
755 if(firstUnit&COMP_1_LAST_TUPLE) { |
|
756 break; |
|
757 } else { |
|
758 list+=3; |
|
759 } |
|
760 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { |
|
761 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; |
|
762 } else { |
|
763 break; |
|
764 } |
|
765 } else { |
|
766 break; |
|
767 } |
|
768 } |
|
769 } |
|
770 return -1; |
|
771 } |
|
772 |
|
773 /** |
|
774 * @param list some character's compositions list |
|
775 * @param set recursively receives the composites from these compositions |
|
776 */ |
|
777 void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { |
|
778 uint16_t firstUnit; |
|
779 int32_t compositeAndFwd; |
|
780 do { |
|
781 firstUnit=*list; |
|
782 if((firstUnit&COMP_1_TRIPLE)==0) { |
|
783 compositeAndFwd=list[1]; |
|
784 list+=2; |
|
785 } else { |
|
786 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; |
|
787 list+=3; |
|
788 } |
|
789 UChar32 composite=compositeAndFwd>>1; |
|
790 if((compositeAndFwd&1)!=0) { |
|
791 addComposites(getCompositionsListForComposite(getNorm16(composite)), set); |
|
792 } |
|
793 set.add(composite); |
|
794 } while((firstUnit&COMP_1_LAST_TUPLE)==0); |
|
795 } |
|
796 |
|
797 /* |
|
798 * Recomposes the buffer text starting at recomposeStartIndex |
|
799 * (which is in NFD - decomposed and canonically ordered), |
|
800 * and truncates the buffer contents. |
|
801 * |
|
802 * Note that recomposition never lengthens the text: |
|
803 * Any character consists of either one or two code units; |
|
804 * a composition may contain at most one more code unit than the original starter, |
|
805 * while the combining mark that is removed has at least one code unit. |
|
806 */ |
|
807 void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, |
|
808 UBool onlyContiguous) const { |
|
809 UChar *p=buffer.getStart()+recomposeStartIndex; |
|
810 UChar *limit=buffer.getLimit(); |
|
811 if(p==limit) { |
|
812 return; |
|
813 } |
|
814 |
|
815 UChar *starter, *pRemove, *q, *r; |
|
816 const uint16_t *compositionsList; |
|
817 UChar32 c, compositeAndFwd; |
|
818 uint16_t norm16; |
|
819 uint8_t cc, prevCC; |
|
820 UBool starterIsSupplementary; |
|
821 |
|
822 // Some of the following variables are not used until we have a forward-combining starter |
|
823 // and are only initialized now to avoid compiler warnings. |
|
824 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter |
|
825 starter=NULL; |
|
826 starterIsSupplementary=FALSE; |
|
827 prevCC=0; |
|
828 |
|
829 for(;;) { |
|
830 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16); |
|
831 cc=getCCFromYesOrMaybe(norm16); |
|
832 if( // this character combines backward and |
|
833 isMaybe(norm16) && |
|
834 // we have seen a starter that combines forward and |
|
835 compositionsList!=NULL && |
|
836 // the backward-combining character is not blocked |
|
837 (prevCC<cc || prevCC==0) |
|
838 ) { |
|
839 if(isJamoVT(norm16)) { |
|
840 // c is a Jamo V/T, see if we can compose it with the previous character. |
|
841 if(c<Hangul::JAMO_T_BASE) { |
|
842 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. |
|
843 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE); |
|
844 if(prev<Hangul::JAMO_L_COUNT) { |
|
845 pRemove=p-1; |
|
846 UChar syllable=(UChar) |
|
847 (Hangul::HANGUL_BASE+ |
|
848 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* |
|
849 Hangul::JAMO_T_COUNT); |
|
850 UChar t; |
|
851 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { |
|
852 ++p; |
|
853 syllable+=t; // The next character was a Jamo T. |
|
854 } |
|
855 *starter=syllable; |
|
856 // remove the Jamo V/T |
|
857 q=pRemove; |
|
858 r=p; |
|
859 while(r<limit) { |
|
860 *q++=*r++; |
|
861 } |
|
862 limit=q; |
|
863 p=pRemove; |
|
864 } |
|
865 } |
|
866 /* |
|
867 * No "else" for Jamo T: |
|
868 * Since the input is in NFD, there are no Hangul LV syllables that |
|
869 * a Jamo T could combine with. |
|
870 * All Jamo Ts are combined above when handling Jamo Vs. |
|
871 */ |
|
872 if(p==limit) { |
|
873 break; |
|
874 } |
|
875 compositionsList=NULL; |
|
876 continue; |
|
877 } else if((compositeAndFwd=combine(compositionsList, c))>=0) { |
|
878 // The starter and the combining mark (c) do combine. |
|
879 UChar32 composite=compositeAndFwd>>1; |
|
880 |
|
881 // Replace the starter with the composite, remove the combining mark. |
|
882 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark |
|
883 if(starterIsSupplementary) { |
|
884 if(U_IS_SUPPLEMENTARY(composite)) { |
|
885 // both are supplementary |
|
886 starter[0]=U16_LEAD(composite); |
|
887 starter[1]=U16_TRAIL(composite); |
|
888 } else { |
|
889 *starter=(UChar)composite; |
|
890 // The composite is shorter than the starter, |
|
891 // move the intermediate characters forward one. |
|
892 starterIsSupplementary=FALSE; |
|
893 q=starter+1; |
|
894 r=q+1; |
|
895 while(r<pRemove) { |
|
896 *q++=*r++; |
|
897 } |
|
898 --pRemove; |
|
899 } |
|
900 } else if(U_IS_SUPPLEMENTARY(composite)) { |
|
901 // The composite is longer than the starter, |
|
902 // move the intermediate characters back one. |
|
903 starterIsSupplementary=TRUE; |
|
904 ++starter; // temporarily increment for the loop boundary |
|
905 q=pRemove; |
|
906 r=++pRemove; |
|
907 while(starter<q) { |
|
908 *--r=*--q; |
|
909 } |
|
910 *starter=U16_TRAIL(composite); |
|
911 *--starter=U16_LEAD(composite); // undo the temporary increment |
|
912 } else { |
|
913 // both are on the BMP |
|
914 *starter=(UChar)composite; |
|
915 } |
|
916 |
|
917 /* remove the combining mark by moving the following text over it */ |
|
918 if(pRemove<p) { |
|
919 q=pRemove; |
|
920 r=p; |
|
921 while(r<limit) { |
|
922 *q++=*r++; |
|
923 } |
|
924 limit=q; |
|
925 p=pRemove; |
|
926 } |
|
927 // Keep prevCC because we removed the combining mark. |
|
928 |
|
929 if(p==limit) { |
|
930 break; |
|
931 } |
|
932 // Is the composite a starter that combines forward? |
|
933 if(compositeAndFwd&1) { |
|
934 compositionsList= |
|
935 getCompositionsListForComposite(getNorm16(composite)); |
|
936 } else { |
|
937 compositionsList=NULL; |
|
938 } |
|
939 |
|
940 // We combined; continue with looking for compositions. |
|
941 continue; |
|
942 } |
|
943 } |
|
944 |
|
945 // no combination this time |
|
946 prevCC=cc; |
|
947 if(p==limit) { |
|
948 break; |
|
949 } |
|
950 |
|
951 // If c did not combine, then check if it is a starter. |
|
952 if(cc==0) { |
|
953 // Found a new starter. |
|
954 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) { |
|
955 // It may combine with something, prepare for it. |
|
956 if(U_IS_BMP(c)) { |
|
957 starterIsSupplementary=FALSE; |
|
958 starter=p-1; |
|
959 } else { |
|
960 starterIsSupplementary=TRUE; |
|
961 starter=p-2; |
|
962 } |
|
963 } |
|
964 } else if(onlyContiguous) { |
|
965 // FCC: no discontiguous compositions; any intervening character blocks. |
|
966 compositionsList=NULL; |
|
967 } |
|
968 } |
|
969 buffer.setReorderingLimit(limit); |
|
970 } |
|
971 |
|
972 UChar32 |
|
973 Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { |
|
974 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0 |
|
975 const uint16_t *list; |
|
976 if(isInert(norm16)) { |
|
977 return U_SENTINEL; |
|
978 } else if(norm16<minYesNoMappingsOnly) { |
|
979 if(isJamoL(norm16)) { |
|
980 b-=Hangul::JAMO_V_BASE; |
|
981 if(0<=b && b<Hangul::JAMO_V_COUNT) { |
|
982 return |
|
983 (Hangul::HANGUL_BASE+ |
|
984 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)* |
|
985 Hangul::JAMO_T_COUNT); |
|
986 } else { |
|
987 return U_SENTINEL; |
|
988 } |
|
989 } else if(isHangul(norm16)) { |
|
990 b-=Hangul::JAMO_T_BASE; |
|
991 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0! |
|
992 return a+b; |
|
993 } else { |
|
994 return U_SENTINEL; |
|
995 } |
|
996 } else { |
|
997 // 'a' has a compositions list in extraData |
|
998 list=extraData+norm16; |
|
999 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list |
|
1000 list+= // mapping pointer |
|
1001 1+ // +1 to skip the first unit with the mapping lenth |
|
1002 (*list&MAPPING_LENGTH_MASK); // + mapping length |
|
1003 } |
|
1004 } |
|
1005 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { |
|
1006 return U_SENTINEL; |
|
1007 } else { |
|
1008 list=maybeYesCompositions+norm16-minMaybeYes; |
|
1009 } |
|
1010 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b |
|
1011 return U_SENTINEL; |
|
1012 } |
|
1013 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC |
|
1014 return combine(list, b)>>1; |
|
1015 #else |
|
1016 int32_t compositeAndFwd=combine(list, b); |
|
1017 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL; |
|
1018 #endif |
|
1019 } |
|
1020 |
|
1021 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. |
|
1022 // doCompose: normalize |
|
1023 // !doCompose: isNormalized (buffer must be empty and initialized) |
|
// Parameters:
//   src, limit     - the input text [src, limit[; limit==NULL means that src is NUL-terminated
//                    (the real limit is then located with u_strchr() after the low prefix).
//   onlyContiguous - TRUE enables the "FCC" checks below (contiguous composition only).
//   doCompose      - TRUE: append the composed text to buffer; FALSE: only test the text.
//   buffer         - receives the output when doCompose; when !doCompose it is used as scratch
//                    space for decompose+recompose comparisons and emptied again (buffer.remove()).
//   errorCode      - ICU error code, passed through to the helper functions.
// Returns FALSE if errorCode is a failure after the low-prefix copy, or if !doCompose and the
// text is found not to be normalized. Returns TRUE otherwise -- including when the main loop
// is left via break because an append or decomposeShort() returned FALSE, so callers
// presumably also need to check errorCode (verify against the buffer implementation).
1024 UBool |
|
1025 Normalizer2Impl::compose(const UChar *src, const UChar *limit, |
|
1026 UBool onlyContiguous, |
|
1027 UBool doCompose, |
|
1028 ReorderingBuffer &buffer, |
|
1029 UErrorCode &errorCode) const { |
|
1030 /* |
|
1031 * prevBoundary points to the last character before the current one |
|
1032 * that has a composition boundary before it with ccc==0 and quick check "yes". |
|
1033 * Keeping track of prevBoundary saves us looking for a composition boundary |
|
1034 * when we find a "no" or "maybe". |
|
1035 * |
|
1036 * When we back out from prevSrc back to prevBoundary, |
|
1037 * then we also remove those same characters (which had been simply copied |
|
1038 * or canonically-order-inserted) from the ReorderingBuffer. |
|
1039 * Therefore, at all times, the [prevBoundary..prevSrc[ source units |
|
1040 * must correspond 1:1 to destination units at the end of the destination buffer. |
|
1041 */ |
|
1042 const UChar *prevBoundary=src; |
|
1043 UChar32 minNoMaybeCP=minCompNoMaybeCP; |
|
// NUL-terminated input: handle the prefix of code units below minNoMaybeCP first
// (copied to the buffer only when doCompose), then locate the real limit.
1044 if(limit==NULL) { |
|
1045 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, |
|
1046 doCompose ? &buffer : NULL, |
|
1047 errorCode); |
|
1048 if(U_FAILURE(errorCode)) { |
|
1049 return FALSE; |
|
1050 } |
|
1051 if(prevBoundary<src) { |
|
1052 // Set prevBoundary to the last character in the prefix. |
|
1053 prevBoundary=src-1; |
|
1054 } |
|
1055 limit=u_strchr(src, 0); |
|
1056 } |
|
1057 |
|
1058 const UChar *prevSrc; |
|
1059 UChar32 c=0; |
|
1060 uint16_t norm16=0; |
|
1061 |
|
1062 // only for isNormalized |
|
1063 uint8_t prevCC=0; |
|
1064 |
|
1065 for(;;) { |
|
1066 // count code units below the minimum or with irrelevant data for the quick check |
|
1067 for(prevSrc=src; src!=limit;) { |
|
1068 if( (c=*src)<minNoMaybeCP || |
|
1069 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) |
|
1070 ) { |
|
1071 ++src; |
|
1072 } else if(!U16_IS_SURROGATE(c)) { |
|
1073 break; |
|
1074 } else { |
|
// c is a surrogate code unit: try to assemble the supplementary code point from the
// following trail or the preceding lead surrogate, then look up its norm16.
1075 UChar c2; |
|
1076 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1077 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { |
|
1078 c=U16_GET_SUPPLEMENTARY(c, c2); |
|
1079 } |
|
1080 } else /* trail surrogate */ { |
|
1081 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { |
|
1082 --src; |
|
1083 c=U16_GET_SUPPLEMENTARY(c2, c); |
|
1084 } |
|
1085 } |
|
1086 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { |
|
1087 src+=U16_LENGTH(c); |
|
1088 } else { |
|
1089 break; |
|
1090 } |
|
1091 } |
|
1092 } |
|
1093 // copy these code units all at once |
|
1094 if(src!=prevSrc) { |
|
1095 if(doCompose) { |
|
1096 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) { |
|
1097 break; |
|
1098 } |
|
1099 } else { |
|
1100 prevCC=0; |
|
1101 } |
|
1102 if(src==limit) { |
|
1103 break; |
|
1104 } |
|
1105 // Set prevBoundary to the last character in the quick check loop. |
|
1106 prevBoundary=src-1; |
|
1107 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && |
|
1108 U16_IS_LEAD(*(prevBoundary-1)) |
|
1109 ) { |
|
1110 --prevBoundary; |
|
1111 } |
|
1112 // The start of the current character (c). |
|
1113 prevSrc=src; |
|
1114 } else if(src==limit) { |
|
1115 break; |
|
1116 } |
|
1117 |
|
1118 src+=U16_LENGTH(c); |
|
1119 /* |
|
1120 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. |
|
1121 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) |
|
1122 * or has ccc!=0. |
|
1123 * Check for Jamo V/T, then for regular characters. |
|
1124 * c is not a Hangul syllable or Jamo L because those have "yes" properties. |
|
1125 */ |
|
1126 if(isJamoVT(norm16) && prevBoundary!=prevSrc) { |
|
1127 UChar prev=*(prevSrc-1); |
|
1128 UBool needToDecompose=FALSE; |
|
1129 if(c<Hangul::JAMO_T_BASE) { |
|
1130 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. |
|
1131 prev=(UChar)(prev-Hangul::JAMO_L_BASE); |
|
1132 if(prev<Hangul::JAMO_L_COUNT) { |
|
1133 if(!doCompose) { |
|
1134 return FALSE; |
|
1135 } |
|
1136 UChar syllable=(UChar) |
|
1137 (Hangul::HANGUL_BASE+ |
|
1138 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* |
|
1139 Hangul::JAMO_T_COUNT); |
|
1140 UChar t; |
|
1141 if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { |
|
1142 ++src; |
|
1143 syllable+=t; // The next character was a Jamo T. |
|
1144 prevBoundary=src; |
|
1145 buffer.setLastChar(syllable); |
|
1146 continue; |
|
1147 } |
|
1148 // If we see L+V+x where x!=T then we drop to the slow path, |
|
1149 // decompose and recompose. |
|
1150 // This is to deal with NFKC finding normal L and V but a |
|
1151 // compatibility variant of a T. We need to either fully compose that |
|
1152 // combination here (which would complicate the code and may not work |
|
1153 // with strange custom data) or use the slow path -- or else our replacing |
|
1154 // two input characters (L+V) with one output character (LV syllable) |
|
1155 // would violate the invariant that [prevBoundary..prevSrc[ has the same |
|
1156 // length as what we appended to the buffer since prevBoundary. |
|
1157 needToDecompose=TRUE; |
|
1158 } |
|
1159 } else if(Hangul::isHangulWithoutJamoT(prev)) { |
|
1160 // c is a Jamo Trailing consonant, |
|
1161 // compose with previous Hangul LV that does not contain a Jamo T. |
|
1162 if(!doCompose) { |
|
1163 return FALSE; |
|
1164 } |
|
1165 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE)); |
|
1166 prevBoundary=src; |
|
1167 continue; |
|
1168 } |
|
1169 if(!needToDecompose) { |
|
1170 // The Jamo V/T did not compose into a Hangul syllable. |
|
1171 if(doCompose) { |
|
1172 if(!buffer.appendBMP((UChar)c, 0, errorCode)) { |
|
1173 break; |
|
1174 } |
|
1175 } else { |
|
1176 prevCC=0; |
|
1177 } |
|
1178 continue; |
|
1179 } |
|
1180 } |
|
1181 /* |
|
1182 * Source buffer pointers: |
|
1183 * |
|
1184 * all done quick check current char not yet |
|
1185 * "yes" but (c) processed |
|
1186 * may combine |
|
1187 * forward |
|
1188 * [-------------[-------------[-------------[-------------[ |
|
1189 * | | | | | |
|
1190 * orig. src prevBoundary prevSrc src limit |
|
1191 * |
|
1192 * |
|
1193 * Destination buffer pointers inside the ReorderingBuffer: |
|
1194 * |
|
1195 * all done might take not filled yet |
|
1196 * characters for |
|
1197 * reordering |
|
1198 * [-------------[-------------[-------------[ |
|
1199 * | | | | |
|
1200 * start reorderStart limit | |
|
1201 * +remainingCap.+ |
|
1202 */ |
|
1203 if(norm16>=MIN_YES_YES_WITH_CC) { |
|
1204 uint8_t cc=(uint8_t)norm16; // cc!=0 |
|
1205 if( onlyContiguous && // FCC |
|
1206 (doCompose ? buffer.getLastCC() : prevCC)==0 && |
|
1207 prevBoundary<prevSrc && |
|
1208 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that |
|
1209 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) |
|
1210 // passed the quick check "yes && ccc==0" test. |
|
1211 // Check whether the last character was a "yesYes" or a "yesNo". |
|
1212 // If a "yesNo", then we get its trailing ccc from its |
|
1213 // mapping and check for canonical order. |
|
1214 // All other cases are ok. |
|
1215 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc |
|
1216 ) { |
|
1217 // Fails FCD test, need to decompose and contiguously recompose. |
|
1218 if(!doCompose) { |
|
1219 return FALSE; |
|
1220 } |
|
1221 } else if(doCompose) { |
|
1222 if(!buffer.append(c, cc, errorCode)) { |
|
1223 break; |
|
1224 } |
|
1225 continue; |
|
1226 } else if(prevCC<=cc) { |
|
1227 prevCC=cc; |
|
1228 continue; |
|
1229 } else { |
|
1230 return FALSE; |
|
1231 } |
|
1232 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { |
|
1233 return FALSE; |
|
1234 } |
|
1235 |
|
1236 /* |
|
1237 * Find appropriate boundaries around this character, |
|
1238 * decompose the source text from between the boundaries, |
|
1239 * and recompose it. |
|
1240 * |
|
1241 * We may need to remove the last few characters from the ReorderingBuffer |
|
1242 * to account for source text that was copied or appended |
|
1243 * but needs to take part in the recomposition. |
|
1244 */ |
|
1245 |
|
1246 /* |
|
1247 * Find the last composition boundary in [prevBoundary..src[. |
|
1248 * It is either the decomposition of the current character (at prevSrc), |
|
1249 * or prevBoundary. |
|
1250 */ |
|
1251 if(hasCompBoundaryBefore(c, norm16)) { |
|
1252 prevBoundary=prevSrc; |
|
1253 } else if(doCompose) { |
|
1254 buffer.removeSuffix((int32_t)(prevSrc-prevBoundary)); |
|
1255 } |
|
1256 |
|
1257 // Find the next composition boundary in [src..limit[ - |
|
1258 // modifies src to point to the next starter. |
|
1259 src=(UChar *)findNextCompBoundary(src, limit); |
|
1260 |
|
1261 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. |
|
1262 int32_t recomposeStartIndex=buffer.length(); |
|
1263 if(!decomposeShort(prevBoundary, src, buffer, errorCode)) { |
|
1264 break; |
|
1265 } |
|
1266 recompose(buffer, recomposeStartIndex, onlyContiguous); |
|
// isNormalized mode: the recomposed text in the buffer must equal the source span
// [prevBoundary..src[; the buffer is then emptied for the next comparison.
1267 if(!doCompose) { |
|
1268 if(!buffer.equals(prevBoundary, src)) { |
|
1269 return FALSE; |
|
1270 } |
|
1271 buffer.remove(); |
|
1272 prevCC=0; |
|
1273 } |
|
1274 |
|
1275 // Move to the next starter. We never need to look back before this point again. |
|
1276 prevBoundary=src; |
|
1277 } |
|
// Reached at the end of the input, and also via break when a buffer append or
// decomposeShort() returned FALSE.
1278 return TRUE; |
|
1279 } |
|
1280 |
|
1281 // Very similar to compose(): Make the same changes in both places if relevant. |
|
1282 // pQCResult==NULL: spanQuickCheckYes |
|
1283 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES) |
|
// Parameters:
//   src, limit     - the input text [src, limit[; limit==NULL means that src is NUL-terminated.
//   onlyContiguous - TRUE enables the "FCC" check below.
//   pQCResult      - optional output; only ever written with UNORM_MAYBE or UNORM_NO.
// Returns a pointer into the input text:
//   - the end of the text (limit) if the whole text passed the scan;
//   - prevBoundary if a character fails the check (*pQCResult=UNORM_NO if pQCResult!=NULL);
//   - prevBoundary at the first "maybe" character if pQCResult==NULL.
// If pQCResult!=NULL, a "maybe" character only sets *pQCResult=UNORM_MAYBE and the scan continues.
1284 const UChar * |
|
1285 Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit, |
|
1286 UBool onlyContiguous, |
|
1287 UNormalizationCheckResult *pQCResult) const { |
|
1288 /* |
|
1289 * prevBoundary points to the last character before the current one |
|
1290 * that has a composition boundary before it with ccc==0 and quick check "yes". |
|
1291 */ |
|
1292 const UChar *prevBoundary=src; |
|
1293 UChar32 minNoMaybeCP=minCompNoMaybeCP; |
|
1294 if(limit==NULL) { |
|
// NOTE(review): this errorCode is local and never reported. No buffer is passed (NULL),
// so presumably nothing here can fail -- confirm in copyLowPrefixFromNulTerminated().
1295 UErrorCode errorCode=U_ZERO_ERROR; |
|
1296 src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode); |
|
1297 if(prevBoundary<src) { |
|
1298 // Set prevBoundary to the last character in the prefix. |
|
1299 prevBoundary=src-1; |
|
1300 } |
|
1301 limit=u_strchr(src, 0); |
|
1302 } |
|
1303 |
|
1304 const UChar *prevSrc; |
|
1305 UChar32 c=0; |
|
1306 uint16_t norm16=0; |
|
1307 uint8_t prevCC=0; |
|
1308 |
|
1309 for(;;) { |
|
1310 // count code units below the minimum or with irrelevant data for the quick check |
|
1311 for(prevSrc=src;;) { |
|
1312 if(src==limit) { |
|
1313 return src; |
|
1314 } |
|
1315 if( (c=*src)<minNoMaybeCP || |
|
1316 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c)) |
|
1317 ) { |
|
1318 ++src; |
|
1319 } else if(!U16_IS_SURROGATE(c)) { |
|
1320 break; |
|
1321 } else { |
|
// c is a surrogate code unit: try to assemble the supplementary code point from the
// following trail or the preceding lead surrogate, then look up its norm16.
1322 UChar c2; |
|
1323 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1324 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { |
|
1325 c=U16_GET_SUPPLEMENTARY(c, c2); |
|
1326 } |
|
1327 } else /* trail surrogate */ { |
|
1328 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { |
|
1329 --src; |
|
1330 c=U16_GET_SUPPLEMENTARY(c2, c); |
|
1331 } |
|
1332 } |
|
1333 if(isCompYesAndZeroCC(norm16=getNorm16(c))) { |
|
1334 src+=U16_LENGTH(c); |
|
1335 } else { |
|
1336 break; |
|
1337 } |
|
1338 } |
|
1339 } |
|
1340 if(src!=prevSrc) { |
|
1341 // Set prevBoundary to the last character in the quick check loop. |
|
1342 prevBoundary=src-1; |
|
1343 if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary && |
|
1344 U16_IS_LEAD(*(prevBoundary-1)) |
|
1345 ) { |
|
1346 --prevBoundary; |
|
1347 } |
|
1348 prevCC=0; |
|
1349 // The start of the current character (c). |
|
1350 prevSrc=src; |
|
1351 } |
|
1352 |
|
1353 src+=U16_LENGTH(c); |
|
1354 /* |
|
1355 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. |
|
1356 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) |
|
1357 * or has ccc!=0. |
|
1358 */ |
|
1359 if(isMaybeOrNonZeroCC(norm16)) { |
|
1360 uint8_t cc=getCCFromYesOrMaybe(norm16); |
|
1361 if( onlyContiguous && // FCC |
|
1362 cc!=0 && |
|
1363 prevCC==0 && |
|
1364 prevBoundary<prevSrc && |
|
1365 // prevCC==0 && prevBoundary<prevSrc tell us that |
|
1366 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) |
|
1367 // passed the quick check "yes && ccc==0" test. |
|
1368 // Check whether the last character was a "yesYes" or a "yesNo". |
|
1369 // If a "yesNo", then we get its trailing ccc from its |
|
1370 // mapping and check for canonical order. |
|
1371 // All other cases are ok. |
|
1372 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc |
|
1373 ) { |
|
1374 // Fails FCD test. |
|
// Intentionally empty: control falls out of the if-chain to the UNORM_NO handling below.
1375 } else if(prevCC<=cc || cc==0) { |
|
1376 prevCC=cc; |
|
1377 if(norm16<MIN_YES_YES_WITH_CC) { |
|
1378 if(pQCResult!=NULL) { |
|
1379 *pQCResult=UNORM_MAYBE; |
|
1380 } else { |
|
1381 return prevBoundary; |
|
1382 } |
|
1383 } |
|
1384 continue; |
|
1385 } |
|
1386 } |
|
// Quick check "no": report it if requested and return the last known boundary.
1387 if(pQCResult!=NULL) { |
|
1388 *pQCResult=UNORM_NO; |
|
1389 } |
|
1390 return prevBoundary; |
|
1391 } |
|
1392 } |
|
1393 |
|
1394 void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit, |
|
1395 UBool doCompose, |
|
1396 UBool onlyContiguous, |
|
1397 UnicodeString &safeMiddle, |
|
1398 ReorderingBuffer &buffer, |
|
1399 UErrorCode &errorCode) const { |
|
1400 if(!buffer.isEmpty()) { |
|
1401 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit); |
|
1402 if(src!=firstStarterInSrc) { |
|
1403 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), |
|
1404 buffer.getLimit()); |
|
1405 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); |
|
1406 UnicodeString middle(lastStarterInDest, destSuffixLength); |
|
1407 buffer.removeSuffix(destSuffixLength); |
|
1408 safeMiddle=middle; |
|
1409 middle.append(src, (int32_t)(firstStarterInSrc-src)); |
|
1410 const UChar *middleStart=middle.getBuffer(); |
|
1411 compose(middleStart, middleStart+middle.length(), onlyContiguous, |
|
1412 TRUE, buffer, errorCode); |
|
1413 if(U_FAILURE(errorCode)) { |
|
1414 return; |
|
1415 } |
|
1416 src=firstStarterInSrc; |
|
1417 } |
|
1418 } |
|
1419 if(doCompose) { |
|
1420 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode); |
|
1421 } else { |
|
1422 if(limit==NULL) { // appendZeroCC() needs limit!=NULL |
|
1423 limit=u_strchr(src, 0); |
|
1424 } |
|
1425 buffer.appendZeroCC(src, limit, errorCode); |
|
1426 } |
|
1427 } |
|
1428 |
|
1429 /** |
|
1430 * Does c have a composition boundary before it? |
|
1431 * True if its decomposition begins with a character that has |
|
1432 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). |
|
1433 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes |
|
1434 * (isCompYesAndZeroCC()) so we need not decompose. |
|
1435 */ |
|
1436 UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { |
|
1437 for(;;) { |
|
1438 if(isCompYesAndZeroCC(norm16)) { |
|
1439 return TRUE; |
|
1440 } else if(isMaybeOrNonZeroCC(norm16)) { |
|
1441 return FALSE; |
|
1442 } else if(isDecompNoAlgorithmic(norm16)) { |
|
1443 c=mapAlgorithmic(c, norm16); |
|
1444 norm16=getNorm16(c); |
|
1445 } else { |
|
1446 // c decomposes, get everything from the variable-length extra data |
|
1447 const uint16_t *mapping=getMapping(norm16); |
|
1448 uint16_t firstUnit=*mapping; |
|
1449 if((firstUnit&MAPPING_LENGTH_MASK)==0) { |
|
1450 return FALSE; |
|
1451 } |
|
1452 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) { |
|
1453 return FALSE; // non-zero leadCC |
|
1454 } |
|
1455 int32_t i=1; // skip over the firstUnit |
|
1456 UChar32 c; |
|
1457 U16_NEXT_UNSAFE(mapping, i, c); |
|
1458 return isCompYesAndZeroCC(getNorm16(c)); |
|
1459 } |
|
1460 } |
|
1461 } |
|
1462 |
|
1463 UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const { |
|
1464 for(;;) { |
|
1465 uint16_t norm16=getNorm16(c); |
|
1466 if(isInert(norm16)) { |
|
1467 return TRUE; |
|
1468 } else if(norm16<=minYesNo) { |
|
1469 // Hangul: norm16==minYesNo |
|
1470 // Hangul LVT has a boundary after it. |
|
1471 // Hangul LV and non-inert yesYes characters combine forward. |
|
1472 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c); |
|
1473 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) { |
|
1474 return FALSE; |
|
1475 } else if(isDecompNoAlgorithmic(norm16)) { |
|
1476 c=mapAlgorithmic(c, norm16); |
|
1477 } else { |
|
1478 // c decomposes, get everything from the variable-length extra data. |
|
1479 // If testInert, then c must be a yesNo character which has lccc=0, |
|
1480 // otherwise it could be a noNo. |
|
1481 const uint16_t *mapping=getMapping(norm16); |
|
1482 uint16_t firstUnit=*mapping; |
|
1483 // TRUE if |
|
1484 // not MAPPING_NO_COMP_BOUNDARY_AFTER |
|
1485 // (which is set if |
|
1486 // c is not deleted, and |
|
1487 // it and its decomposition do not combine forward, and it has a starter) |
|
1488 // and if FCC then trailCC<=1 |
|
1489 return |
|
1490 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 && |
|
1491 (!onlyContiguous || firstUnit<=0x1ff); |
|
1492 } |
|
1493 } |
|
1494 } |
|
1495 |
|
1496 const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const { |
|
1497 BackwardUTrie2StringIterator iter(normTrie, start, p); |
|
1498 uint16_t norm16; |
|
1499 do { |
|
1500 norm16=iter.previous16(); |
|
1501 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); |
|
1502 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, |
|
1503 // but that's probably not worth the extra cost. |
|
1504 return iter.codePointStart; |
|
1505 } |
|
1506 |
|
1507 const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const { |
|
1508 ForwardUTrie2StringIterator iter(normTrie, p, limit); |
|
1509 uint16_t norm16; |
|
1510 do { |
|
1511 norm16=iter.next16(); |
|
1512 } while(!hasCompBoundaryBefore(iter.codePoint, norm16)); |
|
1513 return iter.codePointStart; |
|
1514 } |
|
1515 |
|
1516 // Note: normalizer2impl.cpp r30982 (2011-nov-27) |
|
1517 // still had getFCDTrie() which built and cached an FCD trie. |
|
1518 // That provided faster access to FCD data than getFCD16FromNormData() |
|
1519 // but required synchronization and consumed some 10kB of heap memory |
|
1520 // in any process that uses FCD (e.g., via collation). |
|
1521 // tccc180[] and smallFCD[] are intended to help with any loss of performance, |
|
1522 // at least for Latin & CJK. |
|
1523 |
|
1524 // Gets the FCD value from the regular normalization data. |
|
1525 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { |
|
1526 // Only loops for 1:1 algorithmic mappings. |
|
1527 for(;;) { |
|
1528 uint16_t norm16=getNorm16(c); |
|
1529 if(norm16<=minYesNo) { |
|
1530 // no decomposition or Hangul syllable, all zeros |
|
1531 return 0; |
|
1532 } else if(norm16>=MIN_NORMAL_MAYBE_YES) { |
|
1533 // combining mark |
|
1534 norm16&=0xff; |
|
1535 return norm16|(norm16<<8); |
|
1536 } else if(norm16>=minMaybeYes) { |
|
1537 return 0; |
|
1538 } else if(isDecompNoAlgorithmic(norm16)) { |
|
1539 c=mapAlgorithmic(c, norm16); |
|
1540 } else { |
|
1541 // c decomposes, get everything from the variable-length extra data |
|
1542 const uint16_t *mapping=getMapping(norm16); |
|
1543 uint16_t firstUnit=*mapping; |
|
1544 if((firstUnit&MAPPING_LENGTH_MASK)==0) { |
|
1545 // A character that is deleted (maps to an empty string) must |
|
1546 // get the worst-case lccc and tccc values because arbitrary |
|
1547 // characters on both sides will become adjacent. |
|
1548 return 0x1ff; |
|
1549 } else { |
|
1550 norm16=firstUnit>>8; // tccc |
|
1551 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { |
|
1552 norm16|=*(mapping-1)&0xff00; // lccc |
|
1553 } |
|
1554 return norm16; |
|
1555 } |
|
1556 } |
|
1557 } |
|
1558 } |
|
1559 |
|
1560 // Dual functionality: |
|
1561 // buffer!=NULL: normalize |
|
1562 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes |
|
// Parameters:
//   src, limit - the input text [src, limit[; limit==NULL means that src is NUL-terminated.
//   buffer     - receives the FCD output, or NULL to only check the text.
//   errorCode  - ICU error code, passed through to the helper functions.
// Returns a pointer into the input text:
//   - src as advanced so far: the end of the text on normal completion, or the position reached
//     when the low-prefix copy failed or an append/decomposeShort() call returned FALSE;
//   - prevBoundary if buffer==NULL and the text fails the FCD check (quick check "no").
1563 const UChar * |
|
1564 Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit, |
|
1565 ReorderingBuffer *buffer, |
|
1566 UErrorCode &errorCode) const { |
|
1567 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. |
|
1568 // Similar to the prevBoundary in the compose() implementation. |
|
1569 const UChar *prevBoundary=src; |
|
1570 int32_t prevFCD16=0; |
|
1571 if(limit==NULL) { |
|
1572 src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode); |
|
1573 if(U_FAILURE(errorCode)) { |
|
1574 return src; |
|
1575 } |
|
1576 if(prevBoundary<src) { |
|
1577 prevBoundary=src; |
|
1578 // We know that the previous character's lccc==0. |
|
1579 // Fetching the fcd16 value was deferred for this below-U+0300 code point. |
|
1580 prevFCD16=getFCD16(*(src-1)); |
|
1581 if(prevFCD16>1) { |
|
1582 --prevBoundary; |
|
1583 } |
|
1584 } |
|
1585 limit=u_strchr(src, 0); |
|
1586 } |
|
1587 |
|
1588 // Note: In this function we use buffer->appendZeroCC() because we track |
|
1589 // the lead and trail combining classes here, rather than leaving it to |
|
1590 // the ReorderingBuffer. |
|
1591 // The exception is the call to decomposeShort() which uses the buffer |
|
1592 // in the normal way. |
|
1593 |
|
1594 const UChar *prevSrc; |
|
1595 UChar32 c=0; |
|
1596 uint16_t fcd16=0; |
|
1597 |
|
1598 for(;;) { |
|
1599 // count code units with lccc==0 |
|
1600 for(prevSrc=src; src!=limit;) { |
|
1601 if((c=*src)<MIN_CCC_LCCC_CP) { |
|
// Defer the FCD lookup: store the bit-inverted code unit, which is negative.
// The "prevFCD16<0" branch below recovers the code point and fetches its value.
1602 prevFCD16=~c; |
|
1603 ++src; |
|
1604 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { |
|
1605 prevFCD16=0; |
|
1606 ++src; |
|
1607 } else { |
|
1608 if(U16_IS_SURROGATE(c)) { |
|
1609 UChar c2; |
|
1610 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1611 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { |
|
1612 c=U16_GET_SUPPLEMENTARY(c, c2); |
|
1613 } |
|
1614 } else /* trail surrogate */ { |
|
1615 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) { |
|
1616 --src; |
|
1617 c=U16_GET_SUPPLEMENTARY(c2, c); |
|
1618 } |
|
1619 } |
|
1620 } |
|
// fcd16<=0xff means that the lead combining class (upper byte) is 0.
1621 if((fcd16=getFCD16FromNormData(c))<=0xff) { |
|
1622 prevFCD16=fcd16; |
|
1623 src+=U16_LENGTH(c); |
|
1624 } else { |
|
1625 break; |
|
1626 } |
|
1627 } |
|
1628 } |
|
1629 // copy these code units all at once |
|
1630 if(src!=prevSrc) { |
|
1631 if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) { |
|
1632 break; |
|
1633 } |
|
1634 if(src==limit) { |
|
1635 break; |
|
1636 } |
|
1637 prevBoundary=src; |
|
1638 // We know that the previous character's lccc==0. |
|
1639 if(prevFCD16<0) { |
|
1640 // Fetching the fcd16 value was deferred for this below-U+0300 code point. |
|
1641 UChar32 prev=~prevFCD16; |
|
1642 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); |
|
1643 if(prevFCD16>1) { |
|
1644 --prevBoundary; |
|
1645 } |
|
1646 } else { |
|
1647 const UChar *p=src-1; |
|
1648 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { |
|
1649 --p; |
|
1650 // Need to fetch the previous character's FCD value because |
|
1651 // prevFCD16 was just for the trail surrogate code point. |
|
1652 prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1])); |
|
1653 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. |
|
1654 } |
|
1655 if(prevFCD16>1) { |
|
1656 prevBoundary=p; |
|
1657 } |
|
1658 } |
|
1659 // The start of the current character (c). |
|
1660 prevSrc=src; |
|
1661 } else if(src==limit) { |
|
1662 break; |
|
1663 } |
|
1664 |
|
1665 src+=U16_LENGTH(c); |
|
1666 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. |
|
1667 // Check for proper order, and decompose locally if necessary. |
|
1668 if((prevFCD16&0xff)<=(fcd16>>8)) { |
|
1669 // proper order: prev tccc <= current lccc |
|
1670 if((fcd16&0xff)<=1) { |
|
1671 prevBoundary=src; |
|
1672 } |
|
1673 if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { |
|
1674 break; |
|
1675 } |
|
1676 prevFCD16=fcd16; |
|
1677 continue; |
|
1678 } else if(buffer==NULL) { |
|
1679 return prevBoundary; // quick check "no" |
|
1680 } else { |
|
1681 /* |
|
1682 * Back out the part of the source that we copied or appended |
|
1683 * already but is now going to be decomposed. |
|
1684 * prevSrc is set to after what was copied/appended. |
|
1685 */ |
|
1686 buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); |
|
1687 /* |
|
1688 * Find the part of the source that needs to be decomposed, |
|
1689 * up to the next safe boundary. |
|
1690 */ |
|
1691 src=findNextFCDBoundary(src, limit); |
|
1692 /* |
|
1693 * The source text does not fulfill the conditions for FCD. |
|
1694 * Decompose and reorder a limited piece of the text. |
|
1695 */ |
|
1696 if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) { |
|
1697 break; |
|
1698 } |
|
1699 prevBoundary=src; |
|
1700 prevFCD16=0; |
|
1701 } |
|
1702 } |
|
1703 return src; |
|
1704 } |
|
1705 |
|
1706 void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit, |
|
1707 UBool doMakeFCD, |
|
1708 UnicodeString &safeMiddle, |
|
1709 ReorderingBuffer &buffer, |
|
1710 UErrorCode &errorCode) const { |
|
1711 if(!buffer.isEmpty()) { |
|
1712 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit); |
|
1713 if(src!=firstBoundaryInSrc) { |
|
1714 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), |
|
1715 buffer.getLimit()); |
|
1716 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); |
|
1717 UnicodeString middle(lastBoundaryInDest, destSuffixLength); |
|
1718 buffer.removeSuffix(destSuffixLength); |
|
1719 safeMiddle=middle; |
|
1720 middle.append(src, (int32_t)(firstBoundaryInSrc-src)); |
|
1721 const UChar *middleStart=middle.getBuffer(); |
|
1722 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); |
|
1723 if(U_FAILURE(errorCode)) { |
|
1724 return; |
|
1725 } |
|
1726 src=firstBoundaryInSrc; |
|
1727 } |
|
1728 } |
|
1729 if(doMakeFCD) { |
|
1730 makeFCD(src, limit, &buffer, errorCode); |
|
1731 } else { |
|
1732 if(limit==NULL) { // appendZeroCC() needs limit!=NULL |
|
1733 limit=u_strchr(src, 0); |
|
1734 } |
|
1735 buffer.appendZeroCC(src, limit, errorCode); |
|
1736 } |
|
1737 } |
|
1738 |
|
1739 const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { |
|
1740 while(start<p && previousFCD16(start, p)>0xff) {} |
|
1741 return p; |
|
1742 } |
|
1743 |
|
1744 const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { |
|
1745 while(p<limit) { |
|
1746 const UChar *codePointStart=p; |
|
1747 if(nextFCD16(p, limit)<=0xff) { |
|
1748 return codePointStart; |
|
1749 } |
|
1750 } |
|
1751 return p; |
|
1752 } |
|
1753 |
|
1754 // CanonicalIterator data -------------------------------------------------- *** |
|
1755 |
|
// Constructs empty CanonicalIterator data: opens a new trie with utrie2_open(0, 0, &errorCode)
// and creates the canonStartSets vector with uprv_deleteUObject (presumably its element
// deleter, so that the vector owns the objects added to it -- verify in uvector.h).
// Failures are reported only through errorCode, which the caller must check.
1756 CanonIterData::CanonIterData(UErrorCode &errorCode) : |
|
1757 trie(utrie2_open(0, 0, &errorCode)), |
|
1758 canonStartSets(uprv_deleteUObject, NULL, errorCode) {} |
|
1759 |
|
// Closes the trie that the constructor opened with utrie2_open().
// The canonStartSets member is destroyed implicitly.
1760 CanonIterData::~CanonIterData() { |
|
1761 utrie2_close(trie); |
|
1762 } |
|
1763 |
|
1764 void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { |
|
1765 uint32_t canonValue=utrie2_get32(trie, decompLead); |
|
1766 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { |
|
1767 // origin is the first character whose decomposition starts with |
|
1768 // the character for which we are setting the value. |
|
1769 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode); |
|
1770 } else { |
|
1771 // origin is not the first character, or it is U+0000. |
|
1772 UnicodeSet *set; |
|
1773 if((canonValue&CANON_HAS_SET)==0) { |
|
1774 set=new UnicodeSet; |
|
1775 if(set==NULL) { |
|
1776 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
1777 return; |
|
1778 } |
|
1779 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); |
|
1780 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); |
|
1781 utrie2_set32(trie, decompLead, canonValue, &errorCode); |
|
1782 canonStartSets.addElement(set, errorCode); |
|
1783 if(firstOrigin!=0) { |
|
1784 set->add(firstOrigin); |
|
1785 } |
|
1786 } else { |
|
1787 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; |
|
1788 } |
|
1789 set->add(origin); |
|
1790 } |
|
1791 } |
|
1792 |
|
1793 U_CDECL_BEGIN |
|
1794 |
|
1795 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters. |
|
1796 // context: the Normalizer2Impl |
|
1797 static UBool U_CALLCONV |
|
1798 enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { |
|
1799 UErrorCode errorCode = U_ZERO_ERROR; |
|
1800 if (value != 0) { |
|
1801 Normalizer2Impl *impl = (Normalizer2Impl *)context; |
|
1802 impl->makeCanonIterDataFromNorm16( |
|
1803 start, end, (uint16_t)value, *impl->fCanonIterData, errorCode); |
|
1804 } |
|
1805 return U_SUCCESS(errorCode); |
|
1806 } |
|
1807 |
|
1808 |
|
1809 |
|
1810 // UInitOnce instantiation function for CanonIterData |
|
1811 |
|
1812 static void U_CALLCONV |
|
1813 initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { |
|
1814 U_ASSERT(impl->fCanonIterData == NULL); |
|
1815 impl->fCanonIterData = new CanonIterData(errorCode); |
|
1816 if (impl->fCanonIterData == NULL) { |
|
1817 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
1818 } |
|
1819 if (U_SUCCESS(errorCode)) { |
|
1820 utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl); |
|
1821 utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode); |
|
1822 } |
|
1823 if (U_FAILURE(errorCode)) { |
|
1824 delete impl->fCanonIterData; |
|
1825 impl->fCanonIterData = NULL; |
|
1826 } |
|
1827 } |
|
1828 |
|
1829 U_CDECL_END |
|
1830 |
|
1831 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, |
|
1832 CanonIterData &newData, |
|
1833 UErrorCode &errorCode) const { |
|
1834 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) { |
|
1835 // Inert, or 2-way mapping (including Hangul syllable). |
|
1836 // We do not write a canonStartSet for any yesNo character. |
|
1837 // Composites from 2-way mappings are added at runtime from the |
|
1838 // starter's compositions list, and the other characters in |
|
1839 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are |
|
1840 // "maybe" characters. |
|
1841 return; |
|
1842 } |
|
1843 for(UChar32 c=start; c<=end; ++c) { |
|
1844 uint32_t oldValue=utrie2_get32(newData.trie, c); |
|
1845 uint32_t newValue=oldValue; |
|
1846 if(norm16>=minMaybeYes) { |
|
1847 // not a segment starter if it occurs in a decomposition or has cc!=0 |
|
1848 newValue|=CANON_NOT_SEGMENT_STARTER; |
|
1849 if(norm16<MIN_NORMAL_MAYBE_YES) { |
|
1850 newValue|=CANON_HAS_COMPOSITIONS; |
|
1851 } |
|
1852 } else if(norm16<minYesNo) { |
|
1853 newValue|=CANON_HAS_COMPOSITIONS; |
|
1854 } else { |
|
1855 // c has a one-way decomposition |
|
1856 UChar32 c2=c; |
|
1857 uint16_t norm16_2=norm16; |
|
1858 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) { |
|
1859 c2=mapAlgorithmic(c2, norm16_2); |
|
1860 norm16_2=getNorm16(c2); |
|
1861 } |
|
1862 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) { |
|
1863 // c decomposes, get everything from the variable-length extra data |
|
1864 const uint16_t *mapping=getMapping(norm16_2); |
|
1865 uint16_t firstUnit=*mapping; |
|
1866 int32_t length=firstUnit&MAPPING_LENGTH_MASK; |
|
1867 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { |
|
1868 if(c==c2 && (*(mapping-1)&0xff)!=0) { |
|
1869 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 |
|
1870 } |
|
1871 } |
|
1872 // Skip empty mappings (no characters in the decomposition). |
|
1873 if(length!=0) { |
|
1874 ++mapping; // skip over the firstUnit |
|
1875 // add c to first code point's start set |
|
1876 int32_t i=0; |
|
1877 U16_NEXT_UNSAFE(mapping, i, c2); |
|
1878 newData.addToStartSet(c, c2, errorCode); |
|
1879 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a |
|
1880 // one-way mapping. A 2-way mapping is possible here after |
|
1881 // intermediate algorithmic mapping. |
|
1882 if(norm16_2>=minNoNo) { |
|
1883 while(i<length) { |
|
1884 U16_NEXT_UNSAFE(mapping, i, c2); |
|
1885 uint32_t c2Value=utrie2_get32(newData.trie, c2); |
|
1886 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { |
|
1887 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER, |
|
1888 &errorCode); |
|
1889 } |
|
1890 } |
|
1891 } |
|
1892 } |
|
1893 } else { |
|
1894 // c decomposed to c2 algorithmically; c has cc==0 |
|
1895 newData.addToStartSet(c, c2, errorCode); |
|
1896 } |
|
1897 } |
|
1898 if(newValue!=oldValue) { |
|
1899 utrie2_set32(newData.trie, c, newValue, &errorCode); |
|
1900 } |
|
1901 } |
|
1902 } |
|
1903 |
|
1904 UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { |
|
1905 // Logically const: Synchronized instantiation. |
|
1906 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this); |
|
1907 umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode); |
|
1908 return U_SUCCESS(errorCode); |
|
1909 } |
|
1910 |
|
1911 int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { |
|
1912 return (int32_t)utrie2_get32(fCanonIterData->trie, c); |
|
1913 } |
|
1914 |
|
1915 const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { |
|
1916 return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; |
|
1917 } |
|
1918 |
|
1919 UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { |
|
1920 return getCanonValue(c)>=0; |
|
1921 } |
|
1922 |
|
1923 UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { |
|
1924 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER; |
|
1925 if(canonValue==0) { |
|
1926 return FALSE; |
|
1927 } |
|
1928 set.clear(); |
|
1929 int32_t value=canonValue&CANON_VALUE_MASK; |
|
1930 if((canonValue&CANON_HAS_SET)!=0) { |
|
1931 set.addAll(getCanonStartSet(value)); |
|
1932 } else if(value!=0) { |
|
1933 set.add(value); |
|
1934 } |
|
1935 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { |
|
1936 uint16_t norm16=getNorm16(c); |
|
1937 if(norm16==JAMO_L) { |
|
1938 UChar32 syllable= |
|
1939 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); |
|
1940 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); |
|
1941 } else { |
|
1942 addComposites(getCompositionsList(norm16), set); |
|
1943 } |
|
1944 } |
|
1945 return TRUE; |
|
1946 } |
|
1947 |
|
1948 U_NAMESPACE_END |
|
1949 |
|
1950 // Normalizer2 data swapping ----------------------------------------------- *** |
|
1951 |
|
1952 U_NAMESPACE_USE |
|
1953 |
|
1954 U_CAPI int32_t U_EXPORT2 |
|
1955 unorm2_swap(const UDataSwapper *ds, |
|
1956 const void *inData, int32_t length, void *outData, |
|
1957 UErrorCode *pErrorCode) { |
|
1958 const UDataInfo *pInfo; |
|
1959 int32_t headerSize; |
|
1960 |
|
1961 const uint8_t *inBytes; |
|
1962 uint8_t *outBytes; |
|
1963 |
|
1964 const int32_t *inIndexes; |
|
1965 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1]; |
|
1966 |
|
1967 int32_t i, offset, nextOffset, size; |
|
1968 |
|
1969 /* udata_swapDataHeader checks the arguments */ |
|
1970 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
|
1971 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
1972 return 0; |
|
1973 } |
|
1974 |
|
1975 /* check data format and format version */ |
|
1976 pInfo=(const UDataInfo *)((const char *)inData+4); |
|
1977 if(!( |
|
1978 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ |
|
1979 pInfo->dataFormat[1]==0x72 && |
|
1980 pInfo->dataFormat[2]==0x6d && |
|
1981 pInfo->dataFormat[3]==0x32 && |
|
1982 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2) |
|
1983 )) { |
|
1984 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", |
|
1985 pInfo->dataFormat[0], pInfo->dataFormat[1], |
|
1986 pInfo->dataFormat[2], pInfo->dataFormat[3], |
|
1987 pInfo->formatVersion[0]); |
|
1988 *pErrorCode=U_UNSUPPORTED_ERROR; |
|
1989 return 0; |
|
1990 } |
|
1991 |
|
1992 inBytes=(const uint8_t *)inData+headerSize; |
|
1993 outBytes=(uint8_t *)outData+headerSize; |
|
1994 |
|
1995 inIndexes=(const int32_t *)inBytes; |
|
1996 |
|
1997 if(length>=0) { |
|
1998 length-=headerSize; |
|
1999 if(length<(int32_t)sizeof(indexes)) { |
|
2000 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n", |
|
2001 length); |
|
2002 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
2003 return 0; |
|
2004 } |
|
2005 } |
|
2006 |
|
2007 /* read the first few indexes */ |
|
2008 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) { |
|
2009 indexes[i]=udata_readInt32(ds, inIndexes[i]); |
|
2010 } |
|
2011 |
|
2012 /* get the total length of the data */ |
|
2013 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; |
|
2014 |
|
2015 if(length>=0) { |
|
2016 if(length<size) { |
|
2017 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n", |
|
2018 length); |
|
2019 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
2020 return 0; |
|
2021 } |
|
2022 |
|
2023 /* copy the data for inaccessible bytes */ |
|
2024 if(inBytes!=outBytes) { |
|
2025 uprv_memcpy(outBytes, inBytes, size); |
|
2026 } |
|
2027 |
|
2028 offset=0; |
|
2029 |
|
2030 /* swap the int32_t indexes[] */ |
|
2031 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]; |
|
2032 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode); |
|
2033 offset=nextOffset; |
|
2034 |
|
2035 /* swap the UTrie2 */ |
|
2036 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]; |
|
2037 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); |
|
2038 offset=nextOffset; |
|
2039 |
|
2040 /* swap the uint16_t extraData[] */ |
|
2041 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]; |
|
2042 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode); |
|
2043 offset=nextOffset; |
|
2044 |
|
2045 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */ |
|
2046 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1]; |
|
2047 offset=nextOffset; |
|
2048 |
|
2049 U_ASSERT(offset==size); |
|
2050 } |
|
2051 |
|
2052 return headerSize+size; |
|
2053 } |
|
2054 |
|
2055 #endif // !UCONFIG_NO_NORMALIZATION |