1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/normalizer2impl.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,779 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2009-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: normalizer2impl.h 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2009nov22 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#ifndef __NORMALIZER2IMPL_H__ 1.21 +#define __NORMALIZER2IMPL_H__ 1.22 + 1.23 +#include "unicode/utypes.h" 1.24 + 1.25 +#if !UCONFIG_NO_NORMALIZATION 1.26 + 1.27 +#include "unicode/normalizer2.h" 1.28 +#include "unicode/udata.h" 1.29 +#include "unicode/unistr.h" 1.30 +#include "unicode/unorm.h" 1.31 +#include "unicode/utf16.h" 1.32 +#include "mutex.h" 1.33 +#include "uset_imp.h" 1.34 +#include "utrie2.h" 1.35 + 1.36 +U_NAMESPACE_BEGIN 1.37 + 1.38 +struct CanonIterData; 1.39 + 1.40 +class Hangul { 1.41 +public: 1.42 + /* Korean Hangul and Jamo constants */ 1.43 + enum { 1.44 + JAMO_L_BASE=0x1100, /* "lead" jamo */ 1.45 + JAMO_V_BASE=0x1161, /* "vowel" jamo */ 1.46 + JAMO_T_BASE=0x11a7, /* "trail" jamo */ 1.47 + 1.48 + HANGUL_BASE=0xac00, 1.49 + 1.50 + JAMO_L_COUNT=19, 1.51 + JAMO_V_COUNT=21, 1.52 + JAMO_T_COUNT=28, 1.53 + 1.54 + JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, 1.55 + 1.56 + HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, 1.57 + HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT 1.58 + }; 1.59 + 1.60 + static inline UBool isHangul(UChar32 c) { 1.61 + return HANGUL_BASE<=c && c<HANGUL_LIMIT; 1.62 + } 1.63 + static inline UBool 1.64 + isHangulWithoutJamoT(UChar c) { 1.65 + c-=HANGUL_BASE; 1.66 + return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 1.67 + } 1.68 + static inline UBool isJamoL(UChar32 c) { 1.69 + 
return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT; 1.70 + } 1.71 + static inline UBool isJamoV(UChar32 c) { 1.72 + return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT; 1.73 + } 1.74 + 1.75 + /** 1.76 + * Decomposes c, which must be a Hangul syllable, into buffer 1.77 + * and returns the length of the decomposition (2 or 3). 1.78 + */ 1.79 + static inline int32_t decompose(UChar32 c, UChar buffer[3]) { 1.80 + c-=HANGUL_BASE; 1.81 + UChar32 c2=c%JAMO_T_COUNT; 1.82 + c/=JAMO_T_COUNT; 1.83 + buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 1.84 + buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 1.85 + if(c2==0) { 1.86 + return 2; 1.87 + } else { 1.88 + buffer[2]=(UChar)(JAMO_T_BASE+c2); 1.89 + return 3; 1.90 + } 1.91 + } 1.92 + 1.93 + /** 1.94 + * Decomposes c, which must be a Hangul syllable, into buffer. 1.95 + * This is the raw, not recursive, decomposition. Its length is always 2. 1.96 + */ 1.97 + static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) { 1.98 + UChar32 orig=c; 1.99 + c-=HANGUL_BASE; 1.100 + UChar32 c2=c%JAMO_T_COUNT; 1.101 + if(c2==0) { 1.102 + c/=JAMO_T_COUNT; 1.103 + buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); 1.104 + buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); 1.105 + } else { 1.106 + buffer[0]=orig-c2; // LV syllable 1.107 + buffer[1]=(UChar)(JAMO_T_BASE+c2); 1.108 + } 1.109 + } 1.110 +private: 1.111 + Hangul(); // no instantiation 1.112 +}; 1.113 + 1.114 +class Normalizer2Impl; 1.115 + 1.116 +class ReorderingBuffer : public UMemory { 1.117 +public: 1.118 + ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) : 1.119 + impl(ni), str(dest), 1.120 + start(NULL), reorderStart(NULL), limit(NULL), 1.121 + remainingCapacity(0), lastCC(0) {} 1.122 + ~ReorderingBuffer() { 1.123 + if(start!=NULL) { 1.124 + str.releaseBuffer((int32_t)(limit-start)); 1.125 + } 1.126 + } 1.127 + UBool init(int32_t destCapacity, UErrorCode &errorCode); 1.128 + 1.129 + UBool isEmpty() const { return start==limit; } 1.130 + int32_t length() const { return 
(int32_t)(limit-start); } 1.131 + UChar *getStart() { return start; } 1.132 + UChar *getLimit() { return limit; } 1.133 + uint8_t getLastCC() const { return lastCC; } 1.134 + 1.135 + UBool equals(const UChar *start, const UChar *limit) const; 1.136 + 1.137 + // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 1.138 + void setLastChar(UChar c) { 1.139 + *(limit-1)=c; 1.140 + } 1.141 + 1.142 + UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) { 1.143 + return (c<=0xffff) ? 1.144 + appendBMP((UChar)c, cc, errorCode) : 1.145 + appendSupplementary(c, cc, errorCode); 1.146 + } 1.147 + // s must be in NFD, otherwise change the implementation. 1.148 + UBool append(const UChar *s, int32_t length, 1.149 + uint8_t leadCC, uint8_t trailCC, 1.150 + UErrorCode &errorCode); 1.151 + UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) { 1.152 + if(remainingCapacity==0 && !resize(1, errorCode)) { 1.153 + return FALSE; 1.154 + } 1.155 + if(lastCC<=cc || cc==0) { 1.156 + *limit++=c; 1.157 + lastCC=cc; 1.158 + if(cc<=1) { 1.159 + reorderStart=limit; 1.160 + } 1.161 + } else { 1.162 + insert(c, cc); 1.163 + } 1.164 + --remainingCapacity; 1.165 + return TRUE; 1.166 + } 1.167 + UBool appendZeroCC(UChar32 c, UErrorCode &errorCode); 1.168 + UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode); 1.169 + void remove(); 1.170 + void removeSuffix(int32_t suffixLength); 1.171 + void setReorderingLimit(UChar *newLimit) { 1.172 + remainingCapacity+=(int32_t)(limit-newLimit); 1.173 + reorderStart=limit=newLimit; 1.174 + lastCC=0; 1.175 + } 1.176 + void copyReorderableSuffixTo(UnicodeString &s) const { 1.177 + s.setTo(reorderStart, (int32_t)(limit-reorderStart)); 1.178 + } 1.179 +private: 1.180 + /* 1.181 + * TODO: Revisit whether it makes sense to track reorderStart. 1.182 + * It is set to after the last known character with cc<=1, 1.183 + * which stops previousCC() before it reads that character and looks up its cc. 
1.184 + * previousCC() is normally only called from insert(). 1.185 + * In other words, reorderStart speeds up the insertion of a combining mark 1.186 + * into a multi-combining mark sequence where it does not belong at the end. 1.187 + * This might not be worth the trouble. 1.188 + * On the other hand, it's not a huge amount of trouble. 1.189 + * 1.190 + * We probably need it for UNORM_SIMPLE_APPEND. 1.191 + */ 1.192 + 1.193 + UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode); 1.194 + void insert(UChar32 c, uint8_t cc); 1.195 + static void writeCodePoint(UChar *p, UChar32 c) { 1.196 + if(c<=0xffff) { 1.197 + *p=(UChar)c; 1.198 + } else { 1.199 + p[0]=U16_LEAD(c); 1.200 + p[1]=U16_TRAIL(c); 1.201 + } 1.202 + } 1.203 + UBool resize(int32_t appendLength, UErrorCode &errorCode); 1.204 + 1.205 + const Normalizer2Impl &impl; 1.206 + UnicodeString &str; 1.207 + UChar *start, *reorderStart, *limit; 1.208 + int32_t remainingCapacity; 1.209 + uint8_t lastCC; 1.210 + 1.211 + // private backward iterator 1.212 + void setIterator() { codePointStart=limit; } 1.213 + void skipPrevious(); // Requires start<codePointStart. 1.214 + uint8_t previousCC(); // Returns 0 if there is no previous character. 
1.215 + 1.216 + UChar *codePointStart, *codePointLimit; 1.217 +}; 1.218 + 1.219 +class U_COMMON_API Normalizer2Impl : public UMemory { 1.220 +public: 1.221 + Normalizer2Impl() : memory(NULL), normTrie(NULL), fCanonIterData(NULL) { 1.222 + fCanonIterDataInitOnce.reset(); 1.223 + } 1.224 + ~Normalizer2Impl(); 1.225 + 1.226 + void load(const char *packageName, const char *name, UErrorCode &errorCode); 1.227 + 1.228 + void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 1.229 + void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const; 1.230 + 1.231 + // low-level properties ------------------------------------------------ *** 1.232 + 1.233 + const UTrie2 *getNormTrie() const { return normTrie; } 1.234 + 1.235 + UBool ensureCanonIterData(UErrorCode &errorCode) const; 1.236 + 1.237 + uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); } 1.238 + 1.239 + UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { 1.240 + if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 1.241 + return UNORM_YES; 1.242 + } else if(minMaybeYes<=norm16) { 1.243 + return UNORM_MAYBE; 1.244 + } else { 1.245 + return UNORM_NO; 1.246 + } 1.247 + } 1.248 + UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; } 1.249 + UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; } 1.250 + 1.251 + uint8_t getCC(uint16_t norm16) const { 1.252 + if(norm16>=MIN_NORMAL_MAYBE_YES) { 1.253 + return (uint8_t)norm16; 1.254 + } 1.255 + if(norm16<minNoNo || limitNoNo<=norm16) { 1.256 + return 0; 1.257 + } 1.258 + return getCCFromNoNo(norm16); 1.259 + } 1.260 + static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { 1.261 + return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0; 1.262 + } 1.263 + 1.264 + /** 1.265 + * Returns the FCD data for code point c. 1.266 + * @param c A Unicode code point. 1.267 + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 
1.268 + */ 1.269 + uint16_t getFCD16(UChar32 c) const { 1.270 + if(c<0) { 1.271 + return 0; 1.272 + } else if(c<0x180) { 1.273 + return tccc180[c]; 1.274 + } else if(c<=0xffff) { 1.275 + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 1.276 + } 1.277 + return getFCD16FromNormData(c); 1.278 + } 1.279 + /** 1.280 + * Returns the FCD data for the next code point (post-increment). 1.281 + * Might skip only a lead surrogate rather than the whole surrogate pair if none of 1.282 + * the supplementary code points associated with the lead surrogate have non-zero FCD data. 1.283 + * @param s A valid pointer into a string. Requires s!=limit. 1.284 + * @param limit The end of the string, or NULL. 1.285 + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 1.286 + */ 1.287 + uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { 1.288 + UChar32 c=*s++; 1.289 + if(c<0x180) { 1.290 + return tccc180[c]; 1.291 + } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1.292 + return 0; 1.293 + } 1.294 + UChar c2; 1.295 + if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { 1.296 + c=U16_GET_SUPPLEMENTARY(c, c2); 1.297 + ++s; 1.298 + } 1.299 + return getFCD16FromNormData(c); 1.300 + } 1.301 + /** 1.302 + * Returns the FCD data for the previous code point (pre-decrement). 1.303 + * @param start The start of the string. 1.304 + * @param s A valid pointer into a string. Requires start<s. 1.305 + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 
1.306 + */ 1.307 + uint16_t previousFCD16(const UChar *start, const UChar *&s) const { 1.308 + UChar32 c=*--s; 1.309 + if(c<0x180) { 1.310 + return tccc180[c]; 1.311 + } 1.312 + if(!U16_IS_TRAIL(c)) { 1.313 + if(!singleLeadMightHaveNonZeroFCD16(c)) { 1.314 + return 0; 1.315 + } 1.316 + } else { 1.317 + UChar c2; 1.318 + if(start<s && U16_IS_LEAD(c2=*(s-1))) { 1.319 + c=U16_GET_SUPPLEMENTARY(c2, c); 1.320 + --s; 1.321 + } 1.322 + } 1.323 + return getFCD16FromNormData(c); 1.324 + } 1.325 + 1.326 + /** Returns the FCD data for U+0000<=c<U+0180. */ 1.327 + uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; } 1.328 + /** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */ 1.329 + UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const { 1.330 + // 0<=lead<=0xffff 1.331 + uint8_t bits=smallFCD[lead>>8]; 1.332 + if(bits==0) { return false; } 1.333 + return (UBool)((bits>>((lead>>5)&7))&1); 1.334 + } 1.335 + /** Returns the FCD value from the regular normalization data. */ 1.336 + uint16_t getFCD16FromNormData(UChar32 c) const; 1.337 + 1.338 + void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, 1.339 + CanonIterData &newData, UErrorCode &errorCode) const; 1.340 + 1.341 + /** 1.342 + * Gets the decomposition for one code point. 1.343 + * @param c code point 1.344 + * @param buffer out-only buffer for algorithmic decompositions 1.345 + * @param length out-only, takes the length of the decomposition, if any 1.346 + * @return pointer to the decomposition, or NULL if none 1.347 + */ 1.348 + const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; 1.349 + 1.350 + /** 1.351 + * Gets the raw decomposition for one code point. 
1.352 + * @param c code point 1.353 + * @param buffer out-only buffer for algorithmic decompositions 1.354 + * @param length out-only, takes the length of the decomposition, if any 1.355 + * @return pointer to the decomposition, or NULL if none 1.356 + */ 1.357 + const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; 1.358 + 1.359 + UChar32 composePair(UChar32 a, UChar32 b) const; 1.360 + 1.361 + UBool isCanonSegmentStarter(UChar32 c) const; 1.362 + UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; 1.363 + 1.364 + enum { 1.365 + MIN_CCC_LCCC_CP=0x300 1.366 + }; 1.367 + 1.368 + enum { 1.369 + MIN_YES_YES_WITH_CC=0xff01, 1.370 + JAMO_VT=0xff00, 1.371 + MIN_NORMAL_MAYBE_YES=0xfe00, 1.372 + JAMO_L=1, 1.373 + MAX_DELTA=0x40 1.374 + }; 1.375 + 1.376 + enum { 1.377 + // Byte offsets from the start of the data, after the generic header. 1.378 + IX_NORM_TRIE_OFFSET, 1.379 + IX_EXTRA_DATA_OFFSET, 1.380 + IX_SMALL_FCD_OFFSET, 1.381 + IX_RESERVED3_OFFSET, 1.382 + IX_RESERVED4_OFFSET, 1.383 + IX_RESERVED5_OFFSET, 1.384 + IX_RESERVED6_OFFSET, 1.385 + IX_TOTAL_SIZE, 1.386 + 1.387 + // Code point thresholds for quick check codes. 1.388 + IX_MIN_DECOMP_NO_CP, 1.389 + IX_MIN_COMP_NO_MAYBE_CP, 1.390 + 1.391 + // Norm16 value thresholds for quick check combinations and types of extra data. 1.392 + IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 1.393 + IX_MIN_NO_NO, 1.394 + IX_LIMIT_NO_NO, 1.395 + IX_MIN_MAYBE_YES, 1.396 + 1.397 + IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[. 
1.398 + 1.399 + IX_RESERVED15, 1.400 + IX_COUNT 1.401 + }; 1.402 + 1.403 + enum { 1.404 + MAPPING_HAS_CCC_LCCC_WORD=0x80, 1.405 + MAPPING_HAS_RAW_MAPPING=0x40, 1.406 + MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, 1.407 + MAPPING_LENGTH_MASK=0x1f 1.408 + }; 1.409 + 1.410 + enum { 1.411 + COMP_1_LAST_TUPLE=0x8000, 1.412 + COMP_1_TRIPLE=1, 1.413 + COMP_1_TRAIL_LIMIT=0x3400, 1.414 + COMP_1_TRAIL_MASK=0x7ffe, 1.415 + COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit 1.416 + COMP_2_TRAIL_SHIFT=6, 1.417 + COMP_2_TRAIL_MASK=0xffc0 1.418 + }; 1.419 + 1.420 + // higher-level functionality ------------------------------------------ *** 1.421 + 1.422 + const UChar *decompose(const UChar *src, const UChar *limit, 1.423 + ReorderingBuffer *buffer, UErrorCode &errorCode) const; 1.424 + void decomposeAndAppend(const UChar *src, const UChar *limit, 1.425 + UBool doDecompose, 1.426 + UnicodeString &safeMiddle, 1.427 + ReorderingBuffer &buffer, 1.428 + UErrorCode &errorCode) const; 1.429 + UBool compose(const UChar *src, const UChar *limit, 1.430 + UBool onlyContiguous, 1.431 + UBool doCompose, 1.432 + ReorderingBuffer &buffer, 1.433 + UErrorCode &errorCode) const; 1.434 + const UChar *composeQuickCheck(const UChar *src, const UChar *limit, 1.435 + UBool onlyContiguous, 1.436 + UNormalizationCheckResult *pQCResult) const; 1.437 + void composeAndAppend(const UChar *src, const UChar *limit, 1.438 + UBool doCompose, 1.439 + UBool onlyContiguous, 1.440 + UnicodeString &safeMiddle, 1.441 + ReorderingBuffer &buffer, 1.442 + UErrorCode &errorCode) const; 1.443 + const UChar *makeFCD(const UChar *src, const UChar *limit, 1.444 + ReorderingBuffer *buffer, UErrorCode &errorCode) const; 1.445 + void makeFCDAndAppend(const UChar *src, const UChar *limit, 1.446 + UBool doMakeFCD, 1.447 + UnicodeString &safeMiddle, 1.448 + ReorderingBuffer &buffer, 1.449 + UErrorCode &errorCode) const; 1.450 + 1.451 + UBool hasDecompBoundary(UChar32 c, UBool before) const; 1.452 + UBool isDecompInert(UChar32 c) const { 
return isDecompYesAndZeroCC(getNorm16(c)); } 1.453 + 1.454 + UBool hasCompBoundaryBefore(UChar32 c) const { 1.455 + return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); 1.456 + } 1.457 + UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const; 1.458 + 1.459 + UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; } 1.460 + UBool hasFCDBoundaryAfter(UChar32 c) const { 1.461 + uint16_t fcd16=getFCD16(c); 1.462 + return fcd16<=1 || (fcd16&0xff)==0; 1.463 + } 1.464 + UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } 1.465 +private: 1.466 + static UBool U_CALLCONV 1.467 + isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); 1.468 + 1.469 + UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } 1.470 + UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; } 1.471 + static UBool isInert(uint16_t norm16) { return norm16==0; } 1.472 + static UBool isJamoL(uint16_t norm16) { return norm16==1; } 1.473 + static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } 1.474 + UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; } 1.475 + UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } 1.476 + // UBool isCompYes(uint16_t norm16) const { 1.477 + // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1.478 + // } 1.479 + // UBool isCompYesOrMaybe(uint16_t norm16) const { 1.480 + // return norm16<minNoNo || minMaybeYes<=norm16; 1.481 + // } 1.482 + // UBool hasZeroCCFromDecompYes(uint16_t norm16) const { 1.483 + // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1.484 + // } 1.485 + UBool isDecompYesAndZeroCC(uint16_t norm16) const { 1.486 + return norm16<minYesNo || 1.487 + norm16==JAMO_VT || 1.488 + (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1.489 + } 1.490 + /** 1.491 + * A little faster and simpler than isDecompYesAndZeroCC() but 
does not include 1.492 + * the MaybeYes which combine-forward and have ccc=0. 1.493 + * (Standard Unicode 5.2 normalization does not have such characters.) 1.494 + */ 1.495 + UBool isMostDecompYesAndZeroCC(uint16_t norm16) const { 1.496 + return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1.497 + } 1.498 + UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; } 1.499 + 1.500 + // For use with isCompYes(). 1.501 + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1.502 + // static uint8_t getCCFromYes(uint16_t norm16) { 1.503 + // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 1.504 + // } 1.505 + uint8_t getCCFromNoNo(uint16_t norm16) const { 1.506 + const uint16_t *mapping=getMapping(norm16); 1.507 + if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { 1.508 + return (uint8_t)*(mapping-1); 1.509 + } else { 1.510 + return 0; 1.511 + } 1.512 + } 1.513 + // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 1.514 + uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; 1.515 + 1.516 + // Requires algorithmic-NoNo. 1.517 + UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { 1.518 + return c+norm16-(minMaybeYes-MAX_DELTA-1); 1.519 + } 1.520 + 1.521 + // Requires minYesNo<norm16<limitNoNo. 
1.522 + const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; } 1.523 + const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { 1.524 + if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { 1.525 + return NULL; 1.526 + } else if(norm16<minMaybeYes) { 1.527 + return extraData+norm16; // for yesYes; if Jamo L: harmless empty list 1.528 + } else { 1.529 + return maybeYesCompositions+norm16-minMaybeYes; 1.530 + } 1.531 + } 1.532 + const uint16_t *getCompositionsListForComposite(uint16_t norm16) const { 1.533 + const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list 1.534 + return list+ // mapping pointer 1.535 + 1+ // +1 to skip the first unit with the mapping lenth 1.536 + (*list&MAPPING_LENGTH_MASK); // + mapping length 1.537 + } 1.538 + /** 1.539 + * @param c code point must have compositions 1.540 + * @return compositions list pointer 1.541 + */ 1.542 + const uint16_t *getCompositionsList(uint16_t norm16) const { 1.543 + return isDecompYes(norm16) ? 
1.544 + getCompositionsListForDecompYes(norm16) : 1.545 + getCompositionsListForComposite(norm16); 1.546 + } 1.547 + 1.548 + const UChar *copyLowPrefixFromNulTerminated(const UChar *src, 1.549 + UChar32 minNeedDataCP, 1.550 + ReorderingBuffer *buffer, 1.551 + UErrorCode &errorCode) const; 1.552 + UBool decomposeShort(const UChar *src, const UChar *limit, 1.553 + ReorderingBuffer &buffer, UErrorCode &errorCode) const; 1.554 + UBool decompose(UChar32 c, uint16_t norm16, 1.555 + ReorderingBuffer &buffer, UErrorCode &errorCode) const; 1.556 + 1.557 + static int32_t combine(const uint16_t *list, UChar32 trail); 1.558 + void addComposites(const uint16_t *list, UnicodeSet &set) const; 1.559 + void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, 1.560 + UBool onlyContiguous) const; 1.561 + 1.562 + UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const; 1.563 + const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const; 1.564 + const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; 1.565 + 1.566 + const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; 1.567 + const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; 1.568 + 1.569 + int32_t getCanonValue(UChar32 c) const; 1.570 + const UnicodeSet &getCanonStartSet(int32_t n) const; 1.571 + 1.572 + UDataMemory *memory; 1.573 + UVersionInfo dataVersion; 1.574 + 1.575 + // Code point thresholds for quick check codes. 1.576 + UChar32 minDecompNoCP; 1.577 + UChar32 minCompNoMaybeCP; 1.578 + 1.579 + // Norm16 value thresholds for quick check combinations and types of extra data. 
uint16_t minYesNo;
    uint16_t minYesNoMappingsOnly;
    uint16_t minNoNo;
    uint16_t limitNoNo;
    uint16_t minMaybeYes;

    UTrie2 *normTrie;
    const uint16_t *maybeYesCompositions;
    const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
    uint8_t tccc180[0x180];  // tccc values for U+0000..U+017F

  public:  // CanonIterData is public to allow access from C callback functions.
    UInitOnce fCanonIterDataInitOnce;
    CanonIterData *fCanonIterData;
};

// bits in canonIterData
// The two highest bits and bit 21 are flags; CANON_VALUE_MASK covers the low 21 bits.
#define CANON_NOT_SEGMENT_STARTER 0x80000000
#define CANON_HAS_COMPOSITIONS 0x40000000
#define CANON_HAS_SET 0x200000
#define CANON_VALUE_MASK 0x1fffff

/**
 * ICU-internal shortcut for quick access to standard Unicode normalization.
 * All members are static; the class cannot be instantiated.
 */
class U_COMMON_API Normalizer2Factory {
public:
    // One accessor per normalization form; each reports failure via errorCode.
    static const Normalizer2 *getNFCInstance(UErrorCode &errorCode);
    static const Normalizer2 *getNFDInstance(UErrorCode &errorCode);
    static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
    static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
    static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode);
    static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode);
    static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode);
    static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);

    // Selects an instance by UNormalizationMode value.
    static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);

    // Direct access to the Normalizer2Impl objects.
    static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
    static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
    static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);

    // Get the Impl instance of the Normalizer2.
    // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
    static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
private:
    Normalizer2Factory();  // No instantiation.
};

U_NAMESPACE_END

// Data swapping entry point for .nrm data; declared with C linkage.
U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode);

/**
 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
 * @internal
 */
U_CFUNC UNormalizationCheckResult
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);

/**
 * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
 * @internal
 */
U_CFUNC uint16_t
unorm_getFCD16(UChar32 c);

/**
 * Format of Normalizer2 .nrm data files.
 * Format version 2.0.
 *
 * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
 * ICU ships with data files for standard Unicode Normalization Forms
 * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
 * Custom (application-specific) data can be built into additional .nrm files
 * with the gennorm2 build tool.
 *
 * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
 * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
 *
 * A .nrm file begins with a standard ICU data file header
 * (DataHeader, see ucmndata.h and unicode/udata.h).
 * The UDataInfo.dataVersion field usually contains the Unicode version
 * for which the data was generated.
 *
 * After the header, the file contains the following parts.
1.670 + * Constants are defined as enum values of the Normalizer2Impl class. 1.671 + * 1.672 + * Many details of the data structures are described in the design doc 1.673 + * which is at http://site.icu-project.org/design/normalization/custom 1.674 + * 1.675 + * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4; 1.676 + * 1.677 + * The first eight indexes are byte offsets in ascending order. 1.678 + * Each byte offset marks the start of the next part in the data file, 1.679 + * and the end of the previous one. 1.680 + * When two consecutive byte offsets are the same, then the corresponding part is empty. 1.681 + * Byte offsets are offsets from after the header, 1.682 + * that is, from the beginning of the indexes[]. 1.683 + * Each part starts at an offset with proper alignment for its data. 1.684 + * If necessary, the previous part may include padding bytes to achieve this alignment. 1.685 + * 1.686 + * minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point 1.687 + * with a decomposition mapping, that is, with NF*D_QC=No. 1.688 + * minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point 1.689 + * with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward). 1.690 + * 1.691 + * The next five indexes are thresholds of 16-bit trie values for ranges of 1.692 + * values indicating multiple normalization properties. 1.693 + * minYesNo=indexes[IX_MIN_YES_NO]; 1.694 + * minNoNo=indexes[IX_MIN_NO_NO]; 1.695 + * limitNoNo=indexes[IX_LIMIT_NO_NO]; 1.696 + * minMaybeYes=indexes[IX_MIN_MAYBE_YES]; 1.697 + * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 1.698 + * See the normTrie description below and the design doc for details. 1.699 + * 1.700 + * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h 1.701 + * 1.702 + * The trie holds the main normalization data. Each code point is mapped to a 16-bit value. 
1.703 + * Rather than using independent bits in the value (which would require more than 16 bits), 1.704 + * information is extracted primarily via range checks. 1.705 + * For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo 1.706 + * means that the character has NF*C_QC=Yes and NF*D_QC=No properties, 1.707 + * which means it has a two-way (round-trip) decomposition mapping. 1.708 + * Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData 1.709 + * pointing to mappings, compositions lists, or both. 1.710 + * Value norm16==0 means that the character is normalization-inert, that is, 1.711 + * it does not have a mapping, does not participate in composition, has a zero 1.712 + * canonical combining class, and forms a boundary where text before it and after it 1.713 + * can be normalized independently. 1.714 + * For details about how multiple properties are encoded in 16-bit values 1.715 + * see the design doc. 1.716 + * Note that the encoding cannot express all combinations of the properties involved; 1.717 + * it only supports those combinations that are allowed by 1.718 + * the Unicode Normalization algorithms. Details are in the design doc as well. 1.719 + * The gennorm2 tool only builds .nrm files for data that conforms to the limitations. 1.720 + * 1.721 + * The trie has a value for each lead surrogate code unit representing the "worst case" 1.722 + * properties of the 1024 supplementary characters whose UTF-16 form starts with 1.723 + * the lead surrogate. If all of the 1024 supplementary characters are normalization-inert, 1.724 + * then their lead surrogate code unit has the trie value 0. 1.725 + * When the lead surrogate unit's value exceeds the quick check minimum during processing, 1.726 + * the properties for the full supplementary code point need to be looked up. 
1.727 + * 1.728 + * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]; 1.729 + * uint16_t extraData[]; 1.730 + * 1.731 + * There is only one byte offset for the end of these two arrays. 1.732 + * The split between them is given by the constant and variable mentioned above. 1.733 + * 1.734 + * The maybeYesCompositions array contains compositions lists for characters that 1.735 + * combine both forward (as starters in composition pairs) 1.736 + * and backward (as trailing characters in composition pairs). 1.737 + * Such characters do not occur in Unicode 5.2 but are allowed by 1.738 + * the Unicode Normalization algorithms. 1.739 + * If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES 1.740 + * and the maybeYesCompositions array is empty. 1.741 + * If there are such characters, then minMaybeYes is subtracted from their norm16 values 1.742 + * to get the index into this array. 1.743 + * 1.744 + * The extraData array contains compositions lists for "YesYes" characters, 1.745 + * followed by mappings and optional compositions lists for "YesNo" characters, 1.746 + * followed by only mappings for "NoNo" characters. 1.747 + * (Referring to pairs of NFC/NFD quick check values.) 1.748 + * The norm16 values of those characters are directly indexes into the extraData array. 1.749 + * 1.750 + * The data structures for compositions lists and mappings are described in the design doc. 1.751 + * 1.752 + * uint8_t smallFCD[0x100]; -- new in format version 2 1.753 + * 1.754 + * This is a bit set to help speed up FCD value lookups in the absence of a full 1.755 + * UTrie2 or other large data structure with the full FCD value mapping. 1.756 + * 1.757 + * Each smallFCD bit is set if any of the corresponding 32 BMP code points 1.758 + * has a non-zero FCD value (lccc!=0 or tccc!=0). 1.759 + * Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF. 
1.760 + * A bit for 32 lead surrogates is set if any of the 32k corresponding 1.761 + * _supplementary_ code points has a non-zero FCD value. 1.762 + * 1.763 + * This bit set is most useful for the large blocks of CJK characters with FCD=0. 1.764 + * 1.765 + * Changes from format version 1 to format version 2 --------------------------- 1.766 + * 1.767 + * - Addition of data for raw (not recursively decomposed) mappings. 1.768 + * + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when 1.769 + * the mapping is to an empty string or when the character combines-forward. 1.770 + * This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which 1.771 + * is then repurposed for the MAPPING_HAS_RAW_MAPPING bit. 1.772 + * + For details see the design doc. 1.773 + * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into 1.774 + * distinct ranges (combines-forward vs. not) 1.775 + * so that a range check can be used to find out if there is a compositions list. 1.776 + * This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag. 1.777 + * It is needed for the new (in ICU 49) composePair(), not for other normalization. 1.778 + * - Addition of the smallFCD[] bit set. 1.779 + */ 1.780 + 1.781 +#endif /* !UCONFIG_NO_NORMALIZATION */ 1.782 +#endif /* __NORMALIZER2IMPL_H__ */