Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2002-2013, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: uprops.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2002feb24 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * Implementations for mostly non-core Unicode character properties |
michael@0 | 17 | * stored in uprops.icu. |
michael@0 | 18 | * |
michael@0 | 19 | * With the APIs implemented here, almost all properties files and |
michael@0 | 20 | * their associated implementation files are used from this file, |
michael@0 | 21 | * including those for normalization and case mappings. |
michael@0 | 22 | */ |
michael@0 | 23 | |
michael@0 | 24 | #include "unicode/utypes.h" |
michael@0 | 25 | #include "unicode/uchar.h" |
michael@0 | 26 | #include "unicode/unorm2.h" |
michael@0 | 27 | #include "unicode/uscript.h" |
michael@0 | 28 | #include "unicode/ustring.h" |
michael@0 | 29 | #include "cstring.h" |
michael@0 | 30 | #include "normalizer2impl.h" |
michael@0 | 31 | #include "ucln_cmn.h" |
michael@0 | 32 | #include "umutex.h" |
michael@0 | 33 | #include "ubidi_props.h" |
michael@0 | 34 | #include "uprops.h" |
michael@0 | 35 | #include "ucase.h" |
michael@0 | 36 | #include "ustr_imp.h" |
michael@0 | 37 | |
/*
 * Number of elements in a fixed-size array, as an int32_t.
 * Only meaningful for true arrays (not pointers).
 * The whole expansion is parenthesized so that it acts as a single operand
 * wherever it is used (e.g. directly after a sizeof without parentheses).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0 | 39 | |
michael@0 | 40 | U_NAMESPACE_USE |
michael@0 | 41 | |
/* Shorthand for obtaining the BiDi properties object via ubidi_getSingleton(). */
#define GET_BIDI_PROPS() ubidi_getSingleton()
michael@0 | 43 | |
michael@0 | 44 | /* general properties API functions ----------------------------------------- */ |
michael@0 | 45 | |
struct BinaryProperty;

/*
 * Signature of the per-property test callback stored in a BinaryProperty.
 * The callback receives the table entry itself, the code point to test,
 * and the UProperty selector that was requested.
 */
typedef UBool BinaryPropertyContains(const BinaryProperty &prop, UChar32 c, UProperty which);

/* One row of the binary-property dispatch table. */
struct BinaryProperty {
    int32_t column;  // SRC_PROPSVEC column, or "source" if mask==0
    uint32_t mask;   // bit mask tested against the stored properties word; 0 for properties handled in code
    BinaryPropertyContains *contains;  // callback that answers whether c has the property
};
michael@0 | 55 | |
michael@0 | 56 | static UBool defaultContains(const BinaryProperty &prop, UChar32 c, UProperty /*which*/) { |
michael@0 | 57 | /* systematic, directly stored properties */ |
michael@0 | 58 | return (u_getUnicodeProperties(c, prop.column)&prop.mask)!=0; |
michael@0 | 59 | } |
michael@0 | 60 | |
michael@0 | 61 | static UBool caseBinaryPropertyContains(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
michael@0 | 62 | return ucase_hasBinaryProperty(c, which); |
michael@0 | 63 | } |
michael@0 | 64 | |
michael@0 | 65 | static UBool isBidiControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 66 | return ubidi_isBidiControl(GET_BIDI_PROPS(), c); |
michael@0 | 67 | } |
michael@0 | 68 | |
michael@0 | 69 | static UBool isMirrored(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 70 | return ubidi_isMirrored(GET_BIDI_PROPS(), c); |
michael@0 | 71 | } |
michael@0 | 72 | |
michael@0 | 73 | static UBool isJoinControl(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 74 | return ubidi_isJoinControl(GET_BIDI_PROPS(), c); |
michael@0 | 75 | } |
michael@0 | 76 | |
michael@0 | 77 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 78 | static UBool hasFullCompositionExclusion(const BinaryProperty &, UChar32, UProperty) { |
michael@0 | 79 | return FALSE; |
michael@0 | 80 | } |
michael@0 | 81 | #else |
michael@0 | 82 | static UBool hasFullCompositionExclusion(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 83 | // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. |
michael@0 | 84 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 85 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
michael@0 | 86 | return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c)); |
michael@0 | 87 | } |
michael@0 | 88 | #endif |
michael@0 | 89 | |
michael@0 | 90 | // UCHAR_NF*_INERT properties |
michael@0 | 91 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 92 | static UBool isNormInert(const BinaryProperty &, UChar32, UProperty) { |
michael@0 | 93 | return FALSE; |
michael@0 | 94 | } |
michael@0 | 95 | #else |
michael@0 | 96 | static UBool isNormInert(const BinaryProperty &/*prop*/, UChar32 c, UProperty which) { |
michael@0 | 97 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 98 | const Normalizer2 *norm2=Normalizer2Factory::getInstance( |
michael@0 | 99 | (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode); |
michael@0 | 100 | return U_SUCCESS(errorCode) && norm2->isInert(c); |
michael@0 | 101 | } |
michael@0 | 102 | #endif |
michael@0 | 103 | |
#if UCONFIG_NO_NORMALIZATION
// Normalization is compiled out; this variant always reports FALSE.
static UBool changesWhenCasefolded(const BinaryProperty &, UChar32, UProperty) {
    return FALSE;
}
#else
/*
 * Answers whether case folding changes c, judged on the decomposition that
 * the NFC normalizer's getDecomposition() returns for c when there is one.
 * Returns FALSE if the normalizer cannot be obtained, if c has no
 * decomposition and is negative, or if the string case-folding call fails.
 */
static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    UnicodeString nfd;
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    if(nfcNorm2->getDecomposition(c, nfd)) {
        /* c has a decomposition */
        if(nfd.length()==1) {
            c=nfd[0];  /* single BMP code point */
        } else if(nfd.length()<=U16_MAX_LENGTH &&
                  nfd.length()==U16_LENGTH(c=nfd.char32At(0))
        ) {
            /* single supplementary code point */
            // Note: the condition above assigns c as a side effect.
        } else {
            // More than one code point: assuming U_SENTINEL is negative,
            // this routes execution to the string-folding branch below.
            c=U_SENTINEL;
        }
    } else if(c<0) {
        return FALSE;  /* protect against bad input */
    }
    if(c>=0) {
        /* single code point */
        const UCaseProps *csp=ucase_getSingleton();
        const UChar *resultString;
        // A non-negative result from ucase_toFullFolding() is reported as "changes".
        return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
    } else {
        /* guess some large but stack-friendly capacity */
        UChar dest[2*UCASE_MAX_STRING_LENGTH];
        int32_t destLength;
        destLength=u_strFoldCase(dest, LENGTHOF(dest),
                                 nfd.getBuffer(), nfd.length(),
                                 U_FOLD_CASE_DEFAULT, &errorCode);
        // TRUE only if folding succeeded and the folded text differs from the decomposition.
        return (UBool)(U_SUCCESS(errorCode) &&
                       0!=u_strCompare(nfd.getBuffer(), nfd.length(),
                                       dest, destLength, FALSE));
    }
}
#endif
michael@0 | 148 | |
#if UCONFIG_NO_NORMALIZATION
// Normalization is compiled out; this variant always reports FALSE.
static UBool changesWhenNFKC_Casefolded(const BinaryProperty &, UChar32, UProperty) {
    return FALSE;
}
#else
/*
 * Runs the NFKC_CF implementation's compose() over the one-code-point string
 * for c and reports whether the output differs from that input string.
 * Returns FALSE if the implementation cannot be obtained or an error is set.
 */
static UBool changesWhenNFKC_Casefolded(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    UnicodeString src(c);
    UnicodeString dest;
    {
        // The ReorderingBuffer must be in a block because its destructor
        // needs to release dest's buffer before we look at its contents.
        ReorderingBuffer buffer(*kcf, dest);
        // Small destCapacity for NFKC_CF(c).
        if(buffer.init(5, errorCode)) {
            const UChar *srcArray=src.getBuffer();
            kcf->compose(srcArray, srcArray+src.length(), FALSE,
                         TRUE, buffer, errorCode);
        }
    }
    // dest is only compared after the buffer above has gone out of scope.
    return U_SUCCESS(errorCode) && dest!=src;
}
#endif
michael@0 | 176 | |
michael@0 | 177 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 178 | static UBool isCanonSegmentStarter(const BinaryProperty &, UChar32, UProperty) { |
michael@0 | 179 | return FALSE; |
michael@0 | 180 | } |
michael@0 | 181 | #else |
michael@0 | 182 | static UBool isCanonSegmentStarter(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 183 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 184 | const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode); |
michael@0 | 185 | return |
michael@0 | 186 | U_SUCCESS(errorCode) && impl->ensureCanonIterData(errorCode) && |
michael@0 | 187 | impl->isCanonSegmentStarter(c); |
michael@0 | 188 | } |
michael@0 | 189 | #endif |
michael@0 | 190 | |
michael@0 | 191 | static UBool isPOSIX_alnum(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 192 | return u_isalnumPOSIX(c); |
michael@0 | 193 | } |
michael@0 | 194 | |
michael@0 | 195 | static UBool isPOSIX_blank(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 196 | return u_isblank(c); |
michael@0 | 197 | } |
michael@0 | 198 | |
michael@0 | 199 | static UBool isPOSIX_graph(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 200 | return u_isgraphPOSIX(c); |
michael@0 | 201 | } |
michael@0 | 202 | |
michael@0 | 203 | static UBool isPOSIX_print(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 204 | return u_isprintPOSIX(c); |
michael@0 | 205 | } |
michael@0 | 206 | |
michael@0 | 207 | static UBool isPOSIX_xdigit(const BinaryProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 208 | return u_isxdigit(c); |
michael@0 | 209 | } |
michael@0 | 210 | |
// Dispatch table for binary properties, indexed directly by the UProperty value.
// Each row is { column-or-source, mask, contains-callback } as declared in BinaryProperty.
static const BinaryProperty binProps[UCHAR_BINARY_LIMIT]={
    /*
     * column and mask values for binary properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per binary UProperty.
     *
     * Properties with mask==0 are handled in code.
     * For them, column is the UPropertySource value.
     */
    { 1, U_MASK(UPROPS_ALPHABETIC), defaultContains },
    { 1, U_MASK(UPROPS_ASCII_HEX_DIGIT), defaultContains },
    { UPROPS_SRC_BIDI, 0, isBidiControl },
    { UPROPS_SRC_BIDI, 0, isMirrored },
    { 1, U_MASK(UPROPS_DASH), defaultContains },
    { 1, U_MASK(UPROPS_DEFAULT_IGNORABLE_CODE_POINT), defaultContains },
    { 1, U_MASK(UPROPS_DEPRECATED), defaultContains },
    { 1, U_MASK(UPROPS_DIACRITIC), defaultContains },
    { 1, U_MASK(UPROPS_EXTENDER), defaultContains },
    { UPROPS_SRC_NFC, 0, hasFullCompositionExclusion },
    { 1, U_MASK(UPROPS_GRAPHEME_BASE), defaultContains },
    { 1, U_MASK(UPROPS_GRAPHEME_EXTEND), defaultContains },
    { 1, U_MASK(UPROPS_GRAPHEME_LINK), defaultContains },
    { 1, U_MASK(UPROPS_HEX_DIGIT), defaultContains },
    { 1, U_MASK(UPROPS_HYPHEN), defaultContains },
    { 1, U_MASK(UPROPS_ID_CONTINUE), defaultContains },
    { 1, U_MASK(UPROPS_ID_START), defaultContains },
    { 1, U_MASK(UPROPS_IDEOGRAPHIC), defaultContains },
    { 1, U_MASK(UPROPS_IDS_BINARY_OPERATOR), defaultContains },
    { 1, U_MASK(UPROPS_IDS_TRINARY_OPERATOR), defaultContains },
    { UPROPS_SRC_BIDI, 0, isJoinControl },
    { 1, U_MASK(UPROPS_LOGICAL_ORDER_EXCEPTION), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_LOWERCASE
    { 1, U_MASK(UPROPS_MATH), defaultContains },
    { 1, U_MASK(UPROPS_NONCHARACTER_CODE_POINT), defaultContains },
    { 1, U_MASK(UPROPS_QUOTATION_MARK), defaultContains },
    { 1, U_MASK(UPROPS_RADICAL), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_SOFT_DOTTED
    { 1, U_MASK(UPROPS_TERMINAL_PUNCTUATION), defaultContains },
    { 1, U_MASK(UPROPS_UNIFIED_IDEOGRAPH), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_UPPERCASE
    { 1, U_MASK(UPROPS_WHITE_SPACE), defaultContains },
    { 1, U_MASK(UPROPS_XID_CONTINUE), defaultContains },
    { 1, U_MASK(UPROPS_XID_START), defaultContains },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CASE_SENSITIVE
    { 1, U_MASK(UPROPS_S_TERM), defaultContains },
    { 1, U_MASK(UPROPS_VARIATION_SELECTOR), defaultContains },
    { UPROPS_SRC_NFC, 0, isNormInert },  // UCHAR_NFD_INERT
    { UPROPS_SRC_NFKC, 0, isNormInert },  // UCHAR_NFKD_INERT
    { UPROPS_SRC_NFC, 0, isNormInert },  // UCHAR_NFC_INERT
    { UPROPS_SRC_NFKC, 0, isNormInert },  // UCHAR_NFKC_INERT
    { UPROPS_SRC_NFC_CANON_ITER, 0, isCanonSegmentStarter },
    { 1, U_MASK(UPROPS_PATTERN_SYNTAX), defaultContains },
    { 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE), defaultContains },
    { UPROPS_SRC_CHAR_AND_PROPSVEC, 0, isPOSIX_alnum },
    { UPROPS_SRC_CHAR, 0, isPOSIX_blank },
    { UPROPS_SRC_CHAR, 0, isPOSIX_graph },
    { UPROPS_SRC_CHAR, 0, isPOSIX_print },
    { UPROPS_SRC_CHAR, 0, isPOSIX_xdigit },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CASED
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CASE_IGNORABLE
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_LOWERCASED
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_UPPERCASED
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_TITLECASED
    { UPROPS_SRC_CASE_AND_NORM, 0, changesWhenCasefolded },
    { UPROPS_SRC_CASE, 0, caseBinaryPropertyContains },  // UCHAR_CHANGES_WHEN_CASEMAPPED
    { UPROPS_SRC_NFKC_CF, 0, changesWhenNFKC_Casefolded }
};
michael@0 | 278 | |
michael@0 | 279 | U_CAPI UBool U_EXPORT2 |
michael@0 | 280 | u_hasBinaryProperty(UChar32 c, UProperty which) { |
michael@0 | 281 | /* c is range-checked in the functions that are called from here */ |
michael@0 | 282 | if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) { |
michael@0 | 283 | /* not a known binary property */ |
michael@0 | 284 | return FALSE; |
michael@0 | 285 | } else { |
michael@0 | 286 | const BinaryProperty &prop=binProps[which]; |
michael@0 | 287 | return prop.contains(prop, c, which); |
michael@0 | 288 | } |
michael@0 | 289 | } |
michael@0 | 290 | |
struct IntProperty;

// Callback signatures stored in an IntProperty row:
// one returns the property value for a code point, the other the property's maximum value.
typedef int32_t IntPropertyGetValue(const IntProperty &prop, UChar32 c, UProperty which);
typedef int32_t IntPropertyGetMaxValue(const IntProperty &prop, UProperty which);

/* One row of the int-valued property dispatch table. */
struct IntProperty {
    int32_t column;  // SRC_PROPSVEC column, or "source" if mask==0
    uint32_t mask;   // bit mask selecting the stored field; 0 for properties handled in code
    int32_t shift;   // =maxValue if getMaxValueFromShift() is used
    IntPropertyGetValue *getValue;        // callback returning the value for a code point
    IntPropertyGetMaxValue *getMaxValue;  // callback returning the maximum value
};
michael@0 | 303 | |
michael@0 | 304 | static int32_t defaultGetValue(const IntProperty &prop, UChar32 c, UProperty /*which*/) { |
michael@0 | 305 | /* systematic, directly stored properties */ |
michael@0 | 306 | return (int32_t)(u_getUnicodeProperties(c, prop.column)&prop.mask)>>prop.shift; |
michael@0 | 307 | } |
michael@0 | 308 | |
michael@0 | 309 | static int32_t defaultGetMaxValue(const IntProperty &prop, UProperty /*which*/) { |
michael@0 | 310 | return (uprv_getMaxValues(prop.column)&prop.mask)>>prop.shift; |
michael@0 | 311 | } |
michael@0 | 312 | |
michael@0 | 313 | static int32_t getMaxValueFromShift(const IntProperty &prop, UProperty /*which*/) { |
michael@0 | 314 | return prop.shift; |
michael@0 | 315 | } |
michael@0 | 316 | |
michael@0 | 317 | static int32_t getBiDiClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 318 | return (int32_t)u_charDirection(c); |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | static int32_t getBiDiPairedBracketType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 322 | return (int32_t)ubidi_getPairedBracketType(GET_BIDI_PROPS(), c); |
michael@0 | 323 | } |
michael@0 | 324 | |
michael@0 | 325 | static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) { |
michael@0 | 326 | return ubidi_getMaxValue(GET_BIDI_PROPS(), which); |
michael@0 | 327 | } |
michael@0 | 328 | |
michael@0 | 329 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 330 | static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) { |
michael@0 | 331 | return 0; |
michael@0 | 332 | } |
michael@0 | 333 | #else |
michael@0 | 334 | static int32_t getCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 335 | return u_getCombiningClass(c); |
michael@0 | 336 | } |
michael@0 | 337 | #endif |
michael@0 | 338 | |
michael@0 | 339 | static int32_t getGeneralCategory(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 340 | return (int32_t)u_charType(c); |
michael@0 | 341 | } |
michael@0 | 342 | |
michael@0 | 343 | static int32_t getJoiningGroup(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 344 | return ubidi_getJoiningGroup(GET_BIDI_PROPS(), c); |
michael@0 | 345 | } |
michael@0 | 346 | |
michael@0 | 347 | static int32_t getJoiningType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 348 | return ubidi_getJoiningType(GET_BIDI_PROPS(), c); |
michael@0 | 349 | } |
michael@0 | 350 | |
michael@0 | 351 | static int32_t getNumericType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 352 | int32_t ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(u_getMainProperties(c)); |
michael@0 | 353 | return UPROPS_NTV_GET_TYPE(ntv); |
michael@0 | 354 | } |
michael@0 | 355 | |
michael@0 | 356 | static int32_t getScript(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 357 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 358 | return (int32_t)uscript_getScript(c, &errorCode); |
michael@0 | 359 | } |
michael@0 | 360 | |
/*
 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
 *
 * The array is indexed by the Grapheme_Cluster_Break value; the comment on
 * each row names the GCB value that the row stands for.
 */
static const UHangulSyllableType gcbToHst[]={
    U_HST_NOT_APPLICABLE,   /* U_GCB_OTHER */
    U_HST_NOT_APPLICABLE,   /* U_GCB_CONTROL */
    U_HST_NOT_APPLICABLE,   /* U_GCB_CR */
    U_HST_NOT_APPLICABLE,   /* U_GCB_EXTEND */
    U_HST_LEADING_JAMO,     /* U_GCB_L */
    U_HST_NOT_APPLICABLE,   /* U_GCB_LF */
    U_HST_LV_SYLLABLE,      /* U_GCB_LV */
    U_HST_LVT_SYLLABLE,     /* U_GCB_LVT */
    U_HST_TRAILING_JAMO,    /* U_GCB_T */
    U_HST_VOWEL_JAMO        /* U_GCB_V */
    /*
     * Omit GCB values beyond what we need for hst.
     * The code below checks for the array length.
     */
};
michael@0 | 381 | |
michael@0 | 382 | static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 383 | /* see comments on gcbToHst[] above */ |
michael@0 | 384 | int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT; |
michael@0 | 385 | if(gcb<LENGTHOF(gcbToHst)) { |
michael@0 | 386 | return gcbToHst[gcb]; |
michael@0 | 387 | } else { |
michael@0 | 388 | return U_HST_NOT_APPLICABLE; |
michael@0 | 389 | } |
michael@0 | 390 | } |
michael@0 | 391 | |
michael@0 | 392 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 393 | static int32_t getNormQuickCheck(const IntProperty &, UChar32, UProperty) { |
michael@0 | 394 | return 0; |
michael@0 | 395 | } |
michael@0 | 396 | #else |
michael@0 | 397 | static int32_t getNormQuickCheck(const IntProperty &/*prop*/, UChar32 c, UProperty which) { |
michael@0 | 398 | return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD)); |
michael@0 | 399 | } |
michael@0 | 400 | #endif |
michael@0 | 401 | |
michael@0 | 402 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 403 | static int32_t getLeadCombiningClass(const IntProperty &, UChar32, UProperty) { |
michael@0 | 404 | return 0; |
michael@0 | 405 | } |
michael@0 | 406 | #else |
michael@0 | 407 | static int32_t getLeadCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 408 | return unorm_getFCD16(c)>>8; |
michael@0 | 409 | } |
michael@0 | 410 | #endif |
michael@0 | 411 | |
michael@0 | 412 | #if UCONFIG_NO_NORMALIZATION |
michael@0 | 413 | static int32_t getTrailCombiningClass(const IntProperty &, UChar32, UProperty) { |
michael@0 | 414 | return 0; |
michael@0 | 415 | } |
michael@0 | 416 | #else |
michael@0 | 417 | static int32_t getTrailCombiningClass(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) { |
michael@0 | 418 | return unorm_getFCD16(c)&0xff; |
michael@0 | 419 | } |
michael@0 | 420 | #endif |
michael@0 | 421 | |
// Dispatch table for int-valued properties, indexed by (which-UCHAR_INT_START).
// Each row is { column-or-source, mask, shift-or-max, getValue, getMaxValue }
// as declared in IntProperty.
static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
    /*
     * column, mask and shift values for int-value properties from u_getUnicodeProperties().
     * Must be in order of corresponding UProperty,
     * and there must be exactly one entry per int UProperty.
     *
     * Properties with mask==0 are handled in code.
     * For them, column is the UPropertySource value.
     */
    { UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
    { 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },
    { 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },
    { 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_CHAR, 0, (int32_t)U_CHAR_CATEGORY_COUNT-1, getGeneralCategory, getMaxValueFromShift },
    { UPROPS_SRC_BIDI, 0, 0, getJoiningGroup, biDiGetMaxValue },
    { UPROPS_SRC_BIDI, 0, 0, getJoiningType, biDiGetMaxValue },
    { 2, UPROPS_LB_MASK, UPROPS_LB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_CHAR, 0, (int32_t)U_NT_COUNT-1, getNumericType, getMaxValueFromShift },
    // This row pairs a custom getter (getScript) with a nonzero mask;
    // the column/mask/shift values are what defaultGetMaxValue reads.
    { 0, UPROPS_SCRIPT_MASK, 0, getScript, defaultGetMaxValue },
    { UPROPS_SRC_PROPSVEC, 0, (int32_t)U_HST_COUNT-1, getHangulSyllableType, getMaxValueFromShift },
    // UCHAR_NFD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
    { UPROPS_SRC_NFC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFKD_QUICK_CHECK: max=1=YES -- never "maybe", only "no" or "yes"
    { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_YES, getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFC_QUICK_CHECK: max=2=MAYBE
    { UPROPS_SRC_NFC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift },
    // UCHAR_NFKC_QUICK_CHECK: max=2=MAYBE
    { UPROPS_SRC_NFKC, 0, (int32_t)UNORM_MAYBE, getNormQuickCheck, getMaxValueFromShift },
    { UPROPS_SRC_NFC, 0, 0xff, getLeadCombiningClass, getMaxValueFromShift },
    { UPROPS_SRC_NFC, 0, 0xff, getTrailCombiningClass, getMaxValueFromShift },
    { 2, UPROPS_GCB_MASK, UPROPS_GCB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { 2, UPROPS_SB_MASK, UPROPS_SB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { 2, UPROPS_WB_MASK, UPROPS_WB_SHIFT, defaultGetValue, defaultGetMaxValue },
    { UPROPS_SRC_BIDI, 0, 0, getBiDiPairedBracketType, biDiGetMaxValue },
};
michael@0 | 458 | |
michael@0 | 459 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 460 | u_getIntPropertyValue(UChar32 c, UProperty which) { |
michael@0 | 461 | if(which<UCHAR_INT_START) { |
michael@0 | 462 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
michael@0 | 463 | const BinaryProperty &prop=binProps[which]; |
michael@0 | 464 | return prop.contains(prop, c, which); |
michael@0 | 465 | } |
michael@0 | 466 | } else if(which<UCHAR_INT_LIMIT) { |
michael@0 | 467 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
michael@0 | 468 | return prop.getValue(prop, c, which); |
michael@0 | 469 | } else if(which==UCHAR_GENERAL_CATEGORY_MASK) { |
michael@0 | 470 | return U_MASK(u_charType(c)); |
michael@0 | 471 | } |
michael@0 | 472 | return 0; // undefined |
michael@0 | 473 | } |
michael@0 | 474 | |
michael@0 | 475 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 476 | u_getIntPropertyMinValue(UProperty /*which*/) { |
michael@0 | 477 | return 0; /* all binary/enum/int properties have a minimum value of 0 */ |
michael@0 | 478 | } |
michael@0 | 479 | |
michael@0 | 480 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 481 | u_getIntPropertyMaxValue(UProperty which) { |
michael@0 | 482 | if(which<UCHAR_INT_START) { |
michael@0 | 483 | if(UCHAR_BINARY_START<=which && which<UCHAR_BINARY_LIMIT) { |
michael@0 | 484 | return 1; // maximum TRUE for all binary properties |
michael@0 | 485 | } |
michael@0 | 486 | } else if(which<UCHAR_INT_LIMIT) { |
michael@0 | 487 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
michael@0 | 488 | return prop.getMaxValue(prop, which); |
michael@0 | 489 | } |
michael@0 | 490 | return -1; // undefined |
michael@0 | 491 | } |
michael@0 | 492 | |
michael@0 | 493 | U_CFUNC UPropertySource U_EXPORT2 |
michael@0 | 494 | uprops_getSource(UProperty which) { |
michael@0 | 495 | if(which<UCHAR_BINARY_START) { |
michael@0 | 496 | return UPROPS_SRC_NONE; /* undefined */ |
michael@0 | 497 | } else if(which<UCHAR_BINARY_LIMIT) { |
michael@0 | 498 | const BinaryProperty &prop=binProps[which]; |
michael@0 | 499 | if(prop.mask!=0) { |
michael@0 | 500 | return UPROPS_SRC_PROPSVEC; |
michael@0 | 501 | } else { |
michael@0 | 502 | return (UPropertySource)prop.column; |
michael@0 | 503 | } |
michael@0 | 504 | } else if(which<UCHAR_INT_START) { |
michael@0 | 505 | return UPROPS_SRC_NONE; /* undefined */ |
michael@0 | 506 | } else if(which<UCHAR_INT_LIMIT) { |
michael@0 | 507 | const IntProperty &prop=intProps[which-UCHAR_INT_START]; |
michael@0 | 508 | if(prop.mask!=0) { |
michael@0 | 509 | return UPROPS_SRC_PROPSVEC; |
michael@0 | 510 | } else { |
michael@0 | 511 | return (UPropertySource)prop.column; |
michael@0 | 512 | } |
michael@0 | 513 | } else if(which<UCHAR_STRING_START) { |
michael@0 | 514 | switch(which) { |
michael@0 | 515 | case UCHAR_GENERAL_CATEGORY_MASK: |
michael@0 | 516 | case UCHAR_NUMERIC_VALUE: |
michael@0 | 517 | return UPROPS_SRC_CHAR; |
michael@0 | 518 | |
michael@0 | 519 | default: |
michael@0 | 520 | return UPROPS_SRC_NONE; |
michael@0 | 521 | } |
michael@0 | 522 | } else if(which<UCHAR_STRING_LIMIT) { |
michael@0 | 523 | switch(which) { |
michael@0 | 524 | case UCHAR_AGE: |
michael@0 | 525 | return UPROPS_SRC_PROPSVEC; |
michael@0 | 526 | |
michael@0 | 527 | case UCHAR_BIDI_MIRRORING_GLYPH: |
michael@0 | 528 | return UPROPS_SRC_BIDI; |
michael@0 | 529 | |
michael@0 | 530 | case UCHAR_CASE_FOLDING: |
michael@0 | 531 | case UCHAR_LOWERCASE_MAPPING: |
michael@0 | 532 | case UCHAR_SIMPLE_CASE_FOLDING: |
michael@0 | 533 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: |
michael@0 | 534 | case UCHAR_SIMPLE_TITLECASE_MAPPING: |
michael@0 | 535 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: |
michael@0 | 536 | case UCHAR_TITLECASE_MAPPING: |
michael@0 | 537 | case UCHAR_UPPERCASE_MAPPING: |
michael@0 | 538 | return UPROPS_SRC_CASE; |
michael@0 | 539 | |
michael@0 | 540 | case UCHAR_ISO_COMMENT: |
michael@0 | 541 | case UCHAR_NAME: |
michael@0 | 542 | case UCHAR_UNICODE_1_NAME: |
michael@0 | 543 | return UPROPS_SRC_NAMES; |
michael@0 | 544 | |
michael@0 | 545 | default: |
michael@0 | 546 | return UPROPS_SRC_NONE; |
michael@0 | 547 | } |
michael@0 | 548 | } else { |
michael@0 | 549 | switch(which) { |
michael@0 | 550 | case UCHAR_SCRIPT_EXTENSIONS: |
michael@0 | 551 | return UPROPS_SRC_PROPSVEC; |
michael@0 | 552 | default: |
michael@0 | 553 | return UPROPS_SRC_NONE; /* undefined */ |
michael@0 | 554 | } |
michael@0 | 555 | } |
michael@0 | 556 | } |
michael@0 | 557 | |
#if !UCONFIG_NO_NORMALIZATION

/*
 * Computes the FC_NFKC_Closure string for c on the fly and writes it to dest.
 *
 * Parameters:
 *   c            - the code point to look up
 *   dest         - output buffer; may be NULL only if destCapacity is 0
 *   destCapacity - capacity of dest in UChars; must not be negative
 *   pErrorCode   - in/out error code; nothing is done if it is NULL or
 *                  already indicates a failure
 *
 * Returns the value produced by kc2.extract() when a mapping exists,
 * otherwise the result of terminating dest with length 0. Returns 0 on
 * invalid arguments (with U_ILLEGAL_ARGUMENT_ERROR set) or when the NFKC
 * normalizer cannot be obtained.
 */
U_CAPI int32_t U_EXPORT2
u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // Compute the FC_NFKC_Closure on the fly:
    // We have the API for complete coverage of Unicode properties, although
    // this value by itself is not useful via API.
    // (What could be useful is a custom normalization table that combines
    // case folding and NFKC.)
    // For the derivation, see Unicode's DerivedNormalizationProps.txt.
    const Normalizer2 *nfkc=Normalizer2Factory::getNFKCInstance(*pErrorCode);
    const UCaseProps *csp=ucase_getSingleton();
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // first: b = NFKC(Fold(a))
    UnicodeString folded1String;
    const UChar *folded1;
    int32_t folded1Length=ucase_toFullFolding(csp, c, &folded1, U_FOLD_CASE_DEFAULT);
    if(folded1Length<0) {
        // Negative result: taken here to mean that folding leaves c unchanged.
        const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc);
        if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) {
            return u_terminateUChars(dest, destCapacity, 0, pErrorCode);  // c does not change at all under CaseFolding+NFKC
        }
        folded1String.setTo(c);
    } else {
        if(folded1Length>UCASE_MAX_STRING_LENGTH) {
            // NOTE(review): presumably a result above UCASE_MAX_STRING_LENGTH is
            // itself the folded code point rather than a length -- confirm
            // against the ucase_toFullFolding() contract.
            folded1String.setTo(folded1Length);
        } else {
            // NOTE(review): presumably this makes folded1String refer to the
            // folded1 buffer of folded1Length units -- confirm the meaning of
            // the FALSE argument against UnicodeString::setTo().
            folded1String.setTo(FALSE, folded1, folded1Length);
        }
    }
    UnicodeString kc1=nfkc->normalize(folded1String, *pErrorCode);
    // second: c = NFKC(Fold(b))
    UnicodeString folded2String(kc1);
    UnicodeString kc2=nfkc->normalize(folded2String.foldCase(), *pErrorCode);
    // if (c != b) add the mapping from a to c
    if(U_FAILURE(*pErrorCode) || kc1==kc2) {
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    } else {
        return kc2.extract(dest, destCapacity, *pErrorCode);
    }
}

#endif