intl/icu/source/common/uchar.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

michael@0 1 /*
michael@0 2 ********************************************************************************
michael@0 3 * Copyright (C) 1996-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 ********************************************************************************
michael@0 6 *
michael@0 7 * File UCHAR.C
michael@0 8 *
michael@0 9 * Modification History:
michael@0 10 *
michael@0 11 * Date Name Description
michael@0 12 * 04/02/97 aliu Creation.
michael@0 13 * 4/15/99 Madhu Updated all the function definitions for C Implementation
michael@0 14 * 5/20/99 Madhu Added the function u_getVersion()
michael@0 15 * 8/19/1999 srl Upgraded scripts to Unicode3.0
michael@0 16 * 11/11/1999 weiv added u_isalnum(), cleaned comments
michael@0 17 * 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion.
michael@0 18 * 06/20/2000 helena OS/400 port changes; mostly typecast.
michael@0 19 ******************************************************************************
michael@0 20 */
michael@0 21
michael@0 22 #include "unicode/utypes.h"
michael@0 23 #include "unicode/uchar.h"
michael@0 24 #include "unicode/uscript.h"
michael@0 25 #include "unicode/udata.h"
michael@0 26 #include "uassert.h"
michael@0 27 #include "cmemory.h"
michael@0 28 #include "ucln_cmn.h"
michael@0 29 #include "utrie2.h"
michael@0 30 #include "udataswp.h"
michael@0 31 #include "uprops.h"
michael@0 32 #include "ustr_imp.h"
michael@0 33
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so it behaves as one operand
 * in any surrounding expression (e.g. as the operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0 35
michael@0 36 /* uchar_props_data.h is machine-generated by genprops --csource */
michael@0 37 #define INCLUDED_FROM_UCHAR_C
michael@0 38 #include "uchar_props_data.h"
michael@0 39
michael@0 40 /* constants and macros for access to the data ------------------------------ */
michael@0 41
/*
 * Look up the 16-bit properties word for code point c in propsTrie and
 * store it in result.
 * Expands to a single expression with no trailing semicolon: the caller
 * writes the ';', which keeps the macro safe in unbraced if/else bodies.
 */
#define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c))
michael@0 44
michael@0 45 U_CFUNC UBool
michael@0 46 uprv_haveProperties(UErrorCode *pErrorCode) {
michael@0 47 if(U_FAILURE(*pErrorCode)) {
michael@0 48 return FALSE;
michael@0 49 }
michael@0 50 return TRUE;
michael@0 51 }
michael@0 52
michael@0 53 /* API functions ------------------------------------------------------------ */
michael@0 54
michael@0 55 /* Gets the Unicode character's general category.*/
michael@0 56 U_CAPI int8_t U_EXPORT2
michael@0 57 u_charType(UChar32 c) {
michael@0 58 uint32_t props;
michael@0 59 GET_PROPS(c, props);
michael@0 60 return (int8_t)GET_CATEGORY(props);
michael@0 61 }
michael@0 62
michael@0 63 /* Enumerate all code points with their general categories. */
michael@0 64 struct _EnumTypeCallback {
michael@0 65 UCharEnumTypeRange *enumRange;
michael@0 66 const void *context;
michael@0 67 };
michael@0 68
michael@0 69 static uint32_t U_CALLCONV
michael@0 70 _enumTypeValue(const void *context, uint32_t value) {
michael@0 71 return GET_CATEGORY(value);
michael@0 72 }
michael@0 73
michael@0 74 static UBool U_CALLCONV
michael@0 75 _enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
michael@0 76 /* just cast the value to UCharCategory */
michael@0 77 return ((struct _EnumTypeCallback *)context)->
michael@0 78 enumRange(((struct _EnumTypeCallback *)context)->context,
michael@0 79 start, end+1, (UCharCategory)value);
michael@0 80 }
michael@0 81
michael@0 82 U_CAPI void U_EXPORT2
michael@0 83 u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) {
michael@0 84 struct _EnumTypeCallback callback;
michael@0 85
michael@0 86 if(enumRange==NULL) {
michael@0 87 return;
michael@0 88 }
michael@0 89
michael@0 90 callback.enumRange=enumRange;
michael@0 91 callback.context=context;
michael@0 92 utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback);
michael@0 93 }
michael@0 94
michael@0 95 /* Checks if ch is a lower case letter.*/
michael@0 96 U_CAPI UBool U_EXPORT2
michael@0 97 u_islower(UChar32 c) {
michael@0 98 uint32_t props;
michael@0 99 GET_PROPS(c, props);
michael@0 100 return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER);
michael@0 101 }
michael@0 102
michael@0 103 /* Checks if ch is an upper case letter.*/
michael@0 104 U_CAPI UBool U_EXPORT2
michael@0 105 u_isupper(UChar32 c) {
michael@0 106 uint32_t props;
michael@0 107 GET_PROPS(c, props);
michael@0 108 return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER);
michael@0 109 }
michael@0 110
michael@0 111 /* Checks if ch is a title case letter; usually upper case letters.*/
michael@0 112 U_CAPI UBool U_EXPORT2
michael@0 113 u_istitle(UChar32 c) {
michael@0 114 uint32_t props;
michael@0 115 GET_PROPS(c, props);
michael@0 116 return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER);
michael@0 117 }
michael@0 118
michael@0 119 /* Checks if ch is a decimal digit. */
michael@0 120 U_CAPI UBool U_EXPORT2
michael@0 121 u_isdigit(UChar32 c) {
michael@0 122 uint32_t props;
michael@0 123 GET_PROPS(c, props);
michael@0 124 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
michael@0 125 }
michael@0 126
michael@0 127 U_CAPI UBool U_EXPORT2
michael@0 128 u_isxdigit(UChar32 c) {
michael@0 129 uint32_t props;
michael@0 130
michael@0 131 /* check ASCII and Fullwidth ASCII a-fA-F */
michael@0 132 if(
michael@0 133 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
michael@0 134 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
michael@0 135 ) {
michael@0 136 return TRUE;
michael@0 137 }
michael@0 138
michael@0 139 GET_PROPS(c, props);
michael@0 140 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
michael@0 141 }
michael@0 142
michael@0 143 /* Checks if the Unicode character is a letter.*/
michael@0 144 U_CAPI UBool U_EXPORT2
michael@0 145 u_isalpha(UChar32 c) {
michael@0 146 uint32_t props;
michael@0 147 GET_PROPS(c, props);
michael@0 148 return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0);
michael@0 149 }
michael@0 150
michael@0 151 U_CAPI UBool U_EXPORT2
michael@0 152 u_isUAlphabetic(UChar32 c) {
michael@0 153 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
michael@0 154 }
michael@0 155
michael@0 156 /* Checks if c is a letter or a decimal digit */
michael@0 157 U_CAPI UBool U_EXPORT2
michael@0 158 u_isalnum(UChar32 c) {
michael@0 159 uint32_t props;
michael@0 160 GET_PROPS(c, props);
michael@0 161 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
michael@0 162 }
michael@0 163
michael@0 164 /**
michael@0 165 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
michael@0 166 * @internal
michael@0 167 */
michael@0 168 U_CFUNC UBool
michael@0 169 u_isalnumPOSIX(UChar32 c) {
michael@0 170 return (UBool)(u_isUAlphabetic(c) || u_isdigit(c));
michael@0 171 }
michael@0 172
michael@0 173 /* Checks if ch is a unicode character with assigned character type.*/
michael@0 174 U_CAPI UBool U_EXPORT2
michael@0 175 u_isdefined(UChar32 c) {
michael@0 176 uint32_t props;
michael@0 177 GET_PROPS(c, props);
michael@0 178 return (UBool)(GET_CATEGORY(props)!=0);
michael@0 179 }
michael@0 180
michael@0 181 /* Checks if the Unicode character is a base form character that can take a diacritic.*/
michael@0 182 U_CAPI UBool U_EXPORT2
michael@0 183 u_isbase(UChar32 c) {
michael@0 184 uint32_t props;
michael@0 185 GET_PROPS(c, props);
michael@0 186 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0);
michael@0 187 }
michael@0 188
michael@0 189 /* Checks if the Unicode character is a control character.*/
michael@0 190 U_CAPI UBool U_EXPORT2
michael@0 191 u_iscntrl(UChar32 c) {
michael@0 192 uint32_t props;
michael@0 193 GET_PROPS(c, props);
michael@0 194 return (UBool)((CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0);
michael@0 195 }
michael@0 196
michael@0 197 U_CAPI UBool U_EXPORT2
michael@0 198 u_isISOControl(UChar32 c) {
michael@0 199 return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f);
michael@0 200 }
michael@0 201
/*
 * Some control characters that are used as space:
 * TAB..CR, U+001C..U+001F, and NL.
 * The argument is parenthesized at every use so that an expression
 * argument is evaluated as a unit; note that it is evaluated more than once.
 */
#define IS_THAT_CONTROL_SPACE(c) \
    ((c)<=0x9f && (((c)>=TAB && (c)<=CR) || ((c)>=0x1c && (c)<=0x1f) || (c)==NL))

/*
 * Java has decided that U+0085 New Line is not whitespace any more.
 * Same as IS_THAT_CONTROL_SPACE() but without NL: TAB..CR and U+001C..U+001F.
 * The argument is parenthesized at every use and evaluated more than once.
 */
#define IS_THAT_ASCII_CONTROL_SPACE(c) \
    ((c)<=0x1f && (c)>=TAB && ((c)<=CR || (c)>=0x1c))
michael@0 209
michael@0 210 /* Checks if the Unicode character is a space character.*/
michael@0 211 U_CAPI UBool U_EXPORT2
michael@0 212 u_isspace(UChar32 c) {
michael@0 213 uint32_t props;
michael@0 214 GET_PROPS(c, props);
michael@0 215 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c));
michael@0 216 }
michael@0 217
michael@0 218 U_CAPI UBool U_EXPORT2
michael@0 219 u_isJavaSpaceChar(UChar32 c) {
michael@0 220 uint32_t props;
michael@0 221 GET_PROPS(c, props);
michael@0 222 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0);
michael@0 223 }
michael@0 224
michael@0 225 /* Checks if the Unicode character is a whitespace character.*/
michael@0 226 U_CAPI UBool U_EXPORT2
michael@0 227 u_isWhitespace(UChar32 c) {
michael@0 228 uint32_t props;
michael@0 229 GET_PROPS(c, props);
michael@0 230 return (UBool)(
michael@0 231 ((CAT_MASK(props)&U_GC_Z_MASK)!=0 &&
michael@0 232 c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */
michael@0 233 IS_THAT_ASCII_CONTROL_SPACE(c)
michael@0 234 );
michael@0 235 }
michael@0 236
michael@0 237 U_CAPI UBool U_EXPORT2
michael@0 238 u_isblank(UChar32 c) {
michael@0 239 if((uint32_t)c<=0x9f) {
michael@0 240 return c==9 || c==0x20; /* TAB or SPACE */
michael@0 241 } else {
michael@0 242 /* Zs */
michael@0 243 uint32_t props;
michael@0 244 GET_PROPS(c, props);
michael@0 245 return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR);
michael@0 246 }
michael@0 247 }
michael@0 248
michael@0 249 U_CAPI UBool U_EXPORT2
michael@0 250 u_isUWhiteSpace(UChar32 c) {
michael@0 251 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0;
michael@0 252 }
michael@0 253
michael@0 254 /* Checks if the Unicode character is printable.*/
michael@0 255 U_CAPI UBool U_EXPORT2
michael@0 256 u_isprint(UChar32 c) {
michael@0 257 uint32_t props;
michael@0 258 GET_PROPS(c, props);
michael@0 259 /* comparing ==0 returns FALSE for the categories mentioned */
michael@0 260 return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
michael@0 261 }
michael@0 262
michael@0 263 /**
michael@0 264 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
michael@0 265 * Implements UCHAR_POSIX_PRINT.
michael@0 266 * @internal
michael@0 267 */
michael@0 268 U_CFUNC UBool
michael@0 269 u_isprintPOSIX(UChar32 c) {
michael@0 270 uint32_t props;
michael@0 271 GET_PROPS(c, props);
michael@0 272 /*
michael@0 273 * The only cntrl character in graph+blank is TAB (in blank).
michael@0 274 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
michael@0 275 */
michael@0 276 return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c));
michael@0 277 }
michael@0 278
michael@0 279 U_CAPI UBool U_EXPORT2
michael@0 280 u_isgraph(UChar32 c) {
michael@0 281 uint32_t props;
michael@0 282 GET_PROPS(c, props);
michael@0 283 /* comparing ==0 returns FALSE for the categories mentioned */
michael@0 284 return (UBool)((CAT_MASK(props)&
michael@0 285 (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
michael@0 286 ==0);
michael@0 287 }
michael@0 288
michael@0 289 /**
michael@0 290 * Checks if c is in
michael@0 291 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
michael@0 292 * with space=\p{Whitespace} and Control=Cc.
michael@0 293 * Implements UCHAR_POSIX_GRAPH.
michael@0 294 * @internal
michael@0 295 */
michael@0 296 U_CFUNC UBool
michael@0 297 u_isgraphPOSIX(UChar32 c) {
michael@0 298 uint32_t props;
michael@0 299 GET_PROPS(c, props);
michael@0 300 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
michael@0 301 /* comparing ==0 returns FALSE for the categories mentioned */
michael@0 302 return (UBool)((CAT_MASK(props)&
michael@0 303 (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
michael@0 304 ==0);
michael@0 305 }
michael@0 306
michael@0 307 U_CAPI UBool U_EXPORT2
michael@0 308 u_ispunct(UChar32 c) {
michael@0 309 uint32_t props;
michael@0 310 GET_PROPS(c, props);
michael@0 311 return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
michael@0 312 }
michael@0 313
michael@0 314 /* Checks if the Unicode character can start a Unicode identifier.*/
michael@0 315 U_CAPI UBool U_EXPORT2
michael@0 316 u_isIDStart(UChar32 c) {
michael@0 317 /* same as u_isalpha() */
michael@0 318 uint32_t props;
michael@0 319 GET_PROPS(c, props);
michael@0 320 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0);
michael@0 321 }
michael@0 322
michael@0 323 /* Checks if the Unicode character can be a Unicode identifier part other than starting the
michael@0 324 identifier.*/
michael@0 325 U_CAPI UBool U_EXPORT2
michael@0 326 u_isIDPart(UChar32 c) {
michael@0 327 uint32_t props;
michael@0 328 GET_PROPS(c, props);
michael@0 329 return (UBool)(
michael@0 330 (CAT_MASK(props)&
michael@0 331 (U_GC_ND_MASK|U_GC_NL_MASK|
michael@0 332 U_GC_L_MASK|
michael@0 333 U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK)
michael@0 334 )!=0 ||
michael@0 335 u_isIDIgnorable(c));
michael@0 336 }
michael@0 337
michael@0 338 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
michael@0 339 U_CAPI UBool U_EXPORT2
michael@0 340 u_isIDIgnorable(UChar32 c) {
michael@0 341 if(c<=0x9f) {
michael@0 342 return u_isISOControl(c) && !IS_THAT_ASCII_CONTROL_SPACE(c);
michael@0 343 } else {
michael@0 344 uint32_t props;
michael@0 345 GET_PROPS(c, props);
michael@0 346 return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR);
michael@0 347 }
michael@0 348 }
michael@0 349
michael@0 350 /*Checks if the Unicode character can start a Java identifier.*/
michael@0 351 U_CAPI UBool U_EXPORT2
michael@0 352 u_isJavaIDStart(UChar32 c) {
michael@0 353 uint32_t props;
michael@0 354 GET_PROPS(c, props);
michael@0 355 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0);
michael@0 356 }
michael@0 357
michael@0 358 /*Checks if the Unicode character can be a Java identifier part other than starting the
michael@0 359 * identifier.
michael@0 360 */
michael@0 361 U_CAPI UBool U_EXPORT2
michael@0 362 u_isJavaIDPart(UChar32 c) {
michael@0 363 uint32_t props;
michael@0 364 GET_PROPS(c, props);
michael@0 365 return (UBool)(
michael@0 366 (CAT_MASK(props)&
michael@0 367 (U_GC_ND_MASK|U_GC_NL_MASK|
michael@0 368 U_GC_L_MASK|
michael@0 369 U_GC_SC_MASK|U_GC_PC_MASK|
michael@0 370 U_GC_MC_MASK|U_GC_MN_MASK)
michael@0 371 )!=0 ||
michael@0 372 u_isIDIgnorable(c));
michael@0 373 }
michael@0 374
michael@0 375 U_CAPI int32_t U_EXPORT2
michael@0 376 u_charDigitValue(UChar32 c) {
michael@0 377 uint32_t props;
michael@0 378 int32_t value;
michael@0 379 GET_PROPS(c, props);
michael@0 380 value=(int32_t)GET_NUMERIC_TYPE_VALUE(props)-UPROPS_NTV_DECIMAL_START;
michael@0 381 if(value<=9) {
michael@0 382 return value;
michael@0 383 } else {
michael@0 384 return -1;
michael@0 385 }
michael@0 386 }
michael@0 387
michael@0 388 U_CAPI double U_EXPORT2
michael@0 389 u_getNumericValue(UChar32 c) {
michael@0 390 uint32_t props;
michael@0 391 int32_t ntv;
michael@0 392 GET_PROPS(c, props);
michael@0 393 ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(props);
michael@0 394
michael@0 395 if(ntv==UPROPS_NTV_NONE) {
michael@0 396 return U_NO_NUMERIC_VALUE;
michael@0 397 } else if(ntv<UPROPS_NTV_DIGIT_START) {
michael@0 398 /* decimal digit */
michael@0 399 return ntv-UPROPS_NTV_DECIMAL_START;
michael@0 400 } else if(ntv<UPROPS_NTV_NUMERIC_START) {
michael@0 401 /* other digit */
michael@0 402 return ntv-UPROPS_NTV_DIGIT_START;
michael@0 403 } else if(ntv<UPROPS_NTV_FRACTION_START) {
michael@0 404 /* small integer */
michael@0 405 return ntv-UPROPS_NTV_NUMERIC_START;
michael@0 406 } else if(ntv<UPROPS_NTV_LARGE_START) {
michael@0 407 /* fraction */
michael@0 408 int32_t numerator=(ntv>>4)-12;
michael@0 409 int32_t denominator=(ntv&0xf)+1;
michael@0 410 return (double)numerator/denominator;
michael@0 411 } else if(ntv<UPROPS_NTV_BASE60_START) {
michael@0 412 /* large, single-significant-digit integer */
michael@0 413 double numValue;
michael@0 414 int32_t mant=(ntv>>5)-14;
michael@0 415 int32_t exp=(ntv&0x1f)+2;
michael@0 416 numValue=mant;
michael@0 417
michael@0 418 /* multiply by 10^exp without math.h */
michael@0 419 while(exp>=4) {
michael@0 420 numValue*=10000.;
michael@0 421 exp-=4;
michael@0 422 }
michael@0 423 switch(exp) {
michael@0 424 case 3:
michael@0 425 numValue*=1000.;
michael@0 426 break;
michael@0 427 case 2:
michael@0 428 numValue*=100.;
michael@0 429 break;
michael@0 430 case 1:
michael@0 431 numValue*=10.;
michael@0 432 break;
michael@0 433 case 0:
michael@0 434 default:
michael@0 435 break;
michael@0 436 }
michael@0 437
michael@0 438 return numValue;
michael@0 439 } else if(ntv<UPROPS_NTV_RESERVED_START) {
michael@0 440 /* sexagesimal (base 60) integer */
michael@0 441 int32_t numValue=(ntv>>2)-0xbf;
michael@0 442 int32_t exp=(ntv&3)+1;
michael@0 443
michael@0 444 switch(exp) {
michael@0 445 case 4:
michael@0 446 numValue*=60*60*60*60;
michael@0 447 break;
michael@0 448 case 3:
michael@0 449 numValue*=60*60*60;
michael@0 450 break;
michael@0 451 case 2:
michael@0 452 numValue*=60*60;
michael@0 453 break;
michael@0 454 case 1:
michael@0 455 numValue*=60;
michael@0 456 break;
michael@0 457 case 0:
michael@0 458 default:
michael@0 459 break;
michael@0 460 }
michael@0 461
michael@0 462 return numValue;
michael@0 463 } else {
michael@0 464 /* reserved */
michael@0 465 return U_NO_NUMERIC_VALUE;
michael@0 466 }
michael@0 467 }
michael@0 468
michael@0 469 U_CAPI int32_t U_EXPORT2
michael@0 470 u_digit(UChar32 ch, int8_t radix) {
michael@0 471 int8_t value;
michael@0 472 if((uint8_t)(radix-2)<=(36-2)) {
michael@0 473 value=(int8_t)u_charDigitValue(ch);
michael@0 474 if(value<0) {
michael@0 475 /* ch is not a decimal digit, try latin letters */
michael@0 476 if(ch>=0x61 && ch<=0x7A) {
michael@0 477 value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */
michael@0 478 } else if(ch>=0x41 && ch<=0x5A) {
michael@0 479 value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */
michael@0 480 } else if(ch>=0xFF41 && ch<=0xFF5A) {
michael@0 481 value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */
michael@0 482 } else if(ch>=0xFF21 && ch<=0xFF3A) {
michael@0 483 value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */
michael@0 484 }
michael@0 485 }
michael@0 486 } else {
michael@0 487 value=-1; /* invalid radix */
michael@0 488 }
michael@0 489 return (int8_t)((value<radix) ? value : -1);
michael@0 490 }
michael@0 491
michael@0 492 U_CAPI UChar32 U_EXPORT2
michael@0 493 u_forDigit(int32_t digit, int8_t radix) {
michael@0 494 if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) {
michael@0 495 return 0;
michael@0 496 } else if(digit<10) {
michael@0 497 return (UChar32)(0x30+digit);
michael@0 498 } else {
michael@0 499 return (UChar32)((0x61-10)+digit);
michael@0 500 }
michael@0 501 }
michael@0 502
michael@0 503 /* miscellaneous, and support for uprops.cpp -------------------------------- */
michael@0 504
michael@0 505 U_CAPI void U_EXPORT2
michael@0 506 u_getUnicodeVersion(UVersionInfo versionArray) {
michael@0 507 if(versionArray!=NULL) {
michael@0 508 uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH);
michael@0 509 }
michael@0 510 }
michael@0 511
michael@0 512 U_CFUNC uint32_t
michael@0 513 u_getMainProperties(UChar32 c) {
michael@0 514 uint32_t props;
michael@0 515 GET_PROPS(c, props);
michael@0 516 return props;
michael@0 517 }
michael@0 518
michael@0 519 U_CFUNC uint32_t
michael@0 520 u_getUnicodeProperties(UChar32 c, int32_t column) {
michael@0 521 U_ASSERT(column>=0);
michael@0 522 if(column>=propsVectorsColumns) {
michael@0 523 return 0;
michael@0 524 } else {
michael@0 525 uint16_t vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
michael@0 526 return propsVectors[vecIndex+column];
michael@0 527 }
michael@0 528 }
michael@0 529
michael@0 530 U_CFUNC int32_t
michael@0 531 uprv_getMaxValues(int32_t column) {
michael@0 532 switch(column) {
michael@0 533 case 0:
michael@0 534 return indexes[UPROPS_MAX_VALUES_INDEX];
michael@0 535 case 2:
michael@0 536 return indexes[UPROPS_MAX_VALUES_2_INDEX];
michael@0 537 default:
michael@0 538 return 0;
michael@0 539 }
michael@0 540 }
michael@0 541
michael@0 542 U_CAPI void U_EXPORT2
michael@0 543 u_charAge(UChar32 c, UVersionInfo versionArray) {
michael@0 544 if(versionArray!=NULL) {
michael@0 545 uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT;
michael@0 546 versionArray[0]=(uint8_t)(version>>4);
michael@0 547 versionArray[1]=(uint8_t)(version&0xf);
michael@0 548 versionArray[2]=versionArray[3]=0;
michael@0 549 }
michael@0 550 }
michael@0 551
michael@0 552 U_CAPI UScriptCode U_EXPORT2
michael@0 553 uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
michael@0 554 uint32_t scriptX;
michael@0 555 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 556 return USCRIPT_INVALID_CODE;
michael@0 557 }
michael@0 558 if((uint32_t)c>0x10ffff) {
michael@0 559 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 560 return USCRIPT_INVALID_CODE;
michael@0 561 }
michael@0 562 scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
michael@0 563 if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
michael@0 564 return (UScriptCode)scriptX;
michael@0 565 } else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
michael@0 566 return USCRIPT_COMMON;
michael@0 567 } else if(scriptX<UPROPS_SCRIPT_X_WITH_OTHER) {
michael@0 568 return USCRIPT_INHERITED;
michael@0 569 } else {
michael@0 570 return (UScriptCode)scriptExtensions[scriptX&UPROPS_SCRIPT_MASK];
michael@0 571 }
michael@0 572 }
michael@0 573
michael@0 574 U_CAPI UBool U_EXPORT2
michael@0 575 uscript_hasScript(UChar32 c, UScriptCode sc) {
michael@0 576 const uint16_t *scx;
michael@0 577 uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
michael@0 578 if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
michael@0 579 return sc==(UScriptCode)scriptX;
michael@0 580 }
michael@0 581
michael@0 582 scx=scriptExtensions+(scriptX&UPROPS_SCRIPT_MASK);
michael@0 583 if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
michael@0 584 scx=scriptExtensions+scx[1];
michael@0 585 }
michael@0 586 if(sc>=USCRIPT_CODE_LIMIT) {
michael@0 587 /* Guard against bogus input that would make us go past the Script_Extensions terminator. */
michael@0 588 return FALSE;
michael@0 589 }
michael@0 590 while(sc>*scx) {
michael@0 591 ++scx;
michael@0 592 }
michael@0 593 return sc==(*scx&0x7fff);
michael@0 594 }
michael@0 595
michael@0 596 U_CAPI int32_t U_EXPORT2
michael@0 597 uscript_getScriptExtensions(UChar32 c,
michael@0 598 UScriptCode *scripts, int32_t capacity,
michael@0 599 UErrorCode *pErrorCode) {
michael@0 600 uint32_t scriptX;
michael@0 601 int32_t length;
michael@0 602 const uint16_t *scx;
michael@0 603 uint16_t sx;
michael@0 604 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 605 return 0;
michael@0 606 }
michael@0 607 if(capacity<0 || (capacity>0 && scripts==NULL)) {
michael@0 608 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 609 return 0;
michael@0 610 }
michael@0 611 scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
michael@0 612 if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
michael@0 613 if(capacity==0) {
michael@0 614 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 615 } else {
michael@0 616 scripts[0]=(UScriptCode)scriptX;
michael@0 617 }
michael@0 618 return 1;
michael@0 619 }
michael@0 620
michael@0 621 scx=scriptExtensions+(scriptX&UPROPS_SCRIPT_MASK);
michael@0 622 if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
michael@0 623 scx=scriptExtensions+scx[1];
michael@0 624 }
michael@0 625 length=0;
michael@0 626 do {
michael@0 627 sx=*scx++;
michael@0 628 if(length<capacity) {
michael@0 629 scripts[length]=(UScriptCode)(sx&0x7fff);
michael@0 630 }
michael@0 631 ++length;
michael@0 632 } while(sx<0x8000);
michael@0 633 if(length>capacity) {
michael@0 634 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 635 }
michael@0 636 return length;
michael@0 637 }
michael@0 638
michael@0 639 U_CAPI UBlockCode U_EXPORT2
michael@0 640 ublock_getCode(UChar32 c) {
michael@0 641 return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT);
michael@0 642 }
michael@0 643
michael@0 644 /* property starts for UnicodeSet ------------------------------------------- */
michael@0 645
michael@0 646 static UBool U_CALLCONV
michael@0 647 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
michael@0 648 /* add the start code point to the USet */
michael@0 649 const USetAdder *sa=(const USetAdder *)context;
michael@0 650 sa->add(sa->set, start);
michael@0 651 return TRUE;
michael@0 652 }
michael@0 653
/*
 * Adds code point cp and its successor cp+1 to the set behind sa.
 * Wrapped in do { } while(0) so the two calls act as one statement
 * (e.g. under an unbraced if); the caller supplies the ';'.
 * Both arguments are evaluated more than once.
 */
#define USET_ADD_CP_AND_NEXT(sa, cp) \
    do { \
        (sa)->add((sa)->set, (cp)); \
        (sa)->add((sa)->set, (cp)+1); \
    } while(0)
michael@0 655
michael@0 656 U_CFUNC void U_EXPORT2
michael@0 657 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
michael@0 658 if(U_FAILURE(*pErrorCode)) {
michael@0 659 return;
michael@0 660 }
michael@0 661
michael@0 662 /* add the start code point of each same-value range of the main trie */
michael@0 663 utrie2_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa);
michael@0 664
michael@0 665 /* add code points with hardcoded properties, plus the ones following them */
michael@0 666
michael@0 667 /* add for u_isblank() */
michael@0 668 USET_ADD_CP_AND_NEXT(sa, TAB);
michael@0 669
michael@0 670 /* add for IS_THAT_CONTROL_SPACE() */
michael@0 671 sa->add(sa->set, CR+1); /* range TAB..CR */
michael@0 672 sa->add(sa->set, 0x1c);
michael@0 673 sa->add(sa->set, 0x1f+1);
michael@0 674 USET_ADD_CP_AND_NEXT(sa, NL);
michael@0 675
michael@0 676 /* add for u_isIDIgnorable() what was not added above */
michael@0 677 sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */
michael@0 678 sa->add(sa->set, HAIRSP);
michael@0 679 sa->add(sa->set, RLM+1);
michael@0 680 sa->add(sa->set, INHSWAP);
michael@0 681 sa->add(sa->set, NOMDIG+1);
michael@0 682 USET_ADD_CP_AND_NEXT(sa, ZWNBSP);
michael@0 683
michael@0 684 /* add no-break spaces for u_isWhitespace() what was not added above */
michael@0 685 USET_ADD_CP_AND_NEXT(sa, NBSP);
michael@0 686 USET_ADD_CP_AND_NEXT(sa, FIGURESP);
michael@0 687 USET_ADD_CP_AND_NEXT(sa, NNBSP);
michael@0 688
michael@0 689 /* add for u_digit() */
michael@0 690 sa->add(sa->set, U_a);
michael@0 691 sa->add(sa->set, U_z+1);
michael@0 692 sa->add(sa->set, U_A);
michael@0 693 sa->add(sa->set, U_Z+1);
michael@0 694 sa->add(sa->set, U_FW_a);
michael@0 695 sa->add(sa->set, U_FW_z+1);
michael@0 696 sa->add(sa->set, U_FW_A);
michael@0 697 sa->add(sa->set, U_FW_Z+1);
michael@0 698
michael@0 699 /* add for u_isxdigit() */
michael@0 700 sa->add(sa->set, U_f+1);
michael@0 701 sa->add(sa->set, U_F+1);
michael@0 702 sa->add(sa->set, U_FW_f+1);
michael@0 703 sa->add(sa->set, U_FW_F+1);
michael@0 704
michael@0 705 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
michael@0 706 sa->add(sa->set, WJ); /* range WJ..NOMDIG */
michael@0 707 sa->add(sa->set, 0xfff0);
michael@0 708 sa->add(sa->set, 0xfffb+1);
michael@0 709 sa->add(sa->set, 0xe0000);
michael@0 710 sa->add(sa->set, 0xe0fff+1);
michael@0 711
michael@0 712 /* add for UCHAR_GRAPHEME_BASE and others */
michael@0 713 USET_ADD_CP_AND_NEXT(sa, CGJ);
michael@0 714 }
michael@0 715
michael@0 716 U_CFUNC void U_EXPORT2
michael@0 717 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
michael@0 718 if(U_FAILURE(*pErrorCode)) {
michael@0 719 return;
michael@0 720 }
michael@0 721
michael@0 722 /* add the start code point of each same-value range of the properties vectors trie */
michael@0 723 if(propsVectorsColumns>0) {
michael@0 724 /* if propsVectorsColumns==0 then the properties vectors trie may not be there at all */
michael@0 725 utrie2_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, sa);
michael@0 726 }
michael@0 727 }

mercurial