intl/icu/source/common/uchar.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/uchar.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,727 @@
     1.4 +/*
     1.5 +********************************************************************************
     1.6 +*   Copyright (C) 1996-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +********************************************************************************
     1.9 +*
    1.10 +* File UCHAR.C
    1.11 +*
    1.12 +* Modification History:
    1.13 +*
    1.14 +*   Date        Name        Description
    1.15 +*   04/02/97    aliu        Creation.
    1.16 +*   4/15/99     Madhu       Updated all the function definitions for C Implementation
    1.17 +*   5/20/99     Madhu       Added the function u_getVersion()
    1.18 +*   8/19/1999   srl         Upgraded scripts to Unicode3.0 
    1.19 +*   11/11/1999  weiv        added u_isalnum(), cleaned comments
    1.20 +*   01/11/2000  helena      Renamed u_getVersion to u_getUnicodeVersion.
    1.21 +*   06/20/2000  helena      OS/400 port changes; mostly typecast.
    1.22 +******************************************************************************
    1.23 +*/
    1.24 +
    1.25 +#include "unicode/utypes.h"
    1.26 +#include "unicode/uchar.h"
    1.27 +#include "unicode/uscript.h"
    1.28 +#include "unicode/udata.h"
    1.29 +#include "uassert.h"
    1.30 +#include "cmemory.h"
    1.31 +#include "ucln_cmn.h"
    1.32 +#include "utrie2.h"
    1.33 +#include "udataswp.h"
    1.34 +#include "uprops.h"
    1.35 +#include "ustr_imp.h"
    1.36 +
    1.37 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    1.38 +
    1.39 +/* uchar_props_data.h is machine-generated by genprops --csource */
    1.40 +#define INCLUDED_FROM_UCHAR_C
    1.41 +#include "uchar_props_data.h"
    1.42 +
    1.43 +/* constants and macros for access to the data ------------------------------ */
    1.44 +
    1.45 +/* getting a uint32_t properties word from the data */
    1.46 +#define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c));
    1.47 +
    1.48 +U_CFUNC UBool
    1.49 +uprv_haveProperties(UErrorCode *pErrorCode) {
    1.50 +    if(U_FAILURE(*pErrorCode)) {
    1.51 +        return FALSE;
    1.52 +    }
    1.53 +    return TRUE;
    1.54 +}
    1.55 +
    1.56 +/* API functions ------------------------------------------------------------ */
    1.57 +
    1.58 +/* Gets the Unicode character's general category.*/
    1.59 +U_CAPI int8_t U_EXPORT2
    1.60 +u_charType(UChar32 c) {
    1.61 +    uint32_t props;
    1.62 +    GET_PROPS(c, props);
    1.63 +    return (int8_t)GET_CATEGORY(props);
    1.64 +}
    1.65 +
    1.66 +/* Enumerate all code points with their general categories. */
    1.67 +struct _EnumTypeCallback {
    1.68 +    UCharEnumTypeRange *enumRange;
    1.69 +    const void *context;
    1.70 +};
    1.71 +
    1.72 +static uint32_t U_CALLCONV
    1.73 +_enumTypeValue(const void *context, uint32_t value) {
    1.74 +    return GET_CATEGORY(value);
    1.75 +}
    1.76 +
    1.77 +static UBool U_CALLCONV
    1.78 +_enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    1.79 +    /* just cast the value to UCharCategory */
    1.80 +    return ((struct _EnumTypeCallback *)context)->
    1.81 +        enumRange(((struct _EnumTypeCallback *)context)->context,
    1.82 +                  start, end+1, (UCharCategory)value);
    1.83 +}
    1.84 +
    1.85 +U_CAPI void U_EXPORT2
    1.86 +u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) {
    1.87 +    struct _EnumTypeCallback callback;
    1.88 +
    1.89 +    if(enumRange==NULL) {
    1.90 +        return;
    1.91 +    }
    1.92 +
    1.93 +    callback.enumRange=enumRange;
    1.94 +    callback.context=context;
    1.95 +    utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback);
    1.96 +}
    1.97 +
    1.98 +/* Checks if ch is a lower case letter.*/
    1.99 +U_CAPI UBool U_EXPORT2
   1.100 +u_islower(UChar32 c) {
   1.101 +    uint32_t props;
   1.102 +    GET_PROPS(c, props);
   1.103 +    return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER);
   1.104 +}
   1.105 +
   1.106 +/* Checks if ch is an upper case letter.*/
   1.107 +U_CAPI UBool U_EXPORT2
   1.108 +u_isupper(UChar32 c) {
   1.109 +    uint32_t props;
   1.110 +    GET_PROPS(c, props);
   1.111 +    return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER);
   1.112 +}
   1.113 +
   1.114 +/* Checks if ch is a title case letter; usually upper case letters.*/
   1.115 +U_CAPI UBool U_EXPORT2
   1.116 +u_istitle(UChar32 c) {
   1.117 +    uint32_t props;
   1.118 +    GET_PROPS(c, props);
   1.119 +    return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER);
   1.120 +}
   1.121 +
   1.122 +/* Checks if ch is a decimal digit. */
   1.123 +U_CAPI UBool U_EXPORT2
   1.124 +u_isdigit(UChar32 c) {
   1.125 +    uint32_t props;
   1.126 +    GET_PROPS(c, props);
   1.127 +    return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
   1.128 +}
   1.129 +
   1.130 +U_CAPI UBool U_EXPORT2
   1.131 +u_isxdigit(UChar32 c) {
   1.132 +    uint32_t props;
   1.133 +
   1.134 +    /* check ASCII and Fullwidth ASCII a-fA-F */
   1.135 +    if(
   1.136 +        (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
   1.137 +        (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
   1.138 +    ) {
   1.139 +        return TRUE;
   1.140 +    }
   1.141 +
   1.142 +    GET_PROPS(c, props);
   1.143 +    return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER);
   1.144 +}
   1.145 +
   1.146 +/* Checks if the Unicode character is a letter.*/
   1.147 +U_CAPI UBool U_EXPORT2
   1.148 +u_isalpha(UChar32 c) {
   1.149 +    uint32_t props;
   1.150 +    GET_PROPS(c, props);
   1.151 +    return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0);
   1.152 +}
   1.153 +
   1.154 +U_CAPI UBool U_EXPORT2
   1.155 +u_isUAlphabetic(UChar32 c) {
   1.156 +    return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0;
   1.157 +}
   1.158 +
   1.159 +/* Checks if c is a letter or a decimal digit */
   1.160 +U_CAPI UBool U_EXPORT2
   1.161 +u_isalnum(UChar32 c) {
   1.162 +    uint32_t props;
   1.163 +    GET_PROPS(c, props);
   1.164 +    return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0);
   1.165 +}
   1.166 +
   1.167 +/**
   1.168 + * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM.
   1.169 + * @internal
   1.170 + */
   1.171 +U_CFUNC UBool
   1.172 +u_isalnumPOSIX(UChar32 c) {
   1.173 +    return (UBool)(u_isUAlphabetic(c) || u_isdigit(c));
   1.174 +}
   1.175 +
   1.176 +/* Checks if ch is a unicode character with assigned character type.*/
   1.177 +U_CAPI UBool U_EXPORT2
   1.178 +u_isdefined(UChar32 c) {
   1.179 +    uint32_t props;
   1.180 +    GET_PROPS(c, props);
   1.181 +    return (UBool)(GET_CATEGORY(props)!=0);
   1.182 +}
   1.183 +
   1.184 +/* Checks if the Unicode character is a base form character that can take a diacritic.*/
   1.185 +U_CAPI UBool U_EXPORT2
   1.186 +u_isbase(UChar32 c) {
   1.187 +    uint32_t props;
   1.188 +    GET_PROPS(c, props);
   1.189 +    return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0);
   1.190 +}
   1.191 +
   1.192 +/* Checks if the Unicode character is a control character.*/
   1.193 +U_CAPI UBool U_EXPORT2
   1.194 +u_iscntrl(UChar32 c) {
   1.195 +    uint32_t props;
   1.196 +    GET_PROPS(c, props);
   1.197 +    return (UBool)((CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0);
   1.198 +}
   1.199 +
   1.200 +U_CAPI UBool U_EXPORT2
   1.201 +u_isISOControl(UChar32 c) {
   1.202 +    return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f);
   1.203 +}
   1.204 +
   1.205 +/* Some control characters that are used as space. */
   1.206 +#define IS_THAT_CONTROL_SPACE(c) \
   1.207 +    (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==NL))
   1.208 +
   1.209 +/* Java has decided that U+0085 New Line is not whitespace any more. */
   1.210 +#define IS_THAT_ASCII_CONTROL_SPACE(c) \
   1.211 +    (c<=0x1f && c>=TAB && (c<=CR || c>=0x1c))
   1.212 +
   1.213 +/* Checks if the Unicode character is a space character.*/
   1.214 +U_CAPI UBool U_EXPORT2
   1.215 +u_isspace(UChar32 c) {
   1.216 +    uint32_t props;
   1.217 +    GET_PROPS(c, props);
   1.218 +    return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c));
   1.219 +}
   1.220 +
   1.221 +U_CAPI UBool U_EXPORT2
   1.222 +u_isJavaSpaceChar(UChar32 c) {
   1.223 +    uint32_t props;
   1.224 +    GET_PROPS(c, props);
   1.225 +    return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0);
   1.226 +}
   1.227 +
   1.228 +/* Checks if the Unicode character is a whitespace character.*/
   1.229 +U_CAPI UBool U_EXPORT2
   1.230 +u_isWhitespace(UChar32 c) {
   1.231 +    uint32_t props;
   1.232 +    GET_PROPS(c, props);
   1.233 +    return (UBool)(
   1.234 +                ((CAT_MASK(props)&U_GC_Z_MASK)!=0 &&
   1.235 +                    c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */
   1.236 +                IS_THAT_ASCII_CONTROL_SPACE(c)
   1.237 +           );
   1.238 +}
   1.239 +
   1.240 +U_CAPI UBool U_EXPORT2
   1.241 +u_isblank(UChar32 c) {
   1.242 +    if((uint32_t)c<=0x9f) {
   1.243 +        return c==9 || c==0x20; /* TAB or SPACE */
   1.244 +    } else {
   1.245 +        /* Zs */
   1.246 +        uint32_t props;
   1.247 +        GET_PROPS(c, props);
   1.248 +        return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR);
   1.249 +    }
   1.250 +}
   1.251 +
   1.252 +U_CAPI UBool U_EXPORT2
   1.253 +u_isUWhiteSpace(UChar32 c) {
   1.254 +    return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0;
   1.255 +}
   1.256 +
   1.257 +/* Checks if the Unicode character is printable.*/
   1.258 +U_CAPI UBool U_EXPORT2
   1.259 +u_isprint(UChar32 c) {
   1.260 +    uint32_t props;
   1.261 +    GET_PROPS(c, props);
   1.262 +    /* comparing ==0 returns FALSE for the categories mentioned */
   1.263 +    return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0);
   1.264 +}
   1.265 +
   1.266 +/**
   1.267 + * Checks if c is in \p{graph}\p{blank} - \p{cntrl}.
   1.268 + * Implements UCHAR_POSIX_PRINT.
   1.269 + * @internal
   1.270 + */
   1.271 +U_CFUNC UBool
   1.272 +u_isprintPOSIX(UChar32 c) {
   1.273 +    uint32_t props;
   1.274 +    GET_PROPS(c, props);
   1.275 +    /*
   1.276 +     * The only cntrl character in graph+blank is TAB (in blank).
   1.277 +     * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
   1.278 +     */
   1.279 +    return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c));
   1.280 +}
   1.281 +
   1.282 +U_CAPI UBool U_EXPORT2
   1.283 +u_isgraph(UChar32 c) {
   1.284 +    uint32_t props;
   1.285 +    GET_PROPS(c, props);
   1.286 +    /* comparing ==0 returns FALSE for the categories mentioned */
   1.287 +    return (UBool)((CAT_MASK(props)&
   1.288 +                    (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
   1.289 +                   ==0);
   1.290 +}
   1.291 +
   1.292 +/**
   1.293 + * Checks if c is in
   1.294 + * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
   1.295 + * with space=\p{Whitespace} and Control=Cc.
   1.296 + * Implements UCHAR_POSIX_GRAPH.
   1.297 + * @internal
   1.298 + */
   1.299 +U_CFUNC UBool
   1.300 +u_isgraphPOSIX(UChar32 c) {
   1.301 +    uint32_t props;
   1.302 +    GET_PROPS(c, props);
   1.303 +    /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
   1.304 +    /* comparing ==0 returns FALSE for the categories mentioned */
   1.305 +    return (UBool)((CAT_MASK(props)&
   1.306 +                    (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK))
   1.307 +                   ==0);
   1.308 +}
   1.309 +
   1.310 +U_CAPI UBool U_EXPORT2
   1.311 +u_ispunct(UChar32 c) {
   1.312 +    uint32_t props;
   1.313 +    GET_PROPS(c, props);
   1.314 +    return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0);
   1.315 +}
   1.316 +
   1.317 +/* Checks if the Unicode character can start a Unicode identifier.*/
   1.318 +U_CAPI UBool U_EXPORT2
   1.319 +u_isIDStart(UChar32 c) {
   1.320 +    /* same as u_isalpha() */
   1.321 +    uint32_t props;
   1.322 +    GET_PROPS(c, props);
   1.323 +    return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0);
   1.324 +}
   1.325 +
   1.326 +/* Checks if the Unicode character can be a Unicode identifier part other than starting the
   1.327 + identifier.*/
   1.328 +U_CAPI UBool U_EXPORT2
   1.329 +u_isIDPart(UChar32 c) {
   1.330 +    uint32_t props;
   1.331 +    GET_PROPS(c, props);
   1.332 +    return (UBool)(
   1.333 +           (CAT_MASK(props)&
   1.334 +            (U_GC_ND_MASK|U_GC_NL_MASK|
   1.335 +             U_GC_L_MASK|
   1.336 +             U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK)
   1.337 +           )!=0 ||
   1.338 +           u_isIDIgnorable(c));
   1.339 +}
   1.340 +
   1.341 +/*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
   1.342 +U_CAPI UBool U_EXPORT2
   1.343 +u_isIDIgnorable(UChar32 c) {
   1.344 +    if(c<=0x9f) {
   1.345 +        return u_isISOControl(c) && !IS_THAT_ASCII_CONTROL_SPACE(c);
   1.346 +    } else {
   1.347 +        uint32_t props;
   1.348 +        GET_PROPS(c, props);
   1.349 +        return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR);
   1.350 +    }
   1.351 +}
   1.352 +
   1.353 +/*Checks if the Unicode character can start a Java identifier.*/
   1.354 +U_CAPI UBool U_EXPORT2
   1.355 +u_isJavaIDStart(UChar32 c) {
   1.356 +    uint32_t props;
   1.357 +    GET_PROPS(c, props);
   1.358 +    return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0);
   1.359 +}
   1.360 +
   1.361 +/*Checks if the Unicode character can be a Java identifier part other than starting the
   1.362 + * identifier.
   1.363 + */
   1.364 +U_CAPI UBool U_EXPORT2
   1.365 +u_isJavaIDPart(UChar32 c) {
   1.366 +    uint32_t props;
   1.367 +    GET_PROPS(c, props);
   1.368 +    return (UBool)(
   1.369 +           (CAT_MASK(props)&
   1.370 +            (U_GC_ND_MASK|U_GC_NL_MASK|
   1.371 +             U_GC_L_MASK|
   1.372 +             U_GC_SC_MASK|U_GC_PC_MASK|
   1.373 +             U_GC_MC_MASK|U_GC_MN_MASK)
   1.374 +           )!=0 ||
   1.375 +           u_isIDIgnorable(c));
   1.376 +}
   1.377 +
   1.378 +U_CAPI int32_t U_EXPORT2
   1.379 +u_charDigitValue(UChar32 c) {
   1.380 +    uint32_t props;
   1.381 +    int32_t value;
   1.382 +    GET_PROPS(c, props);
   1.383 +    value=(int32_t)GET_NUMERIC_TYPE_VALUE(props)-UPROPS_NTV_DECIMAL_START;
   1.384 +    if(value<=9) {
   1.385 +        return value;
   1.386 +    } else {
   1.387 +        return -1;
   1.388 +    }
   1.389 +}
   1.390 +
   1.391 +U_CAPI double U_EXPORT2
   1.392 +u_getNumericValue(UChar32 c) {
   1.393 +    uint32_t props;
   1.394 +    int32_t ntv;
   1.395 +    GET_PROPS(c, props);
   1.396 +    ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(props);
   1.397 +
   1.398 +    if(ntv==UPROPS_NTV_NONE) {
   1.399 +        return U_NO_NUMERIC_VALUE;
   1.400 +    } else if(ntv<UPROPS_NTV_DIGIT_START) {
   1.401 +        /* decimal digit */
   1.402 +        return ntv-UPROPS_NTV_DECIMAL_START;
   1.403 +    } else if(ntv<UPROPS_NTV_NUMERIC_START) {
   1.404 +        /* other digit */
   1.405 +        return ntv-UPROPS_NTV_DIGIT_START;
   1.406 +    } else if(ntv<UPROPS_NTV_FRACTION_START) {
   1.407 +        /* small integer */
   1.408 +        return ntv-UPROPS_NTV_NUMERIC_START;
   1.409 +    } else if(ntv<UPROPS_NTV_LARGE_START) {
   1.410 +        /* fraction */
   1.411 +        int32_t numerator=(ntv>>4)-12;
   1.412 +        int32_t denominator=(ntv&0xf)+1;
   1.413 +        return (double)numerator/denominator;
   1.414 +    } else if(ntv<UPROPS_NTV_BASE60_START) {
   1.415 +        /* large, single-significant-digit integer */
   1.416 +        double numValue;
   1.417 +        int32_t mant=(ntv>>5)-14;
   1.418 +        int32_t exp=(ntv&0x1f)+2;
   1.419 +        numValue=mant;
   1.420 +
   1.421 +        /* multiply by 10^exp without math.h */
   1.422 +        while(exp>=4) {
   1.423 +            numValue*=10000.;
   1.424 +            exp-=4;
   1.425 +        }
   1.426 +        switch(exp) {
   1.427 +        case 3:
   1.428 +            numValue*=1000.;
   1.429 +            break;
   1.430 +        case 2:
   1.431 +            numValue*=100.;
   1.432 +            break;
   1.433 +        case 1:
   1.434 +            numValue*=10.;
   1.435 +            break;
   1.436 +        case 0:
   1.437 +        default:
   1.438 +            break;
   1.439 +        }
   1.440 +
   1.441 +        return numValue;
   1.442 +    } else if(ntv<UPROPS_NTV_RESERVED_START) {
   1.443 +        /* sexagesimal (base 60) integer */
   1.444 +        int32_t numValue=(ntv>>2)-0xbf;
   1.445 +        int32_t exp=(ntv&3)+1;
   1.446 +
   1.447 +        switch(exp) {
   1.448 +        case 4:
   1.449 +            numValue*=60*60*60*60;
   1.450 +            break;
   1.451 +        case 3:
   1.452 +            numValue*=60*60*60;
   1.453 +            break;
   1.454 +        case 2:
   1.455 +            numValue*=60*60;
   1.456 +            break;
   1.457 +        case 1:
   1.458 +            numValue*=60;
   1.459 +            break;
   1.460 +        case 0:
   1.461 +        default:
   1.462 +            break;
   1.463 +        }
   1.464 +
   1.465 +        return numValue;
   1.466 +    } else {
   1.467 +        /* reserved */
   1.468 +        return U_NO_NUMERIC_VALUE;
   1.469 +    }
   1.470 +}
   1.471 +
   1.472 +U_CAPI int32_t U_EXPORT2
   1.473 +u_digit(UChar32 ch, int8_t radix) {
   1.474 +    int8_t value;
   1.475 +    if((uint8_t)(radix-2)<=(36-2)) {
   1.476 +        value=(int8_t)u_charDigitValue(ch);
   1.477 +        if(value<0) {
   1.478 +            /* ch is not a decimal digit, try latin letters */
   1.479 +            if(ch>=0x61 && ch<=0x7A) {
   1.480 +                value=(int8_t)(ch-0x57);  /* ch - 'a' + 10 */
   1.481 +            } else if(ch>=0x41 && ch<=0x5A) {
   1.482 +                value=(int8_t)(ch-0x37);  /* ch - 'A' + 10 */
   1.483 +            } else if(ch>=0xFF41 && ch<=0xFF5A) {
   1.484 +                value=(int8_t)(ch-0xFF37);  /* fullwidth ASCII a-z */
   1.485 +            } else if(ch>=0xFF21 && ch<=0xFF3A) {
   1.486 +                value=(int8_t)(ch-0xFF17);  /* fullwidth ASCII A-Z */
   1.487 +            }
   1.488 +        }
   1.489 +    } else {
   1.490 +        value=-1;   /* invalid radix */
   1.491 +    }
   1.492 +    return (int8_t)((value<radix) ? value : -1);
   1.493 +}
   1.494 +
   1.495 +U_CAPI UChar32 U_EXPORT2
   1.496 +u_forDigit(int32_t digit, int8_t radix) {
   1.497 +    if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) {
   1.498 +        return 0;
   1.499 +    } else if(digit<10) {
   1.500 +        return (UChar32)(0x30+digit);
   1.501 +    } else {
   1.502 +        return (UChar32)((0x61-10)+digit);
   1.503 +    }
   1.504 +}
   1.505 +
   1.506 +/* miscellaneous, and support for uprops.cpp -------------------------------- */
   1.507 +
   1.508 +U_CAPI void U_EXPORT2
   1.509 +u_getUnicodeVersion(UVersionInfo versionArray) {
   1.510 +    if(versionArray!=NULL) {
   1.511 +        uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH);
   1.512 +    }
   1.513 +}
   1.514 +
   1.515 +U_CFUNC uint32_t
   1.516 +u_getMainProperties(UChar32 c) {
   1.517 +    uint32_t props;
   1.518 +    GET_PROPS(c, props);
   1.519 +    return props;
   1.520 +}
   1.521 +
   1.522 +U_CFUNC uint32_t
   1.523 +u_getUnicodeProperties(UChar32 c, int32_t column) {
   1.524 +    U_ASSERT(column>=0);
   1.525 +    if(column>=propsVectorsColumns) {
   1.526 +        return 0;
   1.527 +    } else {
   1.528 +        uint16_t vecIndex=UTRIE2_GET16(&propsVectorsTrie, c);
   1.529 +        return propsVectors[vecIndex+column];
   1.530 +    }
   1.531 +}
   1.532 +
   1.533 +U_CFUNC int32_t
   1.534 +uprv_getMaxValues(int32_t column) {
   1.535 +    switch(column) {
   1.536 +    case 0:
   1.537 +        return indexes[UPROPS_MAX_VALUES_INDEX];
   1.538 +    case 2:
   1.539 +        return indexes[UPROPS_MAX_VALUES_2_INDEX];
   1.540 +    default:
   1.541 +        return 0;
   1.542 +    }
   1.543 +}
   1.544 +
   1.545 +U_CAPI void U_EXPORT2
   1.546 +u_charAge(UChar32 c, UVersionInfo versionArray) {
   1.547 +    if(versionArray!=NULL) {
   1.548 +        uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT;
   1.549 +        versionArray[0]=(uint8_t)(version>>4);
   1.550 +        versionArray[1]=(uint8_t)(version&0xf);
   1.551 +        versionArray[2]=versionArray[3]=0;
   1.552 +    }
   1.553 +}
   1.554 +
   1.555 +U_CAPI UScriptCode U_EXPORT2
   1.556 +uscript_getScript(UChar32 c, UErrorCode *pErrorCode) {
   1.557 +    uint32_t scriptX;
   1.558 +    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1.559 +        return USCRIPT_INVALID_CODE;
   1.560 +    }
   1.561 +    if((uint32_t)c>0x10ffff) {
   1.562 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1.563 +        return USCRIPT_INVALID_CODE;
   1.564 +    }
   1.565 +    scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
   1.566 +    if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
   1.567 +        return (UScriptCode)scriptX;
   1.568 +    } else if(scriptX<UPROPS_SCRIPT_X_WITH_INHERITED) {
   1.569 +        return USCRIPT_COMMON;
   1.570 +    } else if(scriptX<UPROPS_SCRIPT_X_WITH_OTHER) {
   1.571 +        return USCRIPT_INHERITED;
   1.572 +    } else {
   1.573 +        return (UScriptCode)scriptExtensions[scriptX&UPROPS_SCRIPT_MASK];
   1.574 +    }
   1.575 +}
   1.576 +
   1.577 +U_CAPI UBool U_EXPORT2
   1.578 +uscript_hasScript(UChar32 c, UScriptCode sc) {
   1.579 +    const uint16_t *scx;
   1.580 +    uint32_t scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
   1.581 +    if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
   1.582 +        return sc==(UScriptCode)scriptX;
   1.583 +    }
   1.584 +
   1.585 +    scx=scriptExtensions+(scriptX&UPROPS_SCRIPT_MASK);
   1.586 +    if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
   1.587 +        scx=scriptExtensions+scx[1];
   1.588 +    }
   1.589 +    if(sc>=USCRIPT_CODE_LIMIT) {
   1.590 +        /* Guard against bogus input that would make us go past the Script_Extensions terminator. */
   1.591 +        return FALSE;
   1.592 +    }
   1.593 +    while(sc>*scx) {
   1.594 +        ++scx;
   1.595 +    }
   1.596 +    return sc==(*scx&0x7fff);
   1.597 +}
   1.598 +
   1.599 +U_CAPI int32_t U_EXPORT2
   1.600 +uscript_getScriptExtensions(UChar32 c,
   1.601 +                            UScriptCode *scripts, int32_t capacity,
   1.602 +                            UErrorCode *pErrorCode) {
   1.603 +    uint32_t scriptX;
   1.604 +    int32_t length;
   1.605 +    const uint16_t *scx;
   1.606 +    uint16_t sx;
   1.607 +    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1.608 +        return 0;
   1.609 +    }
   1.610 +    if(capacity<0 || (capacity>0 && scripts==NULL)) {
   1.611 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1.612 +        return 0;
   1.613 +    }
   1.614 +    scriptX=u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_X_MASK;
   1.615 +    if(scriptX<UPROPS_SCRIPT_X_WITH_COMMON) {
   1.616 +        if(capacity==0) {
   1.617 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.618 +        } else {
   1.619 +            scripts[0]=(UScriptCode)scriptX;
   1.620 +        }
   1.621 +        return 1;
   1.622 +    }
   1.623 +
   1.624 +    scx=scriptExtensions+(scriptX&UPROPS_SCRIPT_MASK);
   1.625 +    if(scriptX>=UPROPS_SCRIPT_X_WITH_OTHER) {
   1.626 +        scx=scriptExtensions+scx[1];
   1.627 +    }
   1.628 +    length=0;
   1.629 +    do {
   1.630 +        sx=*scx++;
   1.631 +        if(length<capacity) {
   1.632 +            scripts[length]=(UScriptCode)(sx&0x7fff);
   1.633 +        }
   1.634 +        ++length;
   1.635 +    } while(sx<0x8000);
   1.636 +    if(length>capacity) {
   1.637 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.638 +    }
   1.639 +    return length;
   1.640 +}
   1.641 +
   1.642 +U_CAPI UBlockCode U_EXPORT2
   1.643 +ublock_getCode(UChar32 c) {
   1.644 +    return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT);
   1.645 +}
   1.646 +
   1.647 +/* property starts for UnicodeSet ------------------------------------------- */
   1.648 +
   1.649 +static UBool U_CALLCONV
   1.650 +_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
   1.651 +    /* add the start code point to the USet */
   1.652 +    const USetAdder *sa=(const USetAdder *)context;
   1.653 +    sa->add(sa->set, start);
   1.654 +    return TRUE;
   1.655 +}
   1.656 +
   1.657 +#define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1)
   1.658 +
   1.659 +U_CFUNC void U_EXPORT2
   1.660 +uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
   1.661 +    if(U_FAILURE(*pErrorCode)) {
   1.662 +        return;
   1.663 +    }
   1.664 +
   1.665 +    /* add the start code point of each same-value range of the main trie */
   1.666 +    utrie2_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa);
   1.667 +
   1.668 +    /* add code points with hardcoded properties, plus the ones following them */
   1.669 +
   1.670 +    /* add for u_isblank() */
   1.671 +    USET_ADD_CP_AND_NEXT(sa, TAB);
   1.672 +
   1.673 +    /* add for IS_THAT_CONTROL_SPACE() */
   1.674 +    sa->add(sa->set, CR+1); /* range TAB..CR */
   1.675 +    sa->add(sa->set, 0x1c);
   1.676 +    sa->add(sa->set, 0x1f+1);
   1.677 +    USET_ADD_CP_AND_NEXT(sa, NL);
   1.678 +
   1.679 +    /* add for u_isIDIgnorable() what was not added above */
   1.680 +    sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */
   1.681 +    sa->add(sa->set, HAIRSP);
   1.682 +    sa->add(sa->set, RLM+1);
   1.683 +    sa->add(sa->set, INHSWAP);
   1.684 +    sa->add(sa->set, NOMDIG+1);
   1.685 +    USET_ADD_CP_AND_NEXT(sa, ZWNBSP);
   1.686 +
   1.687 +    /* add no-break spaces for u_isWhitespace() what was not added above */
   1.688 +    USET_ADD_CP_AND_NEXT(sa, NBSP);
   1.689 +    USET_ADD_CP_AND_NEXT(sa, FIGURESP);
   1.690 +    USET_ADD_CP_AND_NEXT(sa, NNBSP);
   1.691 +
   1.692 +    /* add for u_digit() */
   1.693 +    sa->add(sa->set, U_a);
   1.694 +    sa->add(sa->set, U_z+1);
   1.695 +    sa->add(sa->set, U_A);
   1.696 +    sa->add(sa->set, U_Z+1);
   1.697 +    sa->add(sa->set, U_FW_a);
   1.698 +    sa->add(sa->set, U_FW_z+1);
   1.699 +    sa->add(sa->set, U_FW_A);
   1.700 +    sa->add(sa->set, U_FW_Z+1);
   1.701 +
   1.702 +    /* add for u_isxdigit() */
   1.703 +    sa->add(sa->set, U_f+1);
   1.704 +    sa->add(sa->set, U_F+1);
   1.705 +    sa->add(sa->set, U_FW_f+1);
   1.706 +    sa->add(sa->set, U_FW_F+1);
   1.707 +
   1.708 +    /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
   1.709 +    sa->add(sa->set, WJ); /* range WJ..NOMDIG */
   1.710 +    sa->add(sa->set, 0xfff0);
   1.711 +    sa->add(sa->set, 0xfffb+1);
   1.712 +    sa->add(sa->set, 0xe0000);
   1.713 +    sa->add(sa->set, 0xe0fff+1);
   1.714 +
   1.715 +    /* add for UCHAR_GRAPHEME_BASE and others */
   1.716 +    USET_ADD_CP_AND_NEXT(sa, CGJ);
   1.717 +}
   1.718 +
   1.719 +U_CFUNC void U_EXPORT2
   1.720 +upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
   1.721 +    if(U_FAILURE(*pErrorCode)) {
   1.722 +        return;
   1.723 +    }
   1.724 +
   1.725 +    /* add the start code point of each same-value range of the properties vectors trie */
   1.726 +    if(propsVectorsColumns>0) {
   1.727 +        /* if propsVectorsColumns==0 then the properties vectors trie may not be there at all */
   1.728 +        utrie2_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, sa);
   1.729 +    }
   1.730 +}

mercurial