intl/icu/source/common/usc_impl.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/usc_impl.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,361 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 1999-2009, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*
    1.10 +* File USC_IMPL.C
    1.11 +*
    1.12 +* Modification History:
    1.13 +*
    1.14 +*   Date        Name        Description
    1.15 +*   07/08/2002  Eric Mader  Creation.
    1.16 +******************************************************************************
    1.17 +*/
    1.18 +
    1.19 +#include "unicode/uscript.h"
    1.20 +#include "usc_impl.h"
    1.21 +#include "cmemory.h"
    1.22 +
    1.23 +#define ARRAY_SIZE(array) (sizeof array  / sizeof array[0])
    1.24 +
    1.25 +#define PAREN_STACK_DEPTH 32
    1.26 +
    1.27 +#define MOD(sp) ((sp) % PAREN_STACK_DEPTH)
    1.28 +#define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH)? (sp) + 1 : PAREN_STACK_DEPTH)
    1.29 +#define INC(sp,count) (MOD((sp) + (count)))
    1.30 +#define INC1(sp) (INC(sp, 1))
    1.31 +#define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count)))
    1.32 +#define DEC1(sp) (DEC(sp, 1))
    1.33 +#define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0)
    1.34 +#define STACK_IS_NOT_EMPTY(scriptRun) (! STACK_IS_EMPTY(scriptRun))
    1.35 +#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP])
    1.36 +#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0)
    1.37 +
    1.38 +struct ParenStackEntry
    1.39 +{
    1.40 +    int32_t pairIndex;
    1.41 +    UScriptCode scriptCode;
    1.42 +};
    1.43 +
    1.44 +struct UScriptRun
    1.45 +{
    1.46 +    int32_t textLength;
    1.47 +    const UChar *textArray;
    1.48 +
    1.49 +    int32_t scriptStart;
    1.50 +    int32_t scriptLimit;
    1.51 +    UScriptCode scriptCode;
    1.52 +
    1.53 +    struct ParenStackEntry parenStack[PAREN_STACK_DEPTH];
    1.54 +    int32_t parenSP;
    1.55 +    int32_t pushCount;
    1.56 +    int32_t fixupCount;
    1.57 +};
    1.58 +
    1.59 +static int8_t highBit(int32_t value);
    1.60 +
    1.61 +static const UChar32 pairedChars[] = {
    1.62 +    0x0028, 0x0029, /* ascii paired punctuation */
    1.63 +    0x003c, 0x003e,
    1.64 +    0x005b, 0x005d,
    1.65 +    0x007b, 0x007d,
    1.66 +    0x00ab, 0x00bb, /* guillemets */
    1.67 +    0x2018, 0x2019, /* general punctuation */
    1.68 +    0x201c, 0x201d,
    1.69 +    0x2039, 0x203a,
    1.70 +    0x3008, 0x3009, /* chinese paired punctuation */
    1.71 +    0x300a, 0x300b,
    1.72 +    0x300c, 0x300d,
    1.73 +    0x300e, 0x300f,
    1.74 +    0x3010, 0x3011,
    1.75 +    0x3014, 0x3015,
    1.76 +    0x3016, 0x3017,
    1.77 +    0x3018, 0x3019,
    1.78 +    0x301a, 0x301b
    1.79 +};
    1.80 +
    1.81 +static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode)
    1.82 +{
    1.83 +    scriptRun->pushCount  = LIMIT_INC(scriptRun->pushCount);
    1.84 +    scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount);
    1.85 +    
    1.86 +    scriptRun->parenSP = INC1(scriptRun->parenSP);
    1.87 +    scriptRun->parenStack[scriptRun->parenSP].pairIndex  = pairIndex;
    1.88 +    scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode;
    1.89 +}
    1.90 +
    1.91 +static void pop(UScriptRun *scriptRun)
    1.92 +{
    1.93 +    if (STACK_IS_EMPTY(scriptRun)) {
    1.94 +        return;
    1.95 +    }
    1.96 +    
    1.97 +    if (scriptRun->fixupCount > 0) {
    1.98 +        scriptRun->fixupCount -= 1;
    1.99 +    }
   1.100 +    
   1.101 +    scriptRun->pushCount -= 1;
   1.102 +    scriptRun->parenSP = DEC1(scriptRun->parenSP);
   1.103 +    
   1.104 +    /* If the stack is now empty, reset the stack
   1.105 +       pointers to their initial values.
   1.106 +     */
   1.107 +    if (STACK_IS_EMPTY(scriptRun)) {
   1.108 +        scriptRun->parenSP = -1;
   1.109 +    }
   1.110 +}
   1.111 +
   1.112 +static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode)
   1.113 +{
   1.114 +    int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount);
   1.115 +    
   1.116 +    while (scriptRun->fixupCount-- > 0) {
   1.117 +        fixupSP = INC1(fixupSP);
   1.118 +        scriptRun->parenStack[fixupSP].scriptCode = scriptCode;
   1.119 +    }
   1.120 +}
   1.121 +
   1.122 +static int8_t
   1.123 +highBit(int32_t value)
   1.124 +{
   1.125 +    int8_t bit = 0;
   1.126 +
   1.127 +    if (value <= 0) {
   1.128 +        return -32;
   1.129 +    }
   1.130 +
   1.131 +    if (value >= 1 << 16) {
   1.132 +        value >>= 16;
   1.133 +        bit += 16;
   1.134 +    }
   1.135 +
   1.136 +    if (value >= 1 << 8) {
   1.137 +        value >>= 8;
   1.138 +        bit += 8;
   1.139 +    }
   1.140 +
   1.141 +    if (value >= 1 << 4) {
   1.142 +        value >>= 4;
   1.143 +        bit += 4;
   1.144 +    }
   1.145 +
   1.146 +    if (value >= 1 << 2) {
   1.147 +        value >>= 2;
   1.148 +        bit += 2;
   1.149 +    }
   1.150 +
   1.151 +    if (value >= 1 << 1) {
   1.152 +        value >>= 1;
   1.153 +        bit += 1;
   1.154 +    }
   1.155 +
   1.156 +    return bit;
   1.157 +}
   1.158 +
   1.159 +static int32_t
   1.160 +getPairIndex(UChar32 ch)
   1.161 +{
   1.162 +    int32_t pairedCharCount = ARRAY_SIZE(pairedChars);
   1.163 +    int32_t pairedCharPower = 1 << highBit(pairedCharCount);
   1.164 +    int32_t pairedCharExtra = pairedCharCount - pairedCharPower;
   1.165 +
   1.166 +    int32_t probe = pairedCharPower;
   1.167 +    int32_t pairIndex = 0;
   1.168 +
   1.169 +    if (ch >= pairedChars[pairedCharExtra]) {
   1.170 +        pairIndex = pairedCharExtra;
   1.171 +    }
   1.172 +
   1.173 +    while (probe > (1 << 0)) {
   1.174 +        probe >>= 1;
   1.175 +
   1.176 +        if (ch >= pairedChars[pairIndex + probe]) {
   1.177 +            pairIndex += probe;
   1.178 +        }
   1.179 +    }
   1.180 +
   1.181 +    if (pairedChars[pairIndex] != ch) {
   1.182 +        pairIndex = -1;
   1.183 +    }
   1.184 +
   1.185 +    return pairIndex;
   1.186 +}
   1.187 +
   1.188 +static UBool
   1.189 +sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
   1.190 +{
   1.191 +    return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
   1.192 +}
   1.193 +
   1.194 +U_CAPI UScriptRun * U_EXPORT2
   1.195 +uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode)
   1.196 +{
   1.197 +    UScriptRun *result = NULL;
   1.198 +
   1.199 +    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
   1.200 +        return NULL;
   1.201 +    }
   1.202 +
   1.203 +    result = uprv_malloc(sizeof (UScriptRun));
   1.204 +
   1.205 +    if (result == NULL) {
   1.206 +        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
   1.207 +        return NULL;
   1.208 +    }
   1.209 +
   1.210 +    uscript_setRunText(result, src, length, pErrorCode);
   1.211 +
   1.212 +    /* Release the UScriptRun if uscript_setRunText() returns an error */
   1.213 +    if (U_FAILURE(*pErrorCode)) {
   1.214 +        uprv_free(result);
   1.215 +        result = NULL;
   1.216 +    }
   1.217 +
   1.218 +    return result;
   1.219 +}
   1.220 +
   1.221 +U_CAPI void U_EXPORT2
   1.222 +uscript_closeRun(UScriptRun *scriptRun)
   1.223 +{
   1.224 +    if (scriptRun != NULL) {
   1.225 +        uprv_free(scriptRun);
   1.226 +    }
   1.227 +}
   1.228 +
   1.229 +U_CAPI void U_EXPORT2
   1.230 +uscript_resetRun(UScriptRun *scriptRun)
   1.231 +{
   1.232 +    if (scriptRun != NULL) {
   1.233 +        scriptRun->scriptStart = 0;
   1.234 +        scriptRun->scriptLimit = 0;
   1.235 +        scriptRun->scriptCode  = USCRIPT_INVALID_CODE;
   1.236 +        scriptRun->parenSP     = -1;
   1.237 +        scriptRun->pushCount   =  0;
   1.238 +        scriptRun->fixupCount  =  0;
   1.239 +    }
   1.240 +}
   1.241 +
   1.242 +U_CAPI void U_EXPORT2
   1.243 +uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode)
   1.244 +{
   1.245 +    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
   1.246 +        return;
   1.247 +    }
   1.248 +
   1.249 +    if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) {
   1.250 +        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1.251 +        return;
   1.252 +    }
   1.253 +
   1.254 +    scriptRun->textArray  = src;
   1.255 +    scriptRun->textLength = length;
   1.256 +
   1.257 +    uscript_resetRun(scriptRun);
   1.258 +}
   1.259 +
   1.260 +U_CAPI UBool U_EXPORT2
   1.261 +uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
   1.262 +{
   1.263 +    UErrorCode error = U_ZERO_ERROR;
   1.264 +
   1.265 +    /* if we've fallen off the end of the text, we're done */
   1.266 +    if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) {
   1.267 +        return FALSE;
   1.268 +    }
   1.269 +    
   1.270 +    SYNC_FIXUP(scriptRun);
   1.271 +    scriptRun->scriptCode = USCRIPT_COMMON;
   1.272 +
   1.273 +    for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
   1.274 +        UChar   high = scriptRun->textArray[scriptRun->scriptLimit];
   1.275 +        UChar32 ch   = high;
   1.276 +        UScriptCode sc;
   1.277 +        int32_t pairIndex;
   1.278 +
   1.279 +        /*
   1.280 +         * if the character is a high surrogate and it's not the last one
   1.281 +         * in the text, see if it's followed by a low surrogate
   1.282 +         */
   1.283 +        if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
   1.284 +            UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1];
   1.285 +
   1.286 +            /*
   1.287 +             * if it is followed by a low surrogate,
   1.288 +             * consume it and form the full character
   1.289 +             */
   1.290 +            if (low >= 0xDC00 && low <= 0xDFFF) {
   1.291 +                ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
   1.292 +                scriptRun->scriptLimit += 1;
   1.293 +            }
   1.294 +        }
   1.295 +
   1.296 +        sc = uscript_getScript(ch, &error);
   1.297 +        pairIndex = getPairIndex(ch);
   1.298 +
   1.299 +        /*
   1.300 +         * Paired character handling:
   1.301 +         *
   1.302 +         * if it's an open character, push it onto the stack.
   1.303 +         * if it's a close character, find the matching open on the
   1.304 +         * stack, and use that script code. Any non-matching open
   1.305 +         * characters above it on the stack will be poped.
   1.306 +         */
   1.307 +        if (pairIndex >= 0) {
   1.308 +            if ((pairIndex & 1) == 0) {
   1.309 +                push(scriptRun, pairIndex, scriptRun->scriptCode);
   1.310 +            } else {
   1.311 +                int32_t pi = pairIndex & ~1;
   1.312 +
   1.313 +                while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) {
   1.314 +                    pop(scriptRun);
   1.315 +                }
   1.316 +
   1.317 +                if (STACK_IS_NOT_EMPTY(scriptRun)) {
   1.318 +                    sc = TOP(scriptRun).scriptCode;
   1.319 +                }
   1.320 +            }
   1.321 +        }
   1.322 +
   1.323 +        if (sameScript(scriptRun->scriptCode, sc)) {
   1.324 +            if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
   1.325 +                scriptRun->scriptCode = sc;
   1.326 +
   1.327 +                fixup(scriptRun, scriptRun->scriptCode);
   1.328 +            }
   1.329 +
   1.330 +            /*
   1.331 +             * if this character is a close paired character,
   1.332 +             * pop the matching open character from the stack
   1.333 +             */
   1.334 +            if (pairIndex >= 0 && (pairIndex & 1) != 0) {
   1.335 +                pop(scriptRun);
   1.336 +            }
   1.337 +        } else {
   1.338 +            /*
   1.339 +             * if the run broke on a surrogate pair,
   1.340 +             * end it before the high surrogate
   1.341 +             */
   1.342 +            if (ch >= 0x10000) {
   1.343 +                scriptRun->scriptLimit -= 1;
   1.344 +            }
   1.345 +
   1.346 +            break;
   1.347 +        }
   1.348 +    }
   1.349 +
   1.350 +
   1.351 +    if (pRunStart != NULL) {
   1.352 +        *pRunStart = scriptRun->scriptStart;
   1.353 +    }
   1.354 +
   1.355 +    if (pRunLimit != NULL) {
   1.356 +        *pRunLimit = scriptRun->scriptLimit;
   1.357 +    }
   1.358 +
   1.359 +    if (pRunScript != NULL) {
   1.360 +        *pRunScript = scriptRun->scriptCode;
   1.361 +    }
   1.362 +
   1.363 +    return TRUE;
   1.364 +}

mercurial