intl/icu/source/extra/scrptrun/scrptrun.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/extra/scrptrun/scrptrun.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,202 @@
     1.4 +/*
     1.5 + *******************************************************************************
     1.6 + *
     1.7 + *   Copyright (C) 1999-2001, International Business Machines
     1.8 + *   Corporation and others.  All Rights Reserved.
     1.9 + *
    1.10 + *******************************************************************************
    1.11 + *   file name:  scrptrun.cpp
    1.12 + *
    1.13 + *   created on: 10/17/2001
    1.14 + *   created by: Eric R. Mader
    1.15 + */
    1.16 +
    1.17 +#include "unicode/utypes.h"
    1.18 +#include "unicode/uscript.h"
    1.19 +
    1.20 +#include "scrptrun.h"
    1.21 +
    1.22 +#define ARRAY_SIZE(array) (sizeof array  / sizeof array[0])
    1.23 +
    1.24 +const char ScriptRun::fgClassID=0;
    1.25 +
    1.26 +UChar32 ScriptRun::pairedChars[] = {
    1.27 +    0x0028, 0x0029, // ascii paired punctuation
    1.28 +    0x003c, 0x003e,
    1.29 +    0x005b, 0x005d,
    1.30 +    0x007b, 0x007d,
    1.31 +    0x00ab, 0x00bb, // guillemets
    1.32 +    0x2018, 0x2019, // general punctuation
    1.33 +    0x201c, 0x201d,
    1.34 +    0x2039, 0x203a,
    1.35 +    0x3008, 0x3009, // chinese paired punctuation
    1.36 +    0x300a, 0x300b,
    1.37 +    0x300c, 0x300d,
    1.38 +    0x300e, 0x300f,
    1.39 +    0x3010, 0x3011,
    1.40 +    0x3014, 0x3015,
    1.41 +    0x3016, 0x3017,
    1.42 +    0x3018, 0x3019,
    1.43 +    0x301a, 0x301b
    1.44 +};
    1.45 +
    1.46 +const int32_t ScriptRun::pairedCharCount = ARRAY_SIZE(pairedChars);
    1.47 +const int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount);
    1.48 +const int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower;
    1.49 +
    1.50 +int8_t ScriptRun::highBit(int32_t value)
    1.51 +{
    1.52 +    if (value <= 0) {
    1.53 +        return -32;
    1.54 +    }
    1.55 +
    1.56 +    int8_t bit = 0;
    1.57 +
    1.58 +    if (value >= 1 << 16) {
    1.59 +        value >>= 16;
    1.60 +        bit += 16;
    1.61 +    }
    1.62 +
    1.63 +    if (value >= 1 << 8) {
    1.64 +        value >>= 8;
    1.65 +        bit += 8;
    1.66 +    }
    1.67 +
    1.68 +    if (value >= 1 << 4) {
    1.69 +        value >>= 4;
    1.70 +        bit += 4;
    1.71 +    }
    1.72 +
    1.73 +    if (value >= 1 << 2) {
    1.74 +        value >>= 2;
    1.75 +        bit += 2;
    1.76 +    }
    1.77 +
    1.78 +    if (value >= 1 << 1) {
    1.79 +        value >>= 1;
    1.80 +        bit += 1;
    1.81 +    }
    1.82 +
    1.83 +    return bit;
    1.84 +}
    1.85 +
    1.86 +int32_t ScriptRun::getPairIndex(UChar32 ch)
    1.87 +{
    1.88 +    int32_t probe = pairedCharPower;
    1.89 +    int32_t index = 0;
    1.90 +
    1.91 +    if (ch >= pairedChars[pairedCharExtra]) {
    1.92 +        index = pairedCharExtra;
    1.93 +    }
    1.94 +
    1.95 +    while (probe > (1 << 0)) {
    1.96 +        probe >>= 1;
    1.97 +
    1.98 +        if (ch >= pairedChars[index + probe]) {
    1.99 +            index += probe;
   1.100 +        }
   1.101 +    }
   1.102 +
   1.103 +    if (pairedChars[index] != ch) {
   1.104 +        index = -1;
   1.105 +    }
   1.106 +
   1.107 +    return index;
   1.108 +}
   1.109 +
   1.110 +UBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo)
   1.111 +{
   1.112 +    return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
   1.113 +}
   1.114 +
   1.115 +UBool ScriptRun::next()
   1.116 +{
   1.117 +    int32_t startSP  = parenSP;  // used to find the first new open character
   1.118 +    UErrorCode error = U_ZERO_ERROR;
   1.119 +
   1.120 +    // if we've fallen off the end of the text, we're done
   1.121 +    if (scriptEnd >= charLimit) {
   1.122 +        return false;
   1.123 +    }
   1.124 +    
   1.125 +    scriptCode = USCRIPT_COMMON;
   1.126 +
   1.127 +    for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) {
   1.128 +        UChar   high = charArray[scriptEnd];
   1.129 +        UChar32 ch   = high;
   1.130 +
   1.131 +        // if the character is a high surrogate and it's not the last one
   1.132 +        // in the text, see if it's followed by a low surrogate
   1.133 +        if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1)
   1.134 +        {
   1.135 +            UChar low = charArray[scriptEnd + 1];
   1.136 +
   1.137 +            // if it is followed by a low surrogate,
   1.138 +            // consume it and form the full character
   1.139 +            if (low >= 0xDC00 && low <= 0xDFFF) {
   1.140 +                ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
   1.141 +                scriptEnd += 1;
   1.142 +            }
   1.143 +        }
   1.144 +
   1.145 +        UScriptCode sc = uscript_getScript(ch, &error);
   1.146 +        int32_t pairIndex = getPairIndex(ch);
   1.147 +
   1.148 +        // Paired character handling:
   1.149 +        //
   1.150 +        // if it's an open character, push it onto the stack.
   1.151 +        // if it's a close character, find the matching open on the
   1.152 +        // stack, and use that script code. Any non-matching open
   1.153 +        // characters above it on the stack will be poped.
   1.154 +        if (pairIndex >= 0) {
   1.155 +            if ((pairIndex & 1) == 0) {
   1.156 +                parenStack[++parenSP].pairIndex = pairIndex;
   1.157 +                parenStack[parenSP].scriptCode  = scriptCode;
   1.158 +            } else if (parenSP >= 0) {
   1.159 +                int32_t pi = pairIndex & ~1;
   1.160 +
   1.161 +                while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
   1.162 +                    parenSP -= 1;
   1.163 +                }
   1.164 +
   1.165 +                if (parenSP < startSP) {
   1.166 +                    startSP = parenSP;
   1.167 +                }
   1.168 +
   1.169 +                if (parenSP >= 0) {
   1.170 +                    sc = parenStack[parenSP].scriptCode;
   1.171 +                }
   1.172 +            }
   1.173 +        }
   1.174 +
   1.175 +        if (sameScript(scriptCode, sc)) {
   1.176 +            if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
   1.177 +                scriptCode = sc;
   1.178 +
   1.179 +                // now that we have a final script code, fix any open
   1.180 +                // characters we pushed before we knew the script code.
   1.181 +                while (startSP < parenSP) {
   1.182 +                    parenStack[++startSP].scriptCode = scriptCode;
   1.183 +                }
   1.184 +            }
   1.185 +
   1.186 +            // if this character is a close paired character,
   1.187 +            // pop it from the stack
   1.188 +            if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
   1.189 +                parenSP -= 1;
   1.190 +                startSP -= 1;
   1.191 +            }
   1.192 +        } else {
   1.193 +            // if the run broke on a surrogate pair,
   1.194 +            // end it before the high surrogate
   1.195 +            if (ch >= 0x10000) {
   1.196 +                scriptEnd -= 1;
   1.197 +            }
   1.198 +
   1.199 +            break;
   1.200 +        }
   1.201 +    }
   1.202 +
   1.203 +    return true;
   1.204 +}
   1.205 +

mercurial