1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/extra/scrptrun/scrptrun.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,202 @@ 1.4 +/* 1.5 + ******************************************************************************* 1.6 + * 1.7 + * Copyright (C) 1999-2001, International Business Machines 1.8 + * Corporation and others. All Rights Reserved. 1.9 + * 1.10 + ******************************************************************************* 1.11 + * file name: scrptrun.cpp 1.12 + * 1.13 + * created on: 10/17/2001 1.14 + * created by: Eric R. Mader 1.15 + */ 1.16 + 1.17 +#include "unicode/utypes.h" 1.18 +#include "unicode/uscript.h" 1.19 + 1.20 +#include "scrptrun.h" 1.21 + 1.22 +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 1.23 + 1.24 +const char ScriptRun::fgClassID=0; 1.25 + 1.26 +UChar32 ScriptRun::pairedChars[] = { 1.27 + 0x0028, 0x0029, // ascii paired punctuation 1.28 + 0x003c, 0x003e, 1.29 + 0x005b, 0x005d, 1.30 + 0x007b, 0x007d, 1.31 + 0x00ab, 0x00bb, // guillemets 1.32 + 0x2018, 0x2019, // general punctuation 1.33 + 0x201c, 0x201d, 1.34 + 0x2039, 0x203a, 1.35 + 0x3008, 0x3009, // chinese paired punctuation 1.36 + 0x300a, 0x300b, 1.37 + 0x300c, 0x300d, 1.38 + 0x300e, 0x300f, 1.39 + 0x3010, 0x3011, 1.40 + 0x3014, 0x3015, 1.41 + 0x3016, 0x3017, 1.42 + 0x3018, 0x3019, 1.43 + 0x301a, 0x301b 1.44 +}; 1.45 + 1.46 +const int32_t ScriptRun::pairedCharCount = ARRAY_SIZE(pairedChars); 1.47 +const int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount); 1.48 +const int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower; 1.49 + 1.50 +int8_t ScriptRun::highBit(int32_t value) 1.51 +{ 1.52 + if (value <= 0) { 1.53 + return -32; 1.54 + } 1.55 + 1.56 + int8_t bit = 0; 1.57 + 1.58 + if (value >= 1 << 16) { 1.59 + value >>= 16; 1.60 + bit += 16; 1.61 + } 1.62 + 1.63 + if (value >= 1 << 8) { 1.64 + value >>= 8; 1.65 + bit += 8; 1.66 + } 1.67 + 1.68 + if (value >= 1 << 4) { 1.69 + value >>= 4; 1.70 + bit += 4; 1.71 + } 1.72 + 1.73 + if (value >= 1 << 2) { 1.74 + value >>= 2; 1.75 + bit += 2; 1.76 + } 1.77 + 1.78 + if (value >= 1 << 1) { 1.79 + value >>= 1; 1.80 + bit += 1; 1.81 + } 1.82 + 1.83 + return bit; 1.84 +} 1.85 + 1.86 +int32_t ScriptRun::getPairIndex(UChar32 ch) 1.87 +{ 1.88 + int32_t probe = pairedCharPower; 1.89 + int32_t index = 0; 1.90 + 1.91 + if (ch >= pairedChars[pairedCharExtra]) { 1.92 + index = pairedCharExtra; 1.93 + } 1.94 + 1.95 + while (probe > (1 << 0)) { 1.96 + probe >>= 1; 1.97 + 1.98 + if (ch >= pairedChars[index + probe]) { 1.99 + index += probe; 1.100 + } 1.101 + } 1.102 + 1.103 + if (pairedChars[index] != ch) { 1.104 + index = -1; 1.105 + } 1.106 + 1.107 + return index; 1.108 +} 1.109 + 1.110 +UBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo) 1.111 +{ 1.112 + return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; 1.113 +} 1.114 + 1.115 +UBool ScriptRun::next() 1.116 +{ 1.117 + int32_t startSP = parenSP; // used to find the first new open character 1.118 + UErrorCode error = U_ZERO_ERROR; 1.119 + 1.120 + // if we've fallen off the end of the text, we're done 1.121 + if (scriptEnd >= charLimit) { 1.122 + return false; 1.123 + } 1.124 + 1.125 + scriptCode = USCRIPT_COMMON; 1.126 + 1.127 + for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) { 1.128 + UChar high = charArray[scriptEnd]; 1.129 + UChar32 ch = high; 1.130 + 1.131 + // if the character is a high surrogate and it's not the last one 1.132 + // in the text, see if it's followed by a low surrogate 1.133 + if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1) 1.134 + { 1.135 + UChar low = charArray[scriptEnd + 1]; 1.136 + 1.137 + // if it is followed by a low surrogate, 1.138 + // consume it and form the full character 1.139 + if (low >= 0xDC00 && low <= 0xDFFF) { 1.140 + ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; 1.141 + scriptEnd += 1; 1.142 + } 1.143 + } 1.144 + 1.145 + UScriptCode sc = uscript_getScript(ch, &error); 1.146 + int32_t pairIndex = getPairIndex(ch); 1.147 + 1.148 + // Paired character handling: 1.149 + // 1.150 + // if it's an open character, push it onto the stack. 1.151 + // if it's a close character, find the matching open on the 1.152 + // stack, and use that script code. Any non-matching open 1.153 + // characters above it on the stack will be poped. 1.154 + if (pairIndex >= 0) { 1.155 + if ((pairIndex & 1) == 0) { 1.156 + parenStack[++parenSP].pairIndex = pairIndex; 1.157 + parenStack[parenSP].scriptCode = scriptCode; 1.158 + } else if (parenSP >= 0) { 1.159 + int32_t pi = pairIndex & ~1; 1.160 + 1.161 + while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) { 1.162 + parenSP -= 1; 1.163 + } 1.164 + 1.165 + if (parenSP < startSP) { 1.166 + startSP = parenSP; 1.167 + } 1.168 + 1.169 + if (parenSP >= 0) { 1.170 + sc = parenStack[parenSP].scriptCode; 1.171 + } 1.172 + } 1.173 + } 1.174 + 1.175 + if (sameScript(scriptCode, sc)) { 1.176 + if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { 1.177 + scriptCode = sc; 1.178 + 1.179 + // now that we have a final script code, fix any open 1.180 + // characters we pushed before we knew the script code. 1.181 + while (startSP < parenSP) { 1.182 + parenStack[++startSP].scriptCode = scriptCode; 1.183 + } 1.184 + } 1.185 + 1.186 + // if this character is a close paired character, 1.187 + // pop it from the stack 1.188 + if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { 1.189 + parenSP -= 1; 1.190 + startSP -= 1; 1.191 + } 1.192 + } else { 1.193 + // if the run broke on a surrogate pair, 1.194 + // end it before the high surrogate 1.195 + if (ch >= 0x10000) { 1.196 + scriptEnd -= 1; 1.197 + } 1.198 + 1.199 + break; 1.200 + } 1.201 + } 1.202 + 1.203 + return true; 1.204 +} 1.205 +