|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 1999-2009, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * |
|
7 * File USC_IMPL.C |
|
8 * |
|
9 * Modification History: |
|
10 * |
|
11 * Date Name Description |
|
12 * 07/08/2002 Eric Mader Creation. |
|
13 ****************************************************************************** |
|
14 */ |
|
15 |
|
16 #include "unicode/uscript.h" |
|
17 #include "usc_impl.h" |
|
18 #include "cmemory.h" |
|
19 |
|
/* Number of elements in a true array object (must not be used on a pointer). */
#define ARRAY_SIZE(array) (sizeof (array) / sizeof ((array)[0]))

/* Capacity of the circular stack that remembers open paired characters. */
#define PAREN_STACK_DEPTH 32

/*
 * Index arithmetic for the circular paren stack.
 *
 * MOD       reduces an index modulo the stack depth.
 * LIMIT_INC increments a counter but saturates at PAREN_STACK_DEPTH
 *           (note: evaluates its argument more than once).
 * INC/DEC   move an index forward/backward by count with wrap-around;
 *           DEC adds PAREN_STACK_DEPTH first so the left operand of %
 *           does not go negative for the counts used in this file.
 */
#define MOD(sp) ((sp) % PAREN_STACK_DEPTH)
#define LIMIT_INC(sp) (((sp) < PAREN_STACK_DEPTH)? (sp) + 1 : PAREN_STACK_DEPTH)
#define INC(sp,count) (MOD((sp) + (count)))
#define INC1(sp) (INC(sp, 1))
#define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count)))
#define DEC1(sp) (DEC(sp, 1))

/* Stack state queries and accessors; each takes a pointer to the run object. */
#define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0)
#define STACK_IS_NOT_EMPTY(scriptRun) (! STACK_IS_EMPTY(scriptRun))
#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP])

/* Forget any pending fixups: entries already on the stack keep their script. */
#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0)
|
34 |
|
35 struct ParenStackEntry |
|
36 { |
|
37 int32_t pairIndex; |
|
38 UScriptCode scriptCode; |
|
39 }; |
|
40 |
|
41 struct UScriptRun |
|
42 { |
|
43 int32_t textLength; |
|
44 const UChar *textArray; |
|
45 |
|
46 int32_t scriptStart; |
|
47 int32_t scriptLimit; |
|
48 UScriptCode scriptCode; |
|
49 |
|
50 struct ParenStackEntry parenStack[PAREN_STACK_DEPTH]; |
|
51 int32_t parenSP; |
|
52 int32_t pushCount; |
|
53 int32_t fixupCount; |
|
54 }; |
|
55 |
|
/* Forward declaration; the definition appears further down in this file. */
static int8_t
highBit(int32_t value);
|
57 |
|
58 static const UChar32 pairedChars[] = { |
|
59 0x0028, 0x0029, /* ascii paired punctuation */ |
|
60 0x003c, 0x003e, |
|
61 0x005b, 0x005d, |
|
62 0x007b, 0x007d, |
|
63 0x00ab, 0x00bb, /* guillemets */ |
|
64 0x2018, 0x2019, /* general punctuation */ |
|
65 0x201c, 0x201d, |
|
66 0x2039, 0x203a, |
|
67 0x3008, 0x3009, /* chinese paired punctuation */ |
|
68 0x300a, 0x300b, |
|
69 0x300c, 0x300d, |
|
70 0x300e, 0x300f, |
|
71 0x3010, 0x3011, |
|
72 0x3014, 0x3015, |
|
73 0x3016, 0x3017, |
|
74 0x3018, 0x3019, |
|
75 0x301a, 0x301b |
|
76 }; |
|
77 |
|
78 static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode) |
|
79 { |
|
80 scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount); |
|
81 scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount); |
|
82 |
|
83 scriptRun->parenSP = INC1(scriptRun->parenSP); |
|
84 scriptRun->parenStack[scriptRun->parenSP].pairIndex = pairIndex; |
|
85 scriptRun->parenStack[scriptRun->parenSP].scriptCode = scriptCode; |
|
86 } |
|
87 |
|
88 static void pop(UScriptRun *scriptRun) |
|
89 { |
|
90 if (STACK_IS_EMPTY(scriptRun)) { |
|
91 return; |
|
92 } |
|
93 |
|
94 if (scriptRun->fixupCount > 0) { |
|
95 scriptRun->fixupCount -= 1; |
|
96 } |
|
97 |
|
98 scriptRun->pushCount -= 1; |
|
99 scriptRun->parenSP = DEC1(scriptRun->parenSP); |
|
100 |
|
101 /* If the stack is now empty, reset the stack |
|
102 pointers to their initial values. |
|
103 */ |
|
104 if (STACK_IS_EMPTY(scriptRun)) { |
|
105 scriptRun->parenSP = -1; |
|
106 } |
|
107 } |
|
108 |
|
109 static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode) |
|
110 { |
|
111 int32_t fixupSP = DEC(scriptRun->parenSP, scriptRun->fixupCount); |
|
112 |
|
113 while (scriptRun->fixupCount-- > 0) { |
|
114 fixupSP = INC1(fixupSP); |
|
115 scriptRun->parenStack[fixupSP].scriptCode = scriptCode; |
|
116 } |
|
117 } |
|
118 |
|
/*
 * Return the zero-based position of the most significant set bit of value
 * (i.e. floor(log2(value))), or -32 when value is zero or negative.
 */
static int8_t
highBit(int32_t value)
{
    int8_t position = 0;
    int shift;

    if (value <= 0) {
        return -32;
    }

    /* Halve the search window each step: 16, 8, 4, 2, 1. */
    for (shift = 16; shift > 0; shift >>= 1) {
        if (value >= (1 << shift)) {
            value >>= shift;
            position = (int8_t)(position + shift);
        }
    }

    return position;
}
|
155 |
|
156 static int32_t |
|
157 getPairIndex(UChar32 ch) |
|
158 { |
|
159 int32_t pairedCharCount = ARRAY_SIZE(pairedChars); |
|
160 int32_t pairedCharPower = 1 << highBit(pairedCharCount); |
|
161 int32_t pairedCharExtra = pairedCharCount - pairedCharPower; |
|
162 |
|
163 int32_t probe = pairedCharPower; |
|
164 int32_t pairIndex = 0; |
|
165 |
|
166 if (ch >= pairedChars[pairedCharExtra]) { |
|
167 pairIndex = pairedCharExtra; |
|
168 } |
|
169 |
|
170 while (probe > (1 << 0)) { |
|
171 probe >>= 1; |
|
172 |
|
173 if (ch >= pairedChars[pairIndex + probe]) { |
|
174 pairIndex += probe; |
|
175 } |
|
176 } |
|
177 |
|
178 if (pairedChars[pairIndex] != ch) { |
|
179 pairIndex = -1; |
|
180 } |
|
181 |
|
182 return pairIndex; |
|
183 } |
|
184 |
|
185 static UBool |
|
186 sameScript(UScriptCode scriptOne, UScriptCode scriptTwo) |
|
187 { |
|
188 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo; |
|
189 } |
|
190 |
|
191 U_CAPI UScriptRun * U_EXPORT2 |
|
192 uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode) |
|
193 { |
|
194 UScriptRun *result = NULL; |
|
195 |
|
196 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { |
|
197 return NULL; |
|
198 } |
|
199 |
|
200 result = uprv_malloc(sizeof (UScriptRun)); |
|
201 |
|
202 if (result == NULL) { |
|
203 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
|
204 return NULL; |
|
205 } |
|
206 |
|
207 uscript_setRunText(result, src, length, pErrorCode); |
|
208 |
|
209 /* Release the UScriptRun if uscript_setRunText() returns an error */ |
|
210 if (U_FAILURE(*pErrorCode)) { |
|
211 uprv_free(result); |
|
212 result = NULL; |
|
213 } |
|
214 |
|
215 return result; |
|
216 } |
|
217 |
|
218 U_CAPI void U_EXPORT2 |
|
219 uscript_closeRun(UScriptRun *scriptRun) |
|
220 { |
|
221 if (scriptRun != NULL) { |
|
222 uprv_free(scriptRun); |
|
223 } |
|
224 } |
|
225 |
|
226 U_CAPI void U_EXPORT2 |
|
227 uscript_resetRun(UScriptRun *scriptRun) |
|
228 { |
|
229 if (scriptRun != NULL) { |
|
230 scriptRun->scriptStart = 0; |
|
231 scriptRun->scriptLimit = 0; |
|
232 scriptRun->scriptCode = USCRIPT_INVALID_CODE; |
|
233 scriptRun->parenSP = -1; |
|
234 scriptRun->pushCount = 0; |
|
235 scriptRun->fixupCount = 0; |
|
236 } |
|
237 } |
|
238 |
|
239 U_CAPI void U_EXPORT2 |
|
240 uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode) |
|
241 { |
|
242 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) { |
|
243 return; |
|
244 } |
|
245 |
|
246 if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) { |
|
247 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
|
248 return; |
|
249 } |
|
250 |
|
251 scriptRun->textArray = src; |
|
252 scriptRun->textLength = length; |
|
253 |
|
254 uscript_resetRun(scriptRun); |
|
255 } |
|
256 |
|
257 U_CAPI UBool U_EXPORT2 |
|
258 uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript) |
|
259 { |
|
260 UErrorCode error = U_ZERO_ERROR; |
|
261 |
|
262 /* if we've fallen off the end of the text, we're done */ |
|
263 if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) { |
|
264 return FALSE; |
|
265 } |
|
266 |
|
267 SYNC_FIXUP(scriptRun); |
|
268 scriptRun->scriptCode = USCRIPT_COMMON; |
|
269 |
|
270 for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) { |
|
271 UChar high = scriptRun->textArray[scriptRun->scriptLimit]; |
|
272 UChar32 ch = high; |
|
273 UScriptCode sc; |
|
274 int32_t pairIndex; |
|
275 |
|
276 /* |
|
277 * if the character is a high surrogate and it's not the last one |
|
278 * in the text, see if it's followed by a low surrogate |
|
279 */ |
|
280 if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) { |
|
281 UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1]; |
|
282 |
|
283 /* |
|
284 * if it is followed by a low surrogate, |
|
285 * consume it and form the full character |
|
286 */ |
|
287 if (low >= 0xDC00 && low <= 0xDFFF) { |
|
288 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000; |
|
289 scriptRun->scriptLimit += 1; |
|
290 } |
|
291 } |
|
292 |
|
293 sc = uscript_getScript(ch, &error); |
|
294 pairIndex = getPairIndex(ch); |
|
295 |
|
296 /* |
|
297 * Paired character handling: |
|
298 * |
|
299 * if it's an open character, push it onto the stack. |
|
300 * if it's a close character, find the matching open on the |
|
301 * stack, and use that script code. Any non-matching open |
|
302 * characters above it on the stack will be poped. |
|
303 */ |
|
304 if (pairIndex >= 0) { |
|
305 if ((pairIndex & 1) == 0) { |
|
306 push(scriptRun, pairIndex, scriptRun->scriptCode); |
|
307 } else { |
|
308 int32_t pi = pairIndex & ~1; |
|
309 |
|
310 while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) { |
|
311 pop(scriptRun); |
|
312 } |
|
313 |
|
314 if (STACK_IS_NOT_EMPTY(scriptRun)) { |
|
315 sc = TOP(scriptRun).scriptCode; |
|
316 } |
|
317 } |
|
318 } |
|
319 |
|
320 if (sameScript(scriptRun->scriptCode, sc)) { |
|
321 if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) { |
|
322 scriptRun->scriptCode = sc; |
|
323 |
|
324 fixup(scriptRun, scriptRun->scriptCode); |
|
325 } |
|
326 |
|
327 /* |
|
328 * if this character is a close paired character, |
|
329 * pop the matching open character from the stack |
|
330 */ |
|
331 if (pairIndex >= 0 && (pairIndex & 1) != 0) { |
|
332 pop(scriptRun); |
|
333 } |
|
334 } else { |
|
335 /* |
|
336 * if the run broke on a surrogate pair, |
|
337 * end it before the high surrogate |
|
338 */ |
|
339 if (ch >= 0x10000) { |
|
340 scriptRun->scriptLimit -= 1; |
|
341 } |
|
342 |
|
343 break; |
|
344 } |
|
345 } |
|
346 |
|
347 |
|
348 if (pRunStart != NULL) { |
|
349 *pRunStart = scriptRun->scriptStart; |
|
350 } |
|
351 |
|
352 if (pRunLimit != NULL) { |
|
353 *pRunLimit = scriptRun->scriptLimit; |
|
354 } |
|
355 |
|
356 if (pRunScript != NULL) { |
|
357 *pRunScript = scriptRun->scriptCode; |
|
358 } |
|
359 |
|
360 return TRUE; |
|
361 } |