|
1 /* |
|
2 ****************************************************************************** |
|
3 * |
|
4 * Copyright (C) 2007-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ****************************************************************************** |
|
8 * file name: bmpset.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2007jan29 |
|
14 * created by: Markus W. Scherer |
|
15 */ |
|
16 |
|
17 #include "unicode/utypes.h" |
|
18 #include "unicode/uniset.h" |
|
19 #include "unicode/utf8.h" |
|
20 #include "unicode/utf16.h" |
|
21 #include "cmemory.h" |
|
22 #include "bmpset.h" |
|
23 #include "uassert.h" |
|
24 |
|
25 U_NAMESPACE_BEGIN |
|
26 |
|
27 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : |
|
28 list(parentList), listLength(parentListLength) { |
|
29 uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); |
|
30 uprv_memset(table7FF, 0, sizeof(table7FF)); |
|
31 uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); |
|
32 |
|
33 /* |
|
34 * Set the list indexes for binary searches for |
|
35 * U+0800, U+1000, U+2000, .., U+F000, U+10000. |
|
36 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are |
|
37 * looked up in the bit tables. |
|
38 * The last pair of indexes is for finding supplementary code points. |
|
39 */ |
|
40 list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); |
|
41 int32_t i; |
|
42 for(i=1; i<=0x10; ++i) { |
|
43 list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); |
|
44 } |
|
45 list4kStarts[0x11]=listLength-1; |
|
46 |
|
47 initBits(); |
|
48 overrideIllegal(); |
|
49 } |
|
50 |
|
51 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : |
|
52 list(newParentList), listLength(newParentListLength) { |
|
53 uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); |
|
54 uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); |
|
55 uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); |
|
56 uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); |
|
57 } |
|
58 |
|
59 BMPSet::~BMPSet() { |
|
60 } |
|
61 |
|
62 /* |
|
63 * Set bits in a bit rectangle in "vertical" bit organization. |
|
64 * start<limit<=0x800 |
|
65 */ |
|
66 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { |
|
67 U_ASSERT(start<limit); |
|
68 U_ASSERT(limit<=0x800); |
|
69 |
|
70 int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. |
|
71 int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. |
|
72 |
|
73 // Set one bit indicating an all-one block. |
|
74 uint32_t bits=(uint32_t)1<<lead; |
|
75 if((start+1)==limit) { // Single-character shortcut. |
|
76 table[trail]|=bits; |
|
77 return; |
|
78 } |
|
79 |
|
80 int32_t limitLead=limit>>6; |
|
81 int32_t limitTrail=limit&0x3f; |
|
82 |
|
83 if(lead==limitLead) { |
|
84 // Partial vertical bit column. |
|
85 while(trail<limitTrail) { |
|
86 table[trail++]|=bits; |
|
87 } |
|
88 } else { |
|
89 // Partial vertical bit column, |
|
90 // followed by a bit rectangle, |
|
91 // followed by another partial vertical bit column. |
|
92 if(trail>0) { |
|
93 do { |
|
94 table[trail++]|=bits; |
|
95 } while(trail<64); |
|
96 ++lead; |
|
97 } |
|
98 if(lead<limitLead) { |
|
99 bits=~((1<<lead)-1); |
|
100 if(limitLead<0x20) { |
|
101 bits&=(1<<limitLead)-1; |
|
102 } |
|
103 for(trail=0; trail<64; ++trail) { |
|
104 table[trail]|=bits; |
|
105 } |
|
106 } |
|
107 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. |
|
108 // In that case, bits=1<<limitLead is undefined but the bits value |
|
109 // is not used because trail<limitTrail is already false. |
|
110 bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead); |
|
111 for(trail=0; trail<limitTrail; ++trail) { |
|
112 table[trail]|=bits; |
|
113 } |
|
114 } |
|
115 } |
|
116 |
|
117 void BMPSet::initBits() { |
|
118 UChar32 start, limit; |
|
119 int32_t listIndex=0; |
|
120 |
|
121 // Set asciiBytes[]. |
|
122 do { |
|
123 start=list[listIndex++]; |
|
124 if(listIndex<listLength) { |
|
125 limit=list[listIndex++]; |
|
126 } else { |
|
127 limit=0x110000; |
|
128 } |
|
129 if(start>=0x80) { |
|
130 break; |
|
131 } |
|
132 do { |
|
133 asciiBytes[start++]=1; |
|
134 } while(start<limit && start<0x80); |
|
135 } while(limit<=0x80); |
|
136 |
|
137 // Set table7FF[]. |
|
138 while(start<0x800) { |
|
139 set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); |
|
140 if(limit>0x800) { |
|
141 start=0x800; |
|
142 break; |
|
143 } |
|
144 |
|
145 start=list[listIndex++]; |
|
146 if(listIndex<listLength) { |
|
147 limit=list[listIndex++]; |
|
148 } else { |
|
149 limit=0x110000; |
|
150 } |
|
151 } |
|
152 |
|
153 // Set bmpBlockBits[]. |
|
154 int32_t minStart=0x800; |
|
155 while(start<0x10000) { |
|
156 if(limit>0x10000) { |
|
157 limit=0x10000; |
|
158 } |
|
159 |
|
160 if(start<minStart) { |
|
161 start=minStart; |
|
162 } |
|
163 if(start<limit) { // Else: Another range entirely in a known mixed-value block. |
|
164 if(start&0x3f) { |
|
165 // Mixed-value block of 64 code points. |
|
166 start>>=6; |
|
167 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); |
|
168 start=(start+1)<<6; // Round up to the next block boundary. |
|
169 minStart=start; // Ignore further ranges in this block. |
|
170 } |
|
171 if(start<limit) { |
|
172 if(start<(limit&~0x3f)) { |
|
173 // Multiple all-ones blocks of 64 code points each. |
|
174 set32x64Bits(bmpBlockBits, start>>6, limit>>6); |
|
175 } |
|
176 |
|
177 if(limit&0x3f) { |
|
178 // Mixed-value block of 64 code points. |
|
179 limit>>=6; |
|
180 bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); |
|
181 limit=(limit+1)<<6; // Round up to the next block boundary. |
|
182 minStart=limit; // Ignore further ranges in this block. |
|
183 } |
|
184 } |
|
185 } |
|
186 |
|
187 if(limit==0x10000) { |
|
188 break; |
|
189 } |
|
190 |
|
191 start=list[listIndex++]; |
|
192 if(listIndex<listLength) { |
|
193 limit=list[listIndex++]; |
|
194 } else { |
|
195 limit=0x110000; |
|
196 } |
|
197 } |
|
198 } |
|
199 |
|
200 /* |
|
201 * Override some bits and bytes to the result of contains(FFFD) |
|
202 * for faster validity checking at runtime. |
|
203 * No need to set 0 values where they were reset to 0 in the constructor |
|
204 * and not modified by initBits(). |
|
205 * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) |
|
206 * Need to set 0 values for surrogates D800..DFFF. |
|
207 */ |
|
208 void BMPSet::overrideIllegal() { |
|
209 uint32_t bits, mask; |
|
210 int32_t i; |
|
211 |
|
212 if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { |
|
213 // contains(FFFD)==TRUE |
|
214 for(i=0x80; i<0xc0; ++i) { |
|
215 asciiBytes[i]=1; |
|
216 } |
|
217 |
|
218 bits=3; // Lead bytes 0xC0 and 0xC1. |
|
219 for(i=0; i<64; ++i) { |
|
220 table7FF[i]|=bits; |
|
221 } |
|
222 |
|
223 bits=1; // Lead byte 0xE0. |
|
224 for(i=0; i<32; ++i) { // First half of 4k block. |
|
225 bmpBlockBits[i]|=bits; |
|
226 } |
|
227 |
|
228 mask=~(0x10001<<0xd); // Lead byte 0xED. |
|
229 bits=1<<0xd; |
|
230 for(i=32; i<64; ++i) { // Second half of 4k block. |
|
231 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; |
|
232 } |
|
233 } else { |
|
234 // contains(FFFD)==FALSE |
|
235 mask=~(0x10001<<0xd); // Lead byte 0xED. |
|
236 for(i=32; i<64; ++i) { // Second half of 4k block. |
|
237 bmpBlockBits[i]&=mask; |
|
238 } |
|
239 } |
|
240 } |
|
241 |
|
242 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { |
|
243 /* Examples: |
|
244 findCodePoint(c) |
|
245 set list[] c=0 1 3 4 7 8 |
|
246 === ============== =========== |
|
247 [] [110000] 0 0 0 0 0 0 |
|
248 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 |
|
249 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 |
|
250 [:Any:] [0, 110000] 1 1 1 1 1 1 |
|
251 */ |
|
252 |
|
253 // Return the smallest i such that c < list[i]. Assume |
|
254 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). |
|
255 if (c < list[lo]) |
|
256 return lo; |
|
257 // High runner test. c is often after the last range, so an |
|
258 // initial check for this condition pays off. |
|
259 if (lo >= hi || c >= list[hi-1]) |
|
260 return hi; |
|
261 // invariant: c >= list[lo] |
|
262 // invariant: c < list[hi] |
|
263 for (;;) { |
|
264 int32_t i = (lo + hi) >> 1; |
|
265 if (i == lo) { |
|
266 break; // Found! |
|
267 } else if (c < list[i]) { |
|
268 hi = i; |
|
269 } else { |
|
270 lo = i; |
|
271 } |
|
272 } |
|
273 return hi; |
|
274 } |
|
275 |
|
276 UBool |
|
277 BMPSet::contains(UChar32 c) const { |
|
278 if((uint32_t)c<=0x7f) { |
|
279 return (UBool)asciiBytes[c]; |
|
280 } else if((uint32_t)c<=0x7ff) { |
|
281 return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); |
|
282 } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { |
|
283 int lead=c>>12; |
|
284 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
|
285 if(twoBits<=1) { |
|
286 // All 64 code points with the same bits 15..6 |
|
287 // are either in the set or not. |
|
288 return (UBool)twoBits; |
|
289 } else { |
|
290 // Look up the code point in its 4k block of code points. |
|
291 return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); |
|
292 } |
|
293 } else if((uint32_t)c<=0x10ffff) { |
|
294 // surrogate or supplementary code point |
|
295 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); |
|
296 } else { |
|
297 // Out-of-range code points get FALSE, consistent with long-standing |
|
298 // behavior of UnicodeSet::contains(c). |
|
299 return FALSE; |
|
300 } |
|
301 } |
|
302 |
|
303 /* |
|
304 * Check for sufficient length for trail unit for each surrogate pair. |
|
305 * Handle single surrogates as surrogate code points as usual in ICU. |
|
306 */ |
|
307 const UChar * |
|
308 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { |
|
309 UChar c, c2; |
|
310 |
|
311 if(spanCondition) { |
|
312 // span |
|
313 do { |
|
314 c=*s; |
|
315 if(c<=0x7f) { |
|
316 if(!asciiBytes[c]) { |
|
317 break; |
|
318 } |
|
319 } else if(c<=0x7ff) { |
|
320 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { |
|
321 break; |
|
322 } |
|
323 } else if(c<0xd800 || c>=0xe000) { |
|
324 int lead=c>>12; |
|
325 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
|
326 if(twoBits<=1) { |
|
327 // All 64 code points with the same bits 15..6 |
|
328 // are either in the set or not. |
|
329 if(twoBits==0) { |
|
330 break; |
|
331 } |
|
332 } else { |
|
333 // Look up the code point in its 4k block of code points. |
|
334 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
|
335 break; |
|
336 } |
|
337 } |
|
338 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { |
|
339 // surrogate code point |
|
340 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
|
341 break; |
|
342 } |
|
343 } else { |
|
344 // surrogate pair |
|
345 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { |
|
346 break; |
|
347 } |
|
348 ++s; |
|
349 } |
|
350 } while(++s<limit); |
|
351 } else { |
|
352 // span not |
|
353 do { |
|
354 c=*s; |
|
355 if(c<=0x7f) { |
|
356 if(asciiBytes[c]) { |
|
357 break; |
|
358 } |
|
359 } else if(c<=0x7ff) { |
|
360 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { |
|
361 break; |
|
362 } |
|
363 } else if(c<0xd800 || c>=0xe000) { |
|
364 int lead=c>>12; |
|
365 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
|
366 if(twoBits<=1) { |
|
367 // All 64 code points with the same bits 15..6 |
|
368 // are either in the set or not. |
|
369 if(twoBits!=0) { |
|
370 break; |
|
371 } |
|
372 } else { |
|
373 // Look up the code point in its 4k block of code points. |
|
374 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
|
375 break; |
|
376 } |
|
377 } |
|
378 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { |
|
379 // surrogate code point |
|
380 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
|
381 break; |
|
382 } |
|
383 } else { |
|
384 // surrogate pair |
|
385 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { |
|
386 break; |
|
387 } |
|
388 ++s; |
|
389 } |
|
390 } while(++s<limit); |
|
391 } |
|
392 return s; |
|
393 } |
|
394 |
|
395 /* Symmetrical with span(). */ |
|
396 const UChar * |
|
397 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { |
|
398 UChar c, c2; |
|
399 |
|
400 if(spanCondition) { |
|
401 // span |
|
402 for(;;) { |
|
403 c=*(--limit); |
|
404 if(c<=0x7f) { |
|
405 if(!asciiBytes[c]) { |
|
406 break; |
|
407 } |
|
408 } else if(c<=0x7ff) { |
|
409 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { |
|
410 break; |
|
411 } |
|
412 } else if(c<0xd800 || c>=0xe000) { |
|
413 int lead=c>>12; |
|
414 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
|
415 if(twoBits<=1) { |
|
416 // All 64 code points with the same bits 15..6 |
|
417 // are either in the set or not. |
|
418 if(twoBits==0) { |
|
419 break; |
|
420 } |
|
421 } else { |
|
422 // Look up the code point in its 4k block of code points. |
|
423 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
|
424 break; |
|
425 } |
|
426 } |
|
427 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { |
|
428 // surrogate code point |
|
429 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
|
430 break; |
|
431 } |
|
432 } else { |
|
433 // surrogate pair |
|
434 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { |
|
435 break; |
|
436 } |
|
437 --limit; |
|
438 } |
|
439 if(s==limit) { |
|
440 return s; |
|
441 } |
|
442 } |
|
443 } else { |
|
444 // span not |
|
445 for(;;) { |
|
446 c=*(--limit); |
|
447 if(c<=0x7f) { |
|
448 if(asciiBytes[c]) { |
|
449 break; |
|
450 } |
|
451 } else if(c<=0x7ff) { |
|
452 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { |
|
453 break; |
|
454 } |
|
455 } else if(c<0xd800 || c>=0xe000) { |
|
456 int lead=c>>12; |
|
457 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
|
458 if(twoBits<=1) { |
|
459 // All 64 code points with the same bits 15..6 |
|
460 // are either in the set or not. |
|
461 if(twoBits!=0) { |
|
462 break; |
|
463 } |
|
464 } else { |
|
465 // Look up the code point in its 4k block of code points. |
|
466 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
|
467 break; |
|
468 } |
|
469 } |
|
470 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { |
|
471 // surrogate code point |
|
472 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
|
473 break; |
|
474 } |
|
475 } else { |
|
476 // surrogate pair |
|
477 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { |
|
478 break; |
|
479 } |
|
480 --limit; |
|
481 } |
|
482 if(s==limit) { |
|
483 return s; |
|
484 } |
|
485 } |
|
486 } |
|
487 return limit+1; |
|
488 } |
|
489 |
|
490 /* |
|
491 * Precheck for sufficient trail bytes at end of string only once per span. |
|
492 * Check validity. |
|
493 */ |
|
494 const uint8_t * |
|
495 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { |
|
496 const uint8_t *limit=s+length; |
|
497 uint8_t b=*s; |
|
498 if((int8_t)b>=0) { |
|
499 // Initial all-ASCII span. |
|
500 if(spanCondition) { |
|
501 do { |
|
502 if(!asciiBytes[b] || ++s==limit) { |
|
503 return s; |
|
504 } |
|
505 b=*s; |
|
506 } while((int8_t)b>=0); |
|
507 } else { |
|
508 do { |
|
509 if(asciiBytes[b] || ++s==limit) { |
|
510 return s; |
|
511 } |
|
512 b=*s; |
|
513 } while((int8_t)b>=0); |
|
514 } |
|
515 length=(int32_t)(limit-s); |
|
516 } |
|
517 |
|
518 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
|
519 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
|
520 } |
|
521 |
|
522 const uint8_t *limit0=limit; |
|
523 |
|
524 /* |
|
525 * Make sure that the last 1/2/3/4-byte sequence before limit is complete |
|
526 * or runs into a lead byte. |
|
527 * In the span loop compare s with limit only once |
|
528 * per multi-byte character. |
|
529 * |
|
530 * Give a trailing illegal sequence the same value as the result of contains(FFFD), |
|
531 * including it if that is part of the span, otherwise set limit0 to before |
|
532 * the truncated sequence. |
|
533 */ |
|
534 b=*(limit-1); |
|
535 if((int8_t)b<0) { |
|
536 // b>=0x80: lead or trail byte |
|
537 if(b<0xc0) { |
|
538 // single trail byte, check for preceding 3- or 4-byte lead byte |
|
539 if(length>=2 && (b=*(limit-2))>=0xe0) { |
|
540 limit-=2; |
|
541 if(asciiBytes[0x80]!=spanCondition) { |
|
542 limit0=limit; |
|
543 } |
|
544 } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { |
|
545 // 4-byte lead byte with only two trail bytes |
|
546 limit-=3; |
|
547 if(asciiBytes[0x80]!=spanCondition) { |
|
548 limit0=limit; |
|
549 } |
|
550 } |
|
551 } else { |
|
552 // lead byte with no trail bytes |
|
553 --limit; |
|
554 if(asciiBytes[0x80]!=spanCondition) { |
|
555 limit0=limit; |
|
556 } |
|
557 } |
|
558 } |
|
559 |
|
560 uint8_t t1, t2, t3; |
|
561 |
|
562 while(s<limit) { |
|
563 b=*s; |
|
564 if(b<0xc0) { |
|
565 // ASCII; or trail bytes with the result of contains(FFFD). |
|
566 if(spanCondition) { |
|
567 do { |
|
568 if(!asciiBytes[b]) { |
|
569 return s; |
|
570 } else if(++s==limit) { |
|
571 return limit0; |
|
572 } |
|
573 b=*s; |
|
574 } while(b<0xc0); |
|
575 } else { |
|
576 do { |
|
577 if(asciiBytes[b]) { |
|
578 return s; |
|
579 } else if(++s==limit) { |
|
580 return limit0; |
|
581 } |
|
582 b=*s; |
|
583 } while(b<0xc0); |
|
584 } |
|
585 } |
|
586 ++s; // Advance past the lead byte. |
|
587 if(b>=0xe0) { |
|
588 if(b<0xf0) { |
|
589 if( /* handle U+0000..U+FFFF inline */ |
|
590 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && |
|
591 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f |
|
592 ) { |
|
593 b&=0xf; |
|
594 uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; |
|
595 if(twoBits<=1) { |
|
596 // All 64 code points with this lead byte and middle trail byte |
|
597 // are either in the set or not. |
|
598 if(twoBits!=(uint32_t)spanCondition) { |
|
599 return s-1; |
|
600 } |
|
601 } else { |
|
602 // Look up the code point in its 4k block of code points. |
|
603 UChar32 c=(b<<12)|(t1<<6)|t2; |
|
604 if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { |
|
605 return s-1; |
|
606 } |
|
607 } |
|
608 s+=2; |
|
609 continue; |
|
610 } |
|
611 } else if( /* handle U+10000..U+10FFFF inline */ |
|
612 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && |
|
613 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && |
|
614 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f |
|
615 ) { |
|
616 // Give an illegal sequence the same value as the result of contains(FFFD). |
|
617 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; |
|
618 if( ( (0x10000<=c && c<=0x10ffff) ? |
|
619 containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : |
|
620 asciiBytes[0x80] |
|
621 ) != spanCondition |
|
622 ) { |
|
623 return s-1; |
|
624 } |
|
625 s+=3; |
|
626 continue; |
|
627 } |
|
628 } else /* 0xc0<=b<0xe0 */ { |
|
629 if( /* handle U+0000..U+07FF inline */ |
|
630 (t1=(uint8_t)(*s-0x80)) <= 0x3f |
|
631 ) { |
|
632 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { |
|
633 return s-1; |
|
634 } |
|
635 ++s; |
|
636 continue; |
|
637 } |
|
638 } |
|
639 |
|
640 // Give an illegal sequence the same value as the result of contains(FFFD). |
|
641 // Handle each byte of an illegal sequence separately to simplify the code; |
|
642 // no need to optimize error handling. |
|
643 if(asciiBytes[0x80]!=spanCondition) { |
|
644 return s-1; |
|
645 } |
|
646 } |
|
647 |
|
648 return limit0; |
|
649 } |
|
650 |
|
651 /* |
|
652 * While going backwards through UTF-8 optimize only for ASCII. |
|
653 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not |
|
654 * possible to tell from the last byte in a multi-byte sequence how many |
|
655 * preceding bytes there should be. Therefore, going backwards through UTF-8 |
|
656 * is much harder than going forward. |
|
657 */ |
|
658 int32_t |
|
659 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { |
|
660 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
|
661 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
|
662 } |
|
663 |
|
664 uint8_t b; |
|
665 |
|
666 do { |
|
667 b=s[--length]; |
|
668 if((int8_t)b>=0) { |
|
669 // ASCII sub-span |
|
670 if(spanCondition) { |
|
671 do { |
|
672 if(!asciiBytes[b]) { |
|
673 return length+1; |
|
674 } else if(length==0) { |
|
675 return 0; |
|
676 } |
|
677 b=s[--length]; |
|
678 } while((int8_t)b>=0); |
|
679 } else { |
|
680 do { |
|
681 if(asciiBytes[b]) { |
|
682 return length+1; |
|
683 } else if(length==0) { |
|
684 return 0; |
|
685 } |
|
686 b=s[--length]; |
|
687 } while((int8_t)b>=0); |
|
688 } |
|
689 } |
|
690 |
|
691 int32_t prev=length; |
|
692 UChar32 c; |
|
693 // trail byte: collect a multi-byte character |
|
694 // (or lead byte in last-trail position) |
|
695 c=utf8_prevCharSafeBody(s, 0, &length, b, -3); |
|
696 // c is a valid code point, not ASCII, not a surrogate |
|
697 if(c<=0x7ff) { |
|
698 if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { |
|
699 return prev+1; |
|
700 } |
|
701 } else if(c<=0xffff) { |
|
702 int lead=c>>12; |
|
703 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
|
704 if(twoBits<=1) { |
|
705 // All 64 code points with the same bits 15..6 |
|
706 // are either in the set or not. |
|
707 if(twoBits!=(uint32_t)spanCondition) { |
|
708 return prev+1; |
|
709 } |
|
710 } else { |
|
711 // Look up the code point in its 4k block of code points. |
|
712 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { |
|
713 return prev+1; |
|
714 } |
|
715 } |
|
716 } else { |
|
717 if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { |
|
718 return prev+1; |
|
719 } |
|
720 } |
|
721 } while(length>0); |
|
722 return 0; |
|
723 } |
|
724 |
|
725 U_NAMESPACE_END |