|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2002-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: uiter.cpp |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2002jan18 |
|
14 * created by: Markus W. Scherer |
|
15 */ |
|
16 |
|
17 #include "unicode/utypes.h" |
|
18 #include "unicode/ustring.h" |
|
19 #include "unicode/chariter.h" |
|
20 #include "unicode/rep.h" |
|
21 #include "unicode/uiter.h" |
|
22 #include "unicode/utf.h" |
|
23 #include "unicode/utf8.h" |
|
24 #include "unicode/utf16.h" |
|
25 #include "cstring.h" |
|
26 |
|
27 U_NAMESPACE_USE |
|
28 |
|
/* Nonzero if the integer expression n has its lowest bit clear. */
#define IS_EVEN(n) (((n)&1)==0)

/*
 * Nonzero if the address held in p is a multiple of 2.
 * The argument is parenthesized so that a compound expression such as s+1
 * is converted to an integer as a whole instead of having only its first
 * operand cast.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
|
31 |
|
32 U_CDECL_BEGIN |
|
33 |
|
34 /* No-Op UCharIterator implementation for illegal input --------------------- */ |
|
35 |
|
36 static int32_t U_CALLCONV |
|
37 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) { |
|
38 return 0; |
|
39 } |
|
40 |
|
41 static int32_t U_CALLCONV |
|
42 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) { |
|
43 return 0; |
|
44 } |
|
45 |
|
46 static UBool U_CALLCONV |
|
47 noopHasNext(UCharIterator * /*iter*/) { |
|
48 return FALSE; |
|
49 } |
|
50 |
|
51 static UChar32 U_CALLCONV |
|
52 noopCurrent(UCharIterator * /*iter*/) { |
|
53 return U_SENTINEL; |
|
54 } |
|
55 |
|
56 static uint32_t U_CALLCONV |
|
57 noopGetState(const UCharIterator * /*iter*/) { |
|
58 return UITER_NO_STATE; |
|
59 } |
|
60 |
|
61 static void U_CALLCONV |
|
62 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) { |
|
63 *pErrorCode=U_UNSUPPORTED_ERROR; |
|
64 } |
|
65 |
|
66 static const UCharIterator noopIterator={ |
|
67 0, 0, 0, 0, 0, 0, |
|
68 noopGetIndex, |
|
69 noopMove, |
|
70 noopHasNext, |
|
71 noopHasNext, |
|
72 noopCurrent, |
|
73 noopCurrent, |
|
74 noopCurrent, |
|
75 NULL, |
|
76 noopGetState, |
|
77 noopSetState |
|
78 }; |
|
79 |
|
80 /* UCharIterator implementation for simple strings -------------------------- */ |
|
81 |
|
82 /* |
|
83 * This is an implementation of a code unit (UChar) iterator |
|
84 * for UChar * strings. |
|
85 * |
|
86 * The UCharIterator.context field holds a pointer to the string. |
|
87 */ |
|
88 |
|
89 static int32_t U_CALLCONV |
|
90 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { |
|
91 switch(origin) { |
|
92 case UITER_ZERO: |
|
93 return 0; |
|
94 case UITER_START: |
|
95 return iter->start; |
|
96 case UITER_CURRENT: |
|
97 return iter->index; |
|
98 case UITER_LIMIT: |
|
99 return iter->limit; |
|
100 case UITER_LENGTH: |
|
101 return iter->length; |
|
102 default: |
|
103 /* not a valid origin */ |
|
104 /* Should never get here! */ |
|
105 return -1; |
|
106 } |
|
107 } |
|
108 |
|
109 static int32_t U_CALLCONV |
|
110 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { |
|
111 int32_t pos; |
|
112 |
|
113 switch(origin) { |
|
114 case UITER_ZERO: |
|
115 pos=delta; |
|
116 break; |
|
117 case UITER_START: |
|
118 pos=iter->start+delta; |
|
119 break; |
|
120 case UITER_CURRENT: |
|
121 pos=iter->index+delta; |
|
122 break; |
|
123 case UITER_LIMIT: |
|
124 pos=iter->limit+delta; |
|
125 break; |
|
126 case UITER_LENGTH: |
|
127 pos=iter->length+delta; |
|
128 break; |
|
129 default: |
|
130 return -1; /* Error */ |
|
131 } |
|
132 |
|
133 if(pos<iter->start) { |
|
134 pos=iter->start; |
|
135 } else if(pos>iter->limit) { |
|
136 pos=iter->limit; |
|
137 } |
|
138 |
|
139 return iter->index=pos; |
|
140 } |
|
141 |
|
142 static UBool U_CALLCONV |
|
143 stringIteratorHasNext(UCharIterator *iter) { |
|
144 return iter->index<iter->limit; |
|
145 } |
|
146 |
|
147 static UBool U_CALLCONV |
|
148 stringIteratorHasPrevious(UCharIterator *iter) { |
|
149 return iter->index>iter->start; |
|
150 } |
|
151 |
|
152 static UChar32 U_CALLCONV |
|
153 stringIteratorCurrent(UCharIterator *iter) { |
|
154 if(iter->index<iter->limit) { |
|
155 return ((const UChar *)(iter->context))[iter->index]; |
|
156 } else { |
|
157 return U_SENTINEL; |
|
158 } |
|
159 } |
|
160 |
|
161 static UChar32 U_CALLCONV |
|
162 stringIteratorNext(UCharIterator *iter) { |
|
163 if(iter->index<iter->limit) { |
|
164 return ((const UChar *)(iter->context))[iter->index++]; |
|
165 } else { |
|
166 return U_SENTINEL; |
|
167 } |
|
168 } |
|
169 |
|
170 static UChar32 U_CALLCONV |
|
171 stringIteratorPrevious(UCharIterator *iter) { |
|
172 if(iter->index>iter->start) { |
|
173 return ((const UChar *)(iter->context))[--iter->index]; |
|
174 } else { |
|
175 return U_SENTINEL; |
|
176 } |
|
177 } |
|
178 |
|
179 static uint32_t U_CALLCONV |
|
180 stringIteratorGetState(const UCharIterator *iter) { |
|
181 return (uint32_t)iter->index; |
|
182 } |
|
183 |
|
184 static void U_CALLCONV |
|
185 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { |
|
186 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
187 /* do nothing */ |
|
188 } else if(iter==NULL) { |
|
189 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
190 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) { |
|
191 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
192 } else { |
|
193 iter->index=(int32_t)state; |
|
194 } |
|
195 } |
|
196 |
|
197 static const UCharIterator stringIterator={ |
|
198 0, 0, 0, 0, 0, 0, |
|
199 stringIteratorGetIndex, |
|
200 stringIteratorMove, |
|
201 stringIteratorHasNext, |
|
202 stringIteratorHasPrevious, |
|
203 stringIteratorCurrent, |
|
204 stringIteratorNext, |
|
205 stringIteratorPrevious, |
|
206 NULL, |
|
207 stringIteratorGetState, |
|
208 stringIteratorSetState |
|
209 }; |
|
210 |
|
211 U_CAPI void U_EXPORT2 |
|
212 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { |
|
213 if(iter!=0) { |
|
214 if(s!=0 && length>=-1) { |
|
215 *iter=stringIterator; |
|
216 iter->context=s; |
|
217 if(length>=0) { |
|
218 iter->length=length; |
|
219 } else { |
|
220 iter->length=u_strlen(s); |
|
221 } |
|
222 iter->limit=iter->length; |
|
223 } else { |
|
224 *iter=noopIterator; |
|
225 } |
|
226 } |
|
227 } |
|
228 |
|
229 /* UCharIterator implementation for UTF-16BE strings ------------------------ */ |
|
230 |
|
231 /* |
|
232 * This is an implementation of a code unit (UChar) iterator |
|
233 * for UTF-16BE strings, i.e., strings in byte-vectors where |
|
234 * each UChar is stored as a big-endian pair of bytes. |
|
235 * |
|
236 * The UCharIterator.context field holds a pointer to the string. |
|
237 * Everything works just like with a normal UChar iterator (uiter_setString), |
|
238 * except that UChars are assembled from byte pairs. |
|
239 */ |
|
240 |
|
241 /* internal helper function */ |
|
242 static inline UChar32 |
|
243 utf16BEIteratorGet(UCharIterator *iter, int32_t index) { |
|
244 const uint8_t *p=(const uint8_t *)iter->context; |
|
245 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1]; |
|
246 } |
|
247 |
|
248 static UChar32 U_CALLCONV |
|
249 utf16BEIteratorCurrent(UCharIterator *iter) { |
|
250 int32_t index; |
|
251 |
|
252 if((index=iter->index)<iter->limit) { |
|
253 return utf16BEIteratorGet(iter, index); |
|
254 } else { |
|
255 return U_SENTINEL; |
|
256 } |
|
257 } |
|
258 |
|
259 static UChar32 U_CALLCONV |
|
260 utf16BEIteratorNext(UCharIterator *iter) { |
|
261 int32_t index; |
|
262 |
|
263 if((index=iter->index)<iter->limit) { |
|
264 iter->index=index+1; |
|
265 return utf16BEIteratorGet(iter, index); |
|
266 } else { |
|
267 return U_SENTINEL; |
|
268 } |
|
269 } |
|
270 |
|
271 static UChar32 U_CALLCONV |
|
272 utf16BEIteratorPrevious(UCharIterator *iter) { |
|
273 int32_t index; |
|
274 |
|
275 if((index=iter->index)>iter->start) { |
|
276 iter->index=--index; |
|
277 return utf16BEIteratorGet(iter, index); |
|
278 } else { |
|
279 return U_SENTINEL; |
|
280 } |
|
281 } |
|
282 |
|
283 static const UCharIterator utf16BEIterator={ |
|
284 0, 0, 0, 0, 0, 0, |
|
285 stringIteratorGetIndex, |
|
286 stringIteratorMove, |
|
287 stringIteratorHasNext, |
|
288 stringIteratorHasPrevious, |
|
289 utf16BEIteratorCurrent, |
|
290 utf16BEIteratorNext, |
|
291 utf16BEIteratorPrevious, |
|
292 NULL, |
|
293 stringIteratorGetState, |
|
294 stringIteratorSetState |
|
295 }; |
|
296 |
|
297 /* |
|
298 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL, |
|
299 * i.e., before a pair of 0 bytes where the first 0 byte is at an even |
|
300 * offset from s. |
|
301 */ |
|
302 static int32_t |
|
303 utf16BE_strlen(const char *s) { |
|
304 if(IS_POINTER_EVEN(s)) { |
|
305 /* |
|
306 * even-aligned, call u_strlen(s) |
|
307 * we are probably on a little-endian machine, but searching for UChar NUL |
|
308 * does not care about endianness |
|
309 */ |
|
310 return u_strlen((const UChar *)s); |
|
311 } else { |
|
312 /* odd-aligned, search for pair of 0 bytes */ |
|
313 const char *p=s; |
|
314 |
|
315 while(!(*p==0 && p[1]==0)) { |
|
316 p+=2; |
|
317 } |
|
318 return (int32_t)((p-s)/2); |
|
319 } |
|
320 } |
|
321 |
|
322 U_CAPI void U_EXPORT2 |
|
323 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) { |
|
324 if(iter!=NULL) { |
|
325 /* allow only even-length strings (the input length counts bytes) */ |
|
326 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) { |
|
327 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */ |
|
328 length>>=1; |
|
329 |
|
330 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) { |
|
331 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */ |
|
332 uiter_setString(iter, (const UChar *)s, length); |
|
333 return; |
|
334 } |
|
335 |
|
336 *iter=utf16BEIterator; |
|
337 iter->context=s; |
|
338 if(length>=0) { |
|
339 iter->length=length; |
|
340 } else { |
|
341 iter->length=utf16BE_strlen(s); |
|
342 } |
|
343 iter->limit=iter->length; |
|
344 } else { |
|
345 *iter=noopIterator; |
|
346 } |
|
347 } |
|
348 } |
|
349 |
|
350 /* UCharIterator wrapper around CharacterIterator --------------------------- */ |
|
351 |
|
352 /* |
|
353 * This is wrapper code around a C++ CharacterIterator to |
|
354 * look like a C UCharIterator. |
|
355 * |
|
356 * The UCharIterator.context field holds a pointer to the CharacterIterator. |
|
357 */ |
|
358 |
|
359 static int32_t U_CALLCONV |
|
360 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { |
|
361 switch(origin) { |
|
362 case UITER_ZERO: |
|
363 return 0; |
|
364 case UITER_START: |
|
365 return ((CharacterIterator *)(iter->context))->startIndex(); |
|
366 case UITER_CURRENT: |
|
367 return ((CharacterIterator *)(iter->context))->getIndex(); |
|
368 case UITER_LIMIT: |
|
369 return ((CharacterIterator *)(iter->context))->endIndex(); |
|
370 case UITER_LENGTH: |
|
371 return ((CharacterIterator *)(iter->context))->getLength(); |
|
372 default: |
|
373 /* not a valid origin */ |
|
374 /* Should never get here! */ |
|
375 return -1; |
|
376 } |
|
377 } |
|
378 |
|
379 static int32_t U_CALLCONV |
|
380 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { |
|
381 switch(origin) { |
|
382 case UITER_ZERO: |
|
383 ((CharacterIterator *)(iter->context))->setIndex(delta); |
|
384 return ((CharacterIterator *)(iter->context))->getIndex(); |
|
385 case UITER_START: |
|
386 case UITER_CURRENT: |
|
387 case UITER_LIMIT: |
|
388 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin); |
|
389 case UITER_LENGTH: |
|
390 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta); |
|
391 return ((CharacterIterator *)(iter->context))->getIndex(); |
|
392 default: |
|
393 /* not a valid origin */ |
|
394 /* Should never get here! */ |
|
395 return -1; |
|
396 } |
|
397 } |
|
398 |
|
399 static UBool U_CALLCONV |
|
400 characterIteratorHasNext(UCharIterator *iter) { |
|
401 return ((CharacterIterator *)(iter->context))->hasNext(); |
|
402 } |
|
403 |
|
404 static UBool U_CALLCONV |
|
405 characterIteratorHasPrevious(UCharIterator *iter) { |
|
406 return ((CharacterIterator *)(iter->context))->hasPrevious(); |
|
407 } |
|
408 |
|
409 static UChar32 U_CALLCONV |
|
410 characterIteratorCurrent(UCharIterator *iter) { |
|
411 UChar32 c; |
|
412 |
|
413 c=((CharacterIterator *)(iter->context))->current(); |
|
414 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) { |
|
415 return c; |
|
416 } else { |
|
417 return U_SENTINEL; |
|
418 } |
|
419 } |
|
420 |
|
421 static UChar32 U_CALLCONV |
|
422 characterIteratorNext(UCharIterator *iter) { |
|
423 if(((CharacterIterator *)(iter->context))->hasNext()) { |
|
424 return ((CharacterIterator *)(iter->context))->nextPostInc(); |
|
425 } else { |
|
426 return U_SENTINEL; |
|
427 } |
|
428 } |
|
429 |
|
430 static UChar32 U_CALLCONV |
|
431 characterIteratorPrevious(UCharIterator *iter) { |
|
432 if(((CharacterIterator *)(iter->context))->hasPrevious()) { |
|
433 return ((CharacterIterator *)(iter->context))->previous(); |
|
434 } else { |
|
435 return U_SENTINEL; |
|
436 } |
|
437 } |
|
438 |
|
439 static uint32_t U_CALLCONV |
|
440 characterIteratorGetState(const UCharIterator *iter) { |
|
441 return ((CharacterIterator *)(iter->context))->getIndex(); |
|
442 } |
|
443 |
|
444 static void U_CALLCONV |
|
445 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { |
|
446 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
447 /* do nothing */ |
|
448 } else if(iter==NULL || iter->context==NULL) { |
|
449 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
450 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) { |
|
451 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
452 } else { |
|
453 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state); |
|
454 } |
|
455 } |
|
456 |
|
457 static const UCharIterator characterIteratorWrapper={ |
|
458 0, 0, 0, 0, 0, 0, |
|
459 characterIteratorGetIndex, |
|
460 characterIteratorMove, |
|
461 characterIteratorHasNext, |
|
462 characterIteratorHasPrevious, |
|
463 characterIteratorCurrent, |
|
464 characterIteratorNext, |
|
465 characterIteratorPrevious, |
|
466 NULL, |
|
467 characterIteratorGetState, |
|
468 characterIteratorSetState |
|
469 }; |
|
470 |
|
471 U_CAPI void U_EXPORT2 |
|
472 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) { |
|
473 if(iter!=0) { |
|
474 if(charIter!=0) { |
|
475 *iter=characterIteratorWrapper; |
|
476 iter->context=charIter; |
|
477 } else { |
|
478 *iter=noopIterator; |
|
479 } |
|
480 } |
|
481 } |
|
482 |
|
483 /* UCharIterator wrapper around Replaceable --------------------------------- */ |
|
484 |
|
485 /* |
|
486 * This is an implementation of a code unit (UChar) iterator |
|
487 * based on a Replaceable object. |
|
488 * |
|
489 * The UCharIterator.context field holds a pointer to the Replaceable. |
|
490 * UCharIterator.length and UCharIterator.index hold Replaceable.length() |
|
491 * and the iteration index. |
|
492 */ |
|
493 |
|
494 static UChar32 U_CALLCONV |
|
495 replaceableIteratorCurrent(UCharIterator *iter) { |
|
496 if(iter->index<iter->limit) { |
|
497 return ((Replaceable *)(iter->context))->charAt(iter->index); |
|
498 } else { |
|
499 return U_SENTINEL; |
|
500 } |
|
501 } |
|
502 |
|
503 static UChar32 U_CALLCONV |
|
504 replaceableIteratorNext(UCharIterator *iter) { |
|
505 if(iter->index<iter->limit) { |
|
506 return ((Replaceable *)(iter->context))->charAt(iter->index++); |
|
507 } else { |
|
508 return U_SENTINEL; |
|
509 } |
|
510 } |
|
511 |
|
512 static UChar32 U_CALLCONV |
|
513 replaceableIteratorPrevious(UCharIterator *iter) { |
|
514 if(iter->index>iter->start) { |
|
515 return ((Replaceable *)(iter->context))->charAt(--iter->index); |
|
516 } else { |
|
517 return U_SENTINEL; |
|
518 } |
|
519 } |
|
520 |
|
521 static const UCharIterator replaceableIterator={ |
|
522 0, 0, 0, 0, 0, 0, |
|
523 stringIteratorGetIndex, |
|
524 stringIteratorMove, |
|
525 stringIteratorHasNext, |
|
526 stringIteratorHasPrevious, |
|
527 replaceableIteratorCurrent, |
|
528 replaceableIteratorNext, |
|
529 replaceableIteratorPrevious, |
|
530 NULL, |
|
531 stringIteratorGetState, |
|
532 stringIteratorSetState |
|
533 }; |
|
534 |
|
535 U_CAPI void U_EXPORT2 |
|
536 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) { |
|
537 if(iter!=0) { |
|
538 if(rep!=0) { |
|
539 *iter=replaceableIterator; |
|
540 iter->context=rep; |
|
541 iter->limit=iter->length=rep->length(); |
|
542 } else { |
|
543 *iter=noopIterator; |
|
544 } |
|
545 } |
|
546 } |
|
547 |
|
548 /* UCharIterator implementation for UTF-8 strings --------------------------- */ |
|
549 |
|
550 /* |
|
551 * Possible, probably necessary only for an implementation for arbitrary |
|
552 * converters: |
|
553 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text. |
|
554 * This would require to turn reservedFn into a close function and |
|
555 * to introduce a uiter_close(iter). |
|
556 */ |
|
557 |
|
558 #define UITER_CNV_CAPACITY 16 |
|
559 |
|
560 /* |
|
561 * Minimal implementation: |
|
562 * Maintain a single-UChar buffer for an additional surrogate. |
|
563 * The caller must not modify start and limit because they are used internally. |
|
564 * |
|
565 * Use UCharIterator fields as follows: |
|
566 * context pointer to UTF-8 string |
|
567 * length UTF-16 length of the string; -1 until lazy evaluation |
|
568 * start current UTF-8 index |
|
569 * index current UTF-16 index; may be -1="unknown" after setState() |
|
570 * limit UTF-8 length of the string |
|
571 * reservedField supplementary code point |
|
572 * |
|
573 * Since UCharIterator delivers 16-bit code units, the iteration can be |
|
574 * currently in the middle of the byte sequence for a supplementary code point. |
|
575 * In this case, reservedField will contain that code point and start will |
|
576 * point to after the corresponding byte sequence. The UTF-16 index will be |
|
577 * one less than what it would otherwise be corresponding to the UTF-8 index. |
|
578 * Otherwise, reservedField will be 0. |
|
579 */ |
|
580 |
|
581 /* |
|
582 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: |
|
583 * Add implementations that do not call strlen() for iteration but check for NUL. |
|
584 */ |
|
585 |
|
586 static int32_t U_CALLCONV |
|
587 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { |
|
588 switch(origin) { |
|
589 case UITER_ZERO: |
|
590 case UITER_START: |
|
591 return 0; |
|
592 case UITER_CURRENT: |
|
593 if(iter->index<0) { |
|
594 /* the current UTF-16 index is unknown after setState(), count from the beginning */ |
|
595 const uint8_t *s; |
|
596 UChar32 c; |
|
597 int32_t i, limit, index; |
|
598 |
|
599 s=(const uint8_t *)iter->context; |
|
600 i=index=0; |
|
601 limit=iter->start; /* count up to the UTF-8 index */ |
|
602 while(i<limit) { |
|
603 U8_NEXT_OR_FFFD(s, i, limit, c); |
|
604 index+=U16_LENGTH(c); |
|
605 } |
|
606 |
|
607 iter->start=i; /* just in case setState() did not get us to a code point boundary */ |
|
608 if(i==iter->limit) { |
|
609 iter->length=index; /* in case it was <0 or wrong */ |
|
610 } |
|
611 if(iter->reservedField!=0) { |
|
612 --index; /* we are in the middle of a supplementary code point */ |
|
613 } |
|
614 iter->index=index; |
|
615 } |
|
616 return iter->index; |
|
617 case UITER_LIMIT: |
|
618 case UITER_LENGTH: |
|
619 if(iter->length<0) { |
|
620 const uint8_t *s; |
|
621 UChar32 c; |
|
622 int32_t i, limit, length; |
|
623 |
|
624 s=(const uint8_t *)iter->context; |
|
625 if(iter->index<0) { |
|
626 /* |
|
627 * the current UTF-16 index is unknown after setState(), |
|
628 * we must first count from the beginning to here |
|
629 */ |
|
630 i=length=0; |
|
631 limit=iter->start; |
|
632 |
|
633 /* count from the beginning to the current index */ |
|
634 while(i<limit) { |
|
635 U8_NEXT_OR_FFFD(s, i, limit, c); |
|
636 length+=U16_LENGTH(c); |
|
637 } |
|
638 |
|
639 /* assume i==limit==iter->start, set the UTF-16 index */ |
|
640 iter->start=i; /* just in case setState() did not get us to a code point boundary */ |
|
641 iter->index= iter->reservedField!=0 ? length-1 : length; |
|
642 } else { |
|
643 i=iter->start; |
|
644 length=iter->index; |
|
645 if(iter->reservedField!=0) { |
|
646 ++length; |
|
647 } |
|
648 } |
|
649 |
|
650 /* count from the current index to the end */ |
|
651 limit=iter->limit; |
|
652 while(i<limit) { |
|
653 U8_NEXT_OR_FFFD(s, i, limit, c); |
|
654 length+=U16_LENGTH(c); |
|
655 } |
|
656 iter->length=length; |
|
657 } |
|
658 return iter->length; |
|
659 default: |
|
660 /* not a valid origin */ |
|
661 /* Should never get here! */ |
|
662 return -1; |
|
663 } |
|
664 } |
|
665 |
|
666 static int32_t U_CALLCONV |
|
667 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { |
|
668 const uint8_t *s; |
|
669 UChar32 c; |
|
670 int32_t pos; /* requested UTF-16 index */ |
|
671 int32_t i; /* UTF-8 index */ |
|
672 UBool havePos; |
|
673 |
|
674 /* calculate the requested UTF-16 index */ |
|
675 switch(origin) { |
|
676 case UITER_ZERO: |
|
677 case UITER_START: |
|
678 pos=delta; |
|
679 havePos=TRUE; |
|
680 /* iter->index<0 (unknown) is possible */ |
|
681 break; |
|
682 case UITER_CURRENT: |
|
683 if(iter->index>=0) { |
|
684 pos=iter->index+delta; |
|
685 havePos=TRUE; |
|
686 } else { |
|
687 /* the current UTF-16 index is unknown after setState(), use only delta */ |
|
688 pos=0; |
|
689 havePos=FALSE; |
|
690 } |
|
691 break; |
|
692 case UITER_LIMIT: |
|
693 case UITER_LENGTH: |
|
694 if(iter->length>=0) { |
|
695 pos=iter->length+delta; |
|
696 havePos=TRUE; |
|
697 } else { |
|
698 /* pin to the end, avoid counting the length */ |
|
699 iter->index=-1; |
|
700 iter->start=iter->limit; |
|
701 iter->reservedField=0; |
|
702 if(delta>=0) { |
|
703 return UITER_UNKNOWN_INDEX; |
|
704 } else { |
|
705 /* the current UTF-16 index is unknown, use only delta */ |
|
706 pos=0; |
|
707 havePos=FALSE; |
|
708 } |
|
709 } |
|
710 break; |
|
711 default: |
|
712 return -1; /* Error */ |
|
713 } |
|
714 |
|
715 if(havePos) { |
|
716 /* shortcuts: pinning to the edges of the string */ |
|
717 if(pos<=0) { |
|
718 iter->index=iter->start=iter->reservedField=0; |
|
719 return 0; |
|
720 } else if(iter->length>=0 && pos>=iter->length) { |
|
721 iter->index=iter->length; |
|
722 iter->start=iter->limit; |
|
723 iter->reservedField=0; |
|
724 return iter->index; |
|
725 } |
|
726 |
|
727 /* minimize the number of U8_NEXT/PREV operations */ |
|
728 if(iter->index<0 || pos<iter->index/2) { |
|
729 /* go forward from the start instead of backward from the current index */ |
|
730 iter->index=iter->start=iter->reservedField=0; |
|
731 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { |
|
732 /* |
|
733 * if we have the UTF-16 index and length and the new position is |
|
734 * closer to the end than the current index, |
|
735 * then go backward from the end instead of forward from the current index |
|
736 */ |
|
737 iter->index=iter->length; |
|
738 iter->start=iter->limit; |
|
739 iter->reservedField=0; |
|
740 } |
|
741 |
|
742 delta=pos-iter->index; |
|
743 if(delta==0) { |
|
744 return iter->index; /* nothing to do */ |
|
745 } |
|
746 } else { |
|
747 /* move relative to unknown UTF-16 index */ |
|
748 if(delta==0) { |
|
749 return UITER_UNKNOWN_INDEX; /* nothing to do */ |
|
750 } else if(-delta>=iter->start) { |
|
751 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ |
|
752 iter->index=iter->start=iter->reservedField=0; |
|
753 return 0; |
|
754 } else if(delta>=(iter->limit-iter->start)) { |
|
755 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ |
|
756 iter->index=iter->length; /* may or may not be <0 (unknown) */ |
|
757 iter->start=iter->limit; |
|
758 iter->reservedField=0; |
|
759 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX; |
|
760 } |
|
761 } |
|
762 |
|
763 /* delta!=0 */ |
|
764 |
|
765 /* move towards the requested position, pin to the edges of the string */ |
|
766 s=(const uint8_t *)iter->context; |
|
767 pos=iter->index; /* could be <0 (unknown) */ |
|
768 i=iter->start; |
|
769 if(delta>0) { |
|
770 /* go forward */ |
|
771 int32_t limit=iter->limit; |
|
772 if(iter->reservedField!=0) { |
|
773 iter->reservedField=0; |
|
774 ++pos; |
|
775 --delta; |
|
776 } |
|
777 while(delta>0 && i<limit) { |
|
778 U8_NEXT_OR_FFFD(s, i, limit, c); |
|
779 if(c<=0xffff) { |
|
780 ++pos; |
|
781 --delta; |
|
782 } else if(delta>=2) { |
|
783 pos+=2; |
|
784 delta-=2; |
|
785 } else /* delta==1 */ { |
|
786 /* stop in the middle of a supplementary code point */ |
|
787 iter->reservedField=c; |
|
788 ++pos; |
|
789 break; /* delta=0; */ |
|
790 } |
|
791 } |
|
792 if(i==limit) { |
|
793 if(iter->length<0 && iter->index>=0) { |
|
794 iter->length= iter->reservedField==0 ? pos : pos+1; |
|
795 } else if(iter->index<0 && iter->length>=0) { |
|
796 iter->index= iter->reservedField==0 ? iter->length : iter->length-1; |
|
797 } |
|
798 } |
|
799 } else /* delta<0 */ { |
|
800 /* go backward */ |
|
801 if(iter->reservedField!=0) { |
|
802 iter->reservedField=0; |
|
803 i-=4; /* we stayed behind the supplementary code point; go before it now */ |
|
804 --pos; |
|
805 ++delta; |
|
806 } |
|
807 while(delta<0 && i>0) { |
|
808 U8_PREV_OR_FFFD(s, 0, i, c); |
|
809 if(c<=0xffff) { |
|
810 --pos; |
|
811 ++delta; |
|
812 } else if(delta<=-2) { |
|
813 pos-=2; |
|
814 delta+=2; |
|
815 } else /* delta==-1 */ { |
|
816 /* stop in the middle of a supplementary code point */ |
|
817 i+=4; /* back to behind this supplementary code point for consistent state */ |
|
818 iter->reservedField=c; |
|
819 --pos; |
|
820 break; /* delta=0; */ |
|
821 } |
|
822 } |
|
823 } |
|
824 |
|
825 iter->start=i; |
|
826 if(iter->index>=0) { |
|
827 return iter->index=pos; |
|
828 } else { |
|
829 /* we started with index<0 (unknown) so pos is bogus */ |
|
830 if(i<=1) { |
|
831 return iter->index=i; /* reached the beginning */ |
|
832 } else { |
|
833 /* we still don't know the UTF-16 index */ |
|
834 return UITER_UNKNOWN_INDEX; |
|
835 } |
|
836 } |
|
837 } |
|
838 |
|
839 static UBool U_CALLCONV |
|
840 utf8IteratorHasNext(UCharIterator *iter) { |
|
841 return iter->start<iter->limit || iter->reservedField!=0; |
|
842 } |
|
843 |
|
844 static UBool U_CALLCONV |
|
845 utf8IteratorHasPrevious(UCharIterator *iter) { |
|
846 return iter->start>0; |
|
847 } |
|
848 |
|
849 static UChar32 U_CALLCONV |
|
850 utf8IteratorCurrent(UCharIterator *iter) { |
|
851 if(iter->reservedField!=0) { |
|
852 return U16_TRAIL(iter->reservedField); |
|
853 } else if(iter->start<iter->limit) { |
|
854 const uint8_t *s=(const uint8_t *)iter->context; |
|
855 UChar32 c; |
|
856 int32_t i=iter->start; |
|
857 |
|
858 U8_NEXT_OR_FFFD(s, i, iter->limit, c); |
|
859 if(c<=0xffff) { |
|
860 return c; |
|
861 } else { |
|
862 return U16_LEAD(c); |
|
863 } |
|
864 } else { |
|
865 return U_SENTINEL; |
|
866 } |
|
867 } |
|
868 |
|
869 static UChar32 U_CALLCONV |
|
870 utf8IteratorNext(UCharIterator *iter) { |
|
871 int32_t index; |
|
872 |
|
873 if(iter->reservedField!=0) { |
|
874 UChar trail=U16_TRAIL(iter->reservedField); |
|
875 iter->reservedField=0; |
|
876 if((index=iter->index)>=0) { |
|
877 iter->index=index+1; |
|
878 } |
|
879 return trail; |
|
880 } else if(iter->start<iter->limit) { |
|
881 const uint8_t *s=(const uint8_t *)iter->context; |
|
882 UChar32 c; |
|
883 |
|
884 U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c); |
|
885 if((index=iter->index)>=0) { |
|
886 iter->index=++index; |
|
887 if(iter->length<0 && iter->start==iter->limit) { |
|
888 iter->length= c<=0xffff ? index : index+1; |
|
889 } |
|
890 } else if(iter->start==iter->limit && iter->length>=0) { |
|
891 iter->index= c<=0xffff ? iter->length : iter->length-1; |
|
892 } |
|
893 if(c<=0xffff) { |
|
894 return c; |
|
895 } else { |
|
896 iter->reservedField=c; |
|
897 return U16_LEAD(c); |
|
898 } |
|
899 } else { |
|
900 return U_SENTINEL; |
|
901 } |
|
902 } |
|
903 |
|
904 static UChar32 U_CALLCONV |
|
905 utf8IteratorPrevious(UCharIterator *iter) { |
|
906 int32_t index; |
|
907 |
|
908 if(iter->reservedField!=0) { |
|
909 UChar lead=U16_LEAD(iter->reservedField); |
|
910 iter->reservedField=0; |
|
911 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ |
|
912 if((index=iter->index)>0) { |
|
913 iter->index=index-1; |
|
914 } |
|
915 return lead; |
|
916 } else if(iter->start>0) { |
|
917 const uint8_t *s=(const uint8_t *)iter->context; |
|
918 UChar32 c; |
|
919 |
|
920 U8_PREV_OR_FFFD(s, 0, iter->start, c); |
|
921 if((index=iter->index)>0) { |
|
922 iter->index=index-1; |
|
923 } else if(iter->start<=1) { |
|
924 iter->index= c<=0xffff ? iter->start : iter->start+1; |
|
925 } |
|
926 if(c<=0xffff) { |
|
927 return c; |
|
928 } else { |
|
929 iter->start+=4; /* back to behind this supplementary code point for consistent state */ |
|
930 iter->reservedField=c; |
|
931 return U16_TRAIL(c); |
|
932 } |
|
933 } else { |
|
934 return U_SENTINEL; |
|
935 } |
|
936 } |
|
937 |
|
938 static uint32_t U_CALLCONV |
|
939 utf8IteratorGetState(const UCharIterator *iter) { |
|
940 uint32_t state=(uint32_t)(iter->start<<1); |
|
941 if(iter->reservedField!=0) { |
|
942 state|=1; |
|
943 } |
|
944 return state; |
|
945 } |
|
946 |
|
947 static void U_CALLCONV |
|
948 utf8IteratorSetState(UCharIterator *iter, |
|
949 uint32_t state, |
|
950 UErrorCode *pErrorCode) |
|
951 { |
|
952 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
953 /* do nothing */ |
|
954 } else if(iter==NULL) { |
|
955 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
956 } else if(state==utf8IteratorGetState(iter)) { |
|
957 /* setting to the current state: no-op */ |
|
958 } else { |
|
959 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ |
|
960 state&=1; /* 1 if in surrogate pair, must be index>=4 */ |
|
961 |
|
962 if((state==0 ? index<0 : index<4) || iter->limit<index) { |
|
963 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
964 } else { |
|
965 iter->start=index; /* restore UTF-8 byte index */ |
|
966 if(index<=1) { |
|
967 iter->index=index; |
|
968 } else { |
|
969 iter->index=-1; /* unknown UTF-16 index */ |
|
970 } |
|
971 if(state==0) { |
|
972 iter->reservedField=0; |
|
973 } else { |
|
974 /* verified index>=4 above */ |
|
975 UChar32 c; |
|
976 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c); |
|
977 if(c<=0xffff) { |
|
978 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
979 } else { |
|
980 iter->reservedField=c; |
|
981 } |
|
982 } |
|
983 } |
|
984 } |
|
985 } |
|
986 |
|
987 static const UCharIterator utf8Iterator={ |
|
988 0, 0, 0, 0, 0, 0, |
|
989 utf8IteratorGetIndex, |
|
990 utf8IteratorMove, |
|
991 utf8IteratorHasNext, |
|
992 utf8IteratorHasPrevious, |
|
993 utf8IteratorCurrent, |
|
994 utf8IteratorNext, |
|
995 utf8IteratorPrevious, |
|
996 NULL, |
|
997 utf8IteratorGetState, |
|
998 utf8IteratorSetState |
|
999 }; |
|
1000 |
|
1001 U_CAPI void U_EXPORT2 |
|
1002 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { |
|
1003 if(iter!=0) { |
|
1004 if(s!=0 && length>=-1) { |
|
1005 *iter=utf8Iterator; |
|
1006 iter->context=s; |
|
1007 if(length>=0) { |
|
1008 iter->limit=length; |
|
1009 } else { |
|
1010 iter->limit=(int32_t)uprv_strlen(s); |
|
1011 } |
|
1012 iter->length= iter->limit<=1 ? iter->limit : -1; |
|
1013 } else { |
|
1014 *iter=noopIterator; |
|
1015 } |
|
1016 } |
|
1017 } |
|
1018 |
|
1019 /* Helper functions --------------------------------------------------------- */ |
|
1020 |
|
1021 U_CAPI UChar32 U_EXPORT2 |
|
1022 uiter_current32(UCharIterator *iter) { |
|
1023 UChar32 c, c2; |
|
1024 |
|
1025 c=iter->current(iter); |
|
1026 if(U16_IS_SURROGATE(c)) { |
|
1027 if(U16_IS_SURROGATE_LEAD(c)) { |
|
1028 /* |
|
1029 * go to the next code unit |
|
1030 * we know that we are not at the limit because c!=U_SENTINEL |
|
1031 */ |
|
1032 iter->move(iter, 1, UITER_CURRENT); |
|
1033 if(U16_IS_TRAIL(c2=iter->current(iter))) { |
|
1034 c=U16_GET_SUPPLEMENTARY(c, c2); |
|
1035 } |
|
1036 |
|
1037 /* undo index movement */ |
|
1038 iter->move(iter, -1, UITER_CURRENT); |
|
1039 } else { |
|
1040 if(U16_IS_LEAD(c2=iter->previous(iter))) { |
|
1041 c=U16_GET_SUPPLEMENTARY(c2, c); |
|
1042 } |
|
1043 if(c2>=0) { |
|
1044 /* undo index movement */ |
|
1045 iter->move(iter, 1, UITER_CURRENT); |
|
1046 } |
|
1047 } |
|
1048 } |
|
1049 return c; |
|
1050 } |
|
1051 |
|
1052 U_CAPI UChar32 U_EXPORT2 |
|
1053 uiter_next32(UCharIterator *iter) { |
|
1054 UChar32 c, c2; |
|
1055 |
|
1056 c=iter->next(iter); |
|
1057 if(U16_IS_LEAD(c)) { |
|
1058 if(U16_IS_TRAIL(c2=iter->next(iter))) { |
|
1059 c=U16_GET_SUPPLEMENTARY(c, c2); |
|
1060 } else if(c2>=0) { |
|
1061 /* unmatched first surrogate, undo index movement */ |
|
1062 iter->move(iter, -1, UITER_CURRENT); |
|
1063 } |
|
1064 } |
|
1065 return c; |
|
1066 } |
|
1067 |
|
1068 U_CAPI UChar32 U_EXPORT2 |
|
1069 uiter_previous32(UCharIterator *iter) { |
|
1070 UChar32 c, c2; |
|
1071 |
|
1072 c=iter->previous(iter); |
|
1073 if(U16_IS_TRAIL(c)) { |
|
1074 if(U16_IS_LEAD(c2=iter->previous(iter))) { |
|
1075 c=U16_GET_SUPPLEMENTARY(c2, c); |
|
1076 } else if(c2>=0) { |
|
1077 /* unmatched second surrogate, undo index movement */ |
|
1078 iter->move(iter, 1, UITER_CURRENT); |
|
1079 } |
|
1080 } |
|
1081 return c; |
|
1082 } |
|
1083 |
|
1084 U_CAPI uint32_t U_EXPORT2 |
|
1085 uiter_getState(const UCharIterator *iter) { |
|
1086 if(iter==NULL || iter->getState==NULL) { |
|
1087 return UITER_NO_STATE; |
|
1088 } else { |
|
1089 return iter->getState(iter); |
|
1090 } |
|
1091 } |
|
1092 |
|
1093 U_CAPI void U_EXPORT2 |
|
1094 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { |
|
1095 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { |
|
1096 /* do nothing */ |
|
1097 } else if(iter==NULL) { |
|
1098 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
1099 } else if(iter->setState==NULL) { |
|
1100 *pErrorCode=U_UNSUPPORTED_ERROR; |
|
1101 } else { |
|
1102 iter->setState(iter, state, pErrorCode); |
|
1103 } |
|
1104 } |
|
1105 |
|
1106 U_CDECL_END |