|
1 /* |
|
2 ****************************************************************************** |
|
3 * |
|
4 * Copyright (C) 1998-2012, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ****************************************************************************** |
|
8 * |
|
9 * File ustring.cpp |
|
10 * |
|
11 * Modification History: |
|
12 * |
|
13 * Date Name Description |
|
14 * 12/07/98 bertrand Creation. |
|
15 ****************************************************************************** |
|
16 */ |
|
17 |
|
18 #include "unicode/utypes.h" |
|
19 #include "unicode/putil.h" |
|
20 #include "unicode/ustring.h" |
|
21 #include "unicode/utf16.h" |
|
22 #include "cstring.h" |
|
23 #include "cwchar.h" |
|
24 #include "cmemory.h" |
|
25 #include "ustr_imp.h" |
|
26 |
|
27 /* ANSI string.h - style functions ------------------------------------------ */ |
|
28 |
|
29 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ |
|
30 #define U_BMP_MAX 0xffff |
|
31 |
|
32 /* Forward binary string search functions ----------------------------------- */ |
|
33 |
|
34 /* |
|
35 * Test if a substring match inside a string is at code point boundaries. |
|
36 * All pointers refer to the same buffer. |
|
37 * The limit pointer may be NULL, all others must be real pointers. |
|
38 */ |
|
39 static inline UBool |
|
40 isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { |
|
41 if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { |
|
42 /* the leading edge of the match is in the middle of a surrogate pair */ |
|
43 return FALSE; |
|
44 } |
|
45 if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { |
|
46 /* the trailing edge of the match is in the middle of a surrogate pair */ |
|
47 return FALSE; |
|
48 } |
|
49 return TRUE; |
|
50 } |
|
51 |
|
52 U_CAPI UChar * U_EXPORT2 |
|
53 u_strFindFirst(const UChar *s, int32_t length, |
|
54 const UChar *sub, int32_t subLength) { |
|
55 const UChar *start, *p, *q, *subLimit; |
|
56 UChar c, cs, cq; |
|
57 |
|
58 if(sub==NULL || subLength<-1) { |
|
59 return (UChar *)s; |
|
60 } |
|
61 if(s==NULL || length<-1) { |
|
62 return NULL; |
|
63 } |
|
64 |
|
65 start=s; |
|
66 |
|
67 if(length<0 && subLength<0) { |
|
68 /* both strings are NUL-terminated */ |
|
69 if((cs=*sub++)==0) { |
|
70 return (UChar *)s; |
|
71 } |
|
72 if(*sub==0 && !U16_IS_SURROGATE(cs)) { |
|
73 /* the substring consists of a single, non-surrogate BMP code point */ |
|
74 return u_strchr(s, cs); |
|
75 } |
|
76 |
|
77 while((c=*s++)!=0) { |
|
78 if(c==cs) { |
|
79 /* found first substring UChar, compare rest */ |
|
80 p=s; |
|
81 q=sub; |
|
82 for(;;) { |
|
83 if((cq=*q)==0) { |
|
84 if(isMatchAtCPBoundary(start, s-1, p, NULL)) { |
|
85 return (UChar *)(s-1); /* well-formed match */ |
|
86 } else { |
|
87 break; /* no match because surrogate pair is split */ |
|
88 } |
|
89 } |
|
90 if((c=*p)==0) { |
|
91 return NULL; /* no match, and none possible after s */ |
|
92 } |
|
93 if(c!=cq) { |
|
94 break; /* no match */ |
|
95 } |
|
96 ++p; |
|
97 ++q; |
|
98 } |
|
99 } |
|
100 } |
|
101 |
|
102 /* not found */ |
|
103 return NULL; |
|
104 } |
|
105 |
|
106 if(subLength<0) { |
|
107 subLength=u_strlen(sub); |
|
108 } |
|
109 if(subLength==0) { |
|
110 return (UChar *)s; |
|
111 } |
|
112 |
|
113 /* get sub[0] to search for it fast */ |
|
114 cs=*sub++; |
|
115 --subLength; |
|
116 subLimit=sub+subLength; |
|
117 |
|
118 if(subLength==0 && !U16_IS_SURROGATE(cs)) { |
|
119 /* the substring consists of a single, non-surrogate BMP code point */ |
|
120 return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); |
|
121 } |
|
122 |
|
123 if(length<0) { |
|
124 /* s is NUL-terminated */ |
|
125 while((c=*s++)!=0) { |
|
126 if(c==cs) { |
|
127 /* found first substring UChar, compare rest */ |
|
128 p=s; |
|
129 q=sub; |
|
130 for(;;) { |
|
131 if(q==subLimit) { |
|
132 if(isMatchAtCPBoundary(start, s-1, p, NULL)) { |
|
133 return (UChar *)(s-1); /* well-formed match */ |
|
134 } else { |
|
135 break; /* no match because surrogate pair is split */ |
|
136 } |
|
137 } |
|
138 if((c=*p)==0) { |
|
139 return NULL; /* no match, and none possible after s */ |
|
140 } |
|
141 if(c!=*q) { |
|
142 break; /* no match */ |
|
143 } |
|
144 ++p; |
|
145 ++q; |
|
146 } |
|
147 } |
|
148 } |
|
149 } else { |
|
150 const UChar *limit, *preLimit; |
|
151 |
|
152 /* subLength was decremented above */ |
|
153 if(length<=subLength) { |
|
154 return NULL; /* s is shorter than sub */ |
|
155 } |
|
156 |
|
157 limit=s+length; |
|
158 |
|
159 /* the substring must start before preLimit */ |
|
160 preLimit=limit-subLength; |
|
161 |
|
162 while(s!=preLimit) { |
|
163 c=*s++; |
|
164 if(c==cs) { |
|
165 /* found first substring UChar, compare rest */ |
|
166 p=s; |
|
167 q=sub; |
|
168 for(;;) { |
|
169 if(q==subLimit) { |
|
170 if(isMatchAtCPBoundary(start, s-1, p, limit)) { |
|
171 return (UChar *)(s-1); /* well-formed match */ |
|
172 } else { |
|
173 break; /* no match because surrogate pair is split */ |
|
174 } |
|
175 } |
|
176 if(*p!=*q) { |
|
177 break; /* no match */ |
|
178 } |
|
179 ++p; |
|
180 ++q; |
|
181 } |
|
182 } |
|
183 } |
|
184 } |
|
185 |
|
186 /* not found */ |
|
187 return NULL; |
|
188 } |
|
189 |
|
190 U_CAPI UChar * U_EXPORT2 |
|
191 u_strstr(const UChar *s, const UChar *substring) { |
|
192 return u_strFindFirst(s, -1, substring, -1); |
|
193 } |
|
194 |
|
195 U_CAPI UChar * U_EXPORT2 |
|
196 u_strchr(const UChar *s, UChar c) { |
|
197 if(U16_IS_SURROGATE(c)) { |
|
198 /* make sure to not find half of a surrogate pair */ |
|
199 return u_strFindFirst(s, -1, &c, 1); |
|
200 } else { |
|
201 UChar cs; |
|
202 |
|
203 /* trivial search for a BMP code point */ |
|
204 for(;;) { |
|
205 if((cs=*s)==c) { |
|
206 return (UChar *)s; |
|
207 } |
|
208 if(cs==0) { |
|
209 return NULL; |
|
210 } |
|
211 ++s; |
|
212 } |
|
213 } |
|
214 } |
|
215 |
|
216 U_CAPI UChar * U_EXPORT2 |
|
217 u_strchr32(const UChar *s, UChar32 c) { |
|
218 if((uint32_t)c<=U_BMP_MAX) { |
|
219 /* find BMP code point */ |
|
220 return u_strchr(s, (UChar)c); |
|
221 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
|
222 /* find supplementary code point as surrogate pair */ |
|
223 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); |
|
224 |
|
225 while((cs=*s++)!=0) { |
|
226 if(cs==lead && *s==trail) { |
|
227 return (UChar *)(s-1); |
|
228 } |
|
229 } |
|
230 return NULL; |
|
231 } else { |
|
232 /* not a Unicode code point, not findable */ |
|
233 return NULL; |
|
234 } |
|
235 } |
|
236 |
|
237 U_CAPI UChar * U_EXPORT2 |
|
238 u_memchr(const UChar *s, UChar c, int32_t count) { |
|
239 if(count<=0) { |
|
240 return NULL; /* no string */ |
|
241 } else if(U16_IS_SURROGATE(c)) { |
|
242 /* make sure to not find half of a surrogate pair */ |
|
243 return u_strFindFirst(s, count, &c, 1); |
|
244 } else { |
|
245 /* trivial search for a BMP code point */ |
|
246 const UChar *limit=s+count; |
|
247 do { |
|
248 if(*s==c) { |
|
249 return (UChar *)s; |
|
250 } |
|
251 } while(++s!=limit); |
|
252 return NULL; |
|
253 } |
|
254 } |
|
255 |
|
256 U_CAPI UChar * U_EXPORT2 |
|
257 u_memchr32(const UChar *s, UChar32 c, int32_t count) { |
|
258 if((uint32_t)c<=U_BMP_MAX) { |
|
259 /* find BMP code point */ |
|
260 return u_memchr(s, (UChar)c, count); |
|
261 } else if(count<2) { |
|
262 /* too short for a surrogate pair */ |
|
263 return NULL; |
|
264 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
|
265 /* find supplementary code point as surrogate pair */ |
|
266 const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ |
|
267 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); |
|
268 |
|
269 do { |
|
270 if(*s==lead && *(s+1)==trail) { |
|
271 return (UChar *)s; |
|
272 } |
|
273 } while(++s!=limit); |
|
274 return NULL; |
|
275 } else { |
|
276 /* not a Unicode code point, not findable */ |
|
277 return NULL; |
|
278 } |
|
279 } |
|
280 |
|
281 /* Backward binary string search functions ---------------------------------- */ |
|
282 |
|
283 U_CAPI UChar * U_EXPORT2 |
|
284 u_strFindLast(const UChar *s, int32_t length, |
|
285 const UChar *sub, int32_t subLength) { |
|
286 const UChar *start, *limit, *p, *q, *subLimit; |
|
287 UChar c, cs; |
|
288 |
|
289 if(sub==NULL || subLength<-1) { |
|
290 return (UChar *)s; |
|
291 } |
|
292 if(s==NULL || length<-1) { |
|
293 return NULL; |
|
294 } |
|
295 |
|
296 /* |
|
297 * This implementation is more lazy than the one for u_strFindFirst(): |
|
298 * There is no special search code for NUL-terminated strings. |
|
299 * It does not seem to be worth it for searching substrings to |
|
300 * search forward and find all matches like in u_strrchr() and similar. |
|
301 * Therefore, we simply get both string lengths and search backward. |
|
302 * |
|
303 * markus 2002oct23 |
|
304 */ |
|
305 |
|
306 if(subLength<0) { |
|
307 subLength=u_strlen(sub); |
|
308 } |
|
309 if(subLength==0) { |
|
310 return (UChar *)s; |
|
311 } |
|
312 |
|
313 /* get sub[subLength-1] to search for it fast */ |
|
314 subLimit=sub+subLength; |
|
315 cs=*(--subLimit); |
|
316 --subLength; |
|
317 |
|
318 if(subLength==0 && !U16_IS_SURROGATE(cs)) { |
|
319 /* the substring consists of a single, non-surrogate BMP code point */ |
|
320 return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); |
|
321 } |
|
322 |
|
323 if(length<0) { |
|
324 length=u_strlen(s); |
|
325 } |
|
326 |
|
327 /* subLength was decremented above */ |
|
328 if(length<=subLength) { |
|
329 return NULL; /* s is shorter than sub */ |
|
330 } |
|
331 |
|
332 start=s; |
|
333 limit=s+length; |
|
334 |
|
335 /* the substring must start no later than s+subLength */ |
|
336 s+=subLength; |
|
337 |
|
338 while(s!=limit) { |
|
339 c=*(--limit); |
|
340 if(c==cs) { |
|
341 /* found last substring UChar, compare rest */ |
|
342 p=limit; |
|
343 q=subLimit; |
|
344 for(;;) { |
|
345 if(q==sub) { |
|
346 if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { |
|
347 return (UChar *)p; /* well-formed match */ |
|
348 } else { |
|
349 break; /* no match because surrogate pair is split */ |
|
350 } |
|
351 } |
|
352 if(*(--p)!=*(--q)) { |
|
353 break; /* no match */ |
|
354 } |
|
355 } |
|
356 } |
|
357 } |
|
358 |
|
359 /* not found */ |
|
360 return NULL; |
|
361 } |
|
362 |
|
363 U_CAPI UChar * U_EXPORT2 |
|
364 u_strrstr(const UChar *s, const UChar *substring) { |
|
365 return u_strFindLast(s, -1, substring, -1); |
|
366 } |
|
367 |
|
368 U_CAPI UChar * U_EXPORT2 |
|
369 u_strrchr(const UChar *s, UChar c) { |
|
370 if(U16_IS_SURROGATE(c)) { |
|
371 /* make sure to not find half of a surrogate pair */ |
|
372 return u_strFindLast(s, -1, &c, 1); |
|
373 } else { |
|
374 const UChar *result=NULL; |
|
375 UChar cs; |
|
376 |
|
377 /* trivial search for a BMP code point */ |
|
378 for(;;) { |
|
379 if((cs=*s)==c) { |
|
380 result=s; |
|
381 } |
|
382 if(cs==0) { |
|
383 return (UChar *)result; |
|
384 } |
|
385 ++s; |
|
386 } |
|
387 } |
|
388 } |
|
389 |
|
390 U_CAPI UChar * U_EXPORT2 |
|
391 u_strrchr32(const UChar *s, UChar32 c) { |
|
392 if((uint32_t)c<=U_BMP_MAX) { |
|
393 /* find BMP code point */ |
|
394 return u_strrchr(s, (UChar)c); |
|
395 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
|
396 /* find supplementary code point as surrogate pair */ |
|
397 const UChar *result=NULL; |
|
398 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); |
|
399 |
|
400 while((cs=*s++)!=0) { |
|
401 if(cs==lead && *s==trail) { |
|
402 result=s-1; |
|
403 } |
|
404 } |
|
405 return (UChar *)result; |
|
406 } else { |
|
407 /* not a Unicode code point, not findable */ |
|
408 return NULL; |
|
409 } |
|
410 } |
|
411 |
|
412 U_CAPI UChar * U_EXPORT2 |
|
413 u_memrchr(const UChar *s, UChar c, int32_t count) { |
|
414 if(count<=0) { |
|
415 return NULL; /* no string */ |
|
416 } else if(U16_IS_SURROGATE(c)) { |
|
417 /* make sure to not find half of a surrogate pair */ |
|
418 return u_strFindLast(s, count, &c, 1); |
|
419 } else { |
|
420 /* trivial search for a BMP code point */ |
|
421 const UChar *limit=s+count; |
|
422 do { |
|
423 if(*(--limit)==c) { |
|
424 return (UChar *)limit; |
|
425 } |
|
426 } while(s!=limit); |
|
427 return NULL; |
|
428 } |
|
429 } |
|
430 |
|
431 U_CAPI UChar * U_EXPORT2 |
|
432 u_memrchr32(const UChar *s, UChar32 c, int32_t count) { |
|
433 if((uint32_t)c<=U_BMP_MAX) { |
|
434 /* find BMP code point */ |
|
435 return u_memrchr(s, (UChar)c, count); |
|
436 } else if(count<2) { |
|
437 /* too short for a surrogate pair */ |
|
438 return NULL; |
|
439 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { |
|
440 /* find supplementary code point as surrogate pair */ |
|
441 const UChar *limit=s+count-1; |
|
442 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); |
|
443 |
|
444 do { |
|
445 if(*limit==trail && *(limit-1)==lead) { |
|
446 return (UChar *)(limit-1); |
|
447 } |
|
448 } while(s!=--limit); |
|
449 return NULL; |
|
450 } else { |
|
451 /* not a Unicode code point, not findable */ |
|
452 return NULL; |
|
453 } |
|
454 } |
|
455 |
|
456 /* Tokenization functions --------------------------------------------------- */ |
|
457 |
|
458 /* |
|
459 * Match each code point in a string against each code point in the matchSet. |
|
460 * Return the index of the first string code point that |
|
461 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. |
|
462 * Return -(string length)-1 if there is no such code point. |
|
463 */ |
|
464 static int32_t |
|
465 _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { |
|
466 int32_t matchLen, matchBMPLen, strItr, matchItr; |
|
467 UChar32 stringCh, matchCh; |
|
468 UChar c, c2; |
|
469 |
|
470 /* first part of matchSet contains only BMP code points */ |
|
471 matchBMPLen = 0; |
|
472 while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { |
|
473 ++matchBMPLen; |
|
474 } |
|
475 |
|
476 /* second part of matchSet contains BMP and supplementary code points */ |
|
477 matchLen = matchBMPLen; |
|
478 while(matchSet[matchLen] != 0) { |
|
479 ++matchLen; |
|
480 } |
|
481 |
|
482 for(strItr = 0; (c = string[strItr]) != 0;) { |
|
483 ++strItr; |
|
484 if(U16_IS_SINGLE(c)) { |
|
485 if(polarity) { |
|
486 for(matchItr = 0; matchItr < matchLen; ++matchItr) { |
|
487 if(c == matchSet[matchItr]) { |
|
488 return strItr - 1; /* one matches */ |
|
489 } |
|
490 } |
|
491 } else { |
|
492 for(matchItr = 0; matchItr < matchLen; ++matchItr) { |
|
493 if(c == matchSet[matchItr]) { |
|
494 goto endloop; |
|
495 } |
|
496 } |
|
497 return strItr - 1; /* none matches */ |
|
498 } |
|
499 } else { |
|
500 /* |
|
501 * No need to check for string length before U16_IS_TRAIL |
|
502 * because c2 could at worst be the terminating NUL. |
|
503 */ |
|
504 if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { |
|
505 ++strItr; |
|
506 stringCh = U16_GET_SUPPLEMENTARY(c, c2); |
|
507 } else { |
|
508 stringCh = c; /* unpaired trail surrogate */ |
|
509 } |
|
510 |
|
511 if(polarity) { |
|
512 for(matchItr = matchBMPLen; matchItr < matchLen;) { |
|
513 U16_NEXT(matchSet, matchItr, matchLen, matchCh); |
|
514 if(stringCh == matchCh) { |
|
515 return strItr - U16_LENGTH(stringCh); /* one matches */ |
|
516 } |
|
517 } |
|
518 } else { |
|
519 for(matchItr = matchBMPLen; matchItr < matchLen;) { |
|
520 U16_NEXT(matchSet, matchItr, matchLen, matchCh); |
|
521 if(stringCh == matchCh) { |
|
522 goto endloop; |
|
523 } |
|
524 } |
|
525 return strItr - U16_LENGTH(stringCh); /* none matches */ |
|
526 } |
|
527 } |
|
528 endloop: |
|
529 /* wish C had continue with labels like Java... */; |
|
530 } |
|
531 |
|
532 /* Didn't find it. */ |
|
533 return -strItr-1; |
|
534 } |
|
535 |
|
536 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ |
|
537 U_CAPI UChar * U_EXPORT2 |
|
538 u_strpbrk(const UChar *string, const UChar *matchSet) |
|
539 { |
|
540 int32_t idx = _matchFromSet(string, matchSet, TRUE); |
|
541 if(idx >= 0) { |
|
542 return (UChar *)string + idx; |
|
543 } else { |
|
544 return NULL; |
|
545 } |
|
546 } |
|
547 |
|
548 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ |
|
549 U_CAPI int32_t U_EXPORT2 |
|
550 u_strcspn(const UChar *string, const UChar *matchSet) |
|
551 { |
|
552 int32_t idx = _matchFromSet(string, matchSet, TRUE); |
|
553 if(idx >= 0) { |
|
554 return idx; |
|
555 } else { |
|
556 return -idx - 1; /* == u_strlen(string) */ |
|
557 } |
|
558 } |
|
559 |
|
560 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ |
|
561 U_CAPI int32_t U_EXPORT2 |
|
562 u_strspn(const UChar *string, const UChar *matchSet) |
|
563 { |
|
564 int32_t idx = _matchFromSet(string, matchSet, FALSE); |
|
565 if(idx >= 0) { |
|
566 return idx; |
|
567 } else { |
|
568 return -idx - 1; /* == u_strlen(string) */ |
|
569 } |
|
570 } |
|
571 |
|
572 /* ----- Text manipulation functions --- */ |
|
573 |
|
574 U_CAPI UChar* U_EXPORT2 |
|
575 u_strtok_r(UChar *src, |
|
576 const UChar *delim, |
|
577 UChar **saveState) |
|
578 { |
|
579 UChar *tokSource; |
|
580 UChar *nextToken; |
|
581 uint32_t nonDelimIdx; |
|
582 |
|
583 /* If saveState is NULL, the user messed up. */ |
|
584 if (src != NULL) { |
|
585 tokSource = src; |
|
586 *saveState = src; /* Set to "src" in case there are no delimiters */ |
|
587 } |
|
588 else if (*saveState) { |
|
589 tokSource = *saveState; |
|
590 } |
|
591 else { |
|
592 /* src == NULL && *saveState == NULL */ |
|
593 /* This shouldn't happen. We already finished tokenizing. */ |
|
594 return NULL; |
|
595 } |
|
596 |
|
597 /* Skip initial delimiters */ |
|
598 nonDelimIdx = u_strspn(tokSource, delim); |
|
599 tokSource = &tokSource[nonDelimIdx]; |
|
600 |
|
601 if (*tokSource) { |
|
602 nextToken = u_strpbrk(tokSource, delim); |
|
603 if (nextToken != NULL) { |
|
604 /* Create a token */ |
|
605 *(nextToken++) = 0; |
|
606 *saveState = nextToken; |
|
607 return tokSource; |
|
608 } |
|
609 else if (*saveState) { |
|
610 /* Return the last token */ |
|
611 *saveState = NULL; |
|
612 return tokSource; |
|
613 } |
|
614 } |
|
615 else { |
|
616 /* No tokens were found. Only delimiters were left. */ |
|
617 *saveState = NULL; |
|
618 } |
|
619 return NULL; |
|
620 } |
|
621 |
|
622 /* Miscellaneous functions -------------------------------------------------- */ |
|
623 |
|
624 U_CAPI UChar* U_EXPORT2 |
|
625 u_strcat(UChar *dst, |
|
626 const UChar *src) |
|
627 { |
|
628 UChar *anchor = dst; /* save a pointer to start of dst */ |
|
629 |
|
630 while(*dst != 0) { /* To end of first string */ |
|
631 ++dst; |
|
632 } |
|
633 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ |
|
634 } |
|
635 |
|
636 return anchor; |
|
637 } |
|
638 |
|
639 U_CAPI UChar* U_EXPORT2 |
|
640 u_strncat(UChar *dst, |
|
641 const UChar *src, |
|
642 int32_t n ) |
|
643 { |
|
644 if(n > 0) { |
|
645 UChar *anchor = dst; /* save a pointer to start of dst */ |
|
646 |
|
647 while(*dst != 0) { /* To end of first string */ |
|
648 ++dst; |
|
649 } |
|
650 while((*dst = *src) != 0) { /* copy string 2 over */ |
|
651 ++dst; |
|
652 if(--n == 0) { |
|
653 *dst = 0; |
|
654 break; |
|
655 } |
|
656 ++src; |
|
657 } |
|
658 |
|
659 return anchor; |
|
660 } else { |
|
661 return dst; |
|
662 } |
|
663 } |
|
664 |
|
665 /* ----- Text property functions --- */ |
|
666 |
|
667 U_CAPI int32_t U_EXPORT2 |
|
668 u_strcmp(const UChar *s1, |
|
669 const UChar *s2) |
|
670 { |
|
671 UChar c1, c2; |
|
672 |
|
673 for(;;) { |
|
674 c1=*s1++; |
|
675 c2=*s2++; |
|
676 if (c1 != c2 || c1 == 0) { |
|
677 break; |
|
678 } |
|
679 } |
|
680 return (int32_t)c1 - (int32_t)c2; |
|
681 } |
|
682 |
|
683 U_CFUNC int32_t U_EXPORT2 |
|
684 uprv_strCompare(const UChar *s1, int32_t length1, |
|
685 const UChar *s2, int32_t length2, |
|
686 UBool strncmpStyle, UBool codePointOrder) { |
|
687 const UChar *start1, *start2, *limit1, *limit2; |
|
688 UChar c1, c2; |
|
689 |
|
690 /* setup for fix-up */ |
|
691 start1=s1; |
|
692 start2=s2; |
|
693 |
|
694 /* compare identical prefixes - they do not need to be fixed up */ |
|
695 if(length1<0 && length2<0) { |
|
696 /* strcmp style, both NUL-terminated */ |
|
697 if(s1==s2) { |
|
698 return 0; |
|
699 } |
|
700 |
|
701 for(;;) { |
|
702 c1=*s1; |
|
703 c2=*s2; |
|
704 if(c1!=c2) { |
|
705 break; |
|
706 } |
|
707 if(c1==0) { |
|
708 return 0; |
|
709 } |
|
710 ++s1; |
|
711 ++s2; |
|
712 } |
|
713 |
|
714 /* setup for fix-up */ |
|
715 limit1=limit2=NULL; |
|
716 } else if(strncmpStyle) { |
|
717 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ |
|
718 if(s1==s2) { |
|
719 return 0; |
|
720 } |
|
721 |
|
722 limit1=start1+length1; |
|
723 |
|
724 for(;;) { |
|
725 /* both lengths are same, check only one limit */ |
|
726 if(s1==limit1) { |
|
727 return 0; |
|
728 } |
|
729 |
|
730 c1=*s1; |
|
731 c2=*s2; |
|
732 if(c1!=c2) { |
|
733 break; |
|
734 } |
|
735 if(c1==0) { |
|
736 return 0; |
|
737 } |
|
738 ++s1; |
|
739 ++s2; |
|
740 } |
|
741 |
|
742 /* setup for fix-up */ |
|
743 limit2=start2+length1; /* use length1 here, too, to enforce assumption */ |
|
744 } else { |
|
745 /* memcmp/UnicodeString style, both length-specified */ |
|
746 int32_t lengthResult; |
|
747 |
|
748 if(length1<0) { |
|
749 length1=u_strlen(s1); |
|
750 } |
|
751 if(length2<0) { |
|
752 length2=u_strlen(s2); |
|
753 } |
|
754 |
|
755 /* limit1=start1+min(lenght1, length2) */ |
|
756 if(length1<length2) { |
|
757 lengthResult=-1; |
|
758 limit1=start1+length1; |
|
759 } else if(length1==length2) { |
|
760 lengthResult=0; |
|
761 limit1=start1+length1; |
|
762 } else /* length1>length2 */ { |
|
763 lengthResult=1; |
|
764 limit1=start1+length2; |
|
765 } |
|
766 |
|
767 if(s1==s2) { |
|
768 return lengthResult; |
|
769 } |
|
770 |
|
771 for(;;) { |
|
772 /* check pseudo-limit */ |
|
773 if(s1==limit1) { |
|
774 return lengthResult; |
|
775 } |
|
776 |
|
777 c1=*s1; |
|
778 c2=*s2; |
|
779 if(c1!=c2) { |
|
780 break; |
|
781 } |
|
782 ++s1; |
|
783 ++s2; |
|
784 } |
|
785 |
|
786 /* setup for fix-up */ |
|
787 limit1=start1+length1; |
|
788 limit2=start2+length2; |
|
789 } |
|
790 |
|
791 /* if both values are in or above the surrogate range, fix them up */ |
|
792 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { |
|
793 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ |
|
794 if( |
|
795 (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) || |
|
796 (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1))) |
|
797 ) { |
|
798 /* part of a surrogate pair, leave >=d800 */ |
|
799 } else { |
|
800 /* BMP code point - may be surrogate code point - make <d800 */ |
|
801 c1-=0x2800; |
|
802 } |
|
803 |
|
804 if( |
|
805 (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) || |
|
806 (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1))) |
|
807 ) { |
|
808 /* part of a surrogate pair, leave >=d800 */ |
|
809 } else { |
|
810 /* BMP code point - may be surrogate code point - make <d800 */ |
|
811 c2-=0x2800; |
|
812 } |
|
813 } |
|
814 |
|
815 /* now c1 and c2 are in the requested (code unit or code point) order */ |
|
816 return (int32_t)c1-(int32_t)c2; |
|
817 } |
|
818 |
|
819 /* |
|
820 * Compare two strings as presented by UCharIterators. |
|
821 * Use code unit or code point order. |
|
822 * When the function returns, it is undefined where the iterators |
|
823 * have stopped. |
|
824 */ |
|
825 U_CAPI int32_t U_EXPORT2 |
|
826 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { |
|
827 UChar32 c1, c2; |
|
828 |
|
829 /* argument checking */ |
|
830 if(iter1==NULL || iter2==NULL) { |
|
831 return 0; /* bad arguments */ |
|
832 } |
|
833 if(iter1==iter2) { |
|
834 return 0; /* identical iterators */ |
|
835 } |
|
836 |
|
837 /* reset iterators to start? */ |
|
838 iter1->move(iter1, 0, UITER_START); |
|
839 iter2->move(iter2, 0, UITER_START); |
|
840 |
|
841 /* compare identical prefixes - they do not need to be fixed up */ |
|
842 for(;;) { |
|
843 c1=iter1->next(iter1); |
|
844 c2=iter2->next(iter2); |
|
845 if(c1!=c2) { |
|
846 break; |
|
847 } |
|
848 if(c1==-1) { |
|
849 return 0; |
|
850 } |
|
851 } |
|
852 |
|
853 /* if both values are in or above the surrogate range, fix them up */ |
|
854 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { |
|
855 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ |
|
856 if( |
|
857 (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) || |
|
858 (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1)))) |
|
859 ) { |
|
860 /* part of a surrogate pair, leave >=d800 */ |
|
861 } else { |
|
862 /* BMP code point - may be surrogate code point - make <d800 */ |
|
863 c1-=0x2800; |
|
864 } |
|
865 |
|
866 if( |
|
867 (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) || |
|
868 (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2)))) |
|
869 ) { |
|
870 /* part of a surrogate pair, leave >=d800 */ |
|
871 } else { |
|
872 /* BMP code point - may be surrogate code point - make <d800 */ |
|
873 c2-=0x2800; |
|
874 } |
|
875 } |
|
876 |
|
877 /* now c1 and c2 are in the requested (code unit or code point) order */ |
|
878 return (int32_t)c1-(int32_t)c2; |
|
879 } |
|
880 |
|
881 #if 0 |
|
882 /* |
|
883 * u_strCompareIter() does not leave the iterators _on_ the different units. |
|
884 * This is possible but would cost a few extra indirect function calls to back |
|
885 * up if the last unit (c1 or c2 respectively) was >=0. |
|
886 * |
|
887 * Consistently leaving them _behind_ the different units is not an option |
|
888 * because the current "unit" is the end of the string if that is reached, |
|
889 * and in such a case the iterator does not move. |
|
890 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end |
|
891 * of their strings. Calling previous() on each does not move them to where |
|
892 * the comparison fails. |
|
893 * |
|
894 * So the simplest semantics is to not define where the iterators end up. |
|
895 * |
|
896 * The following fragment is part of what would need to be done for backing up. |
|
897 */ |
|
898 void fragment { |
|
899 /* iff a surrogate is part of a surrogate pair, leave >=d800 */ |
|
900 if(c1<=0xdbff) { |
|
901 if(!U16_IS_TRAIL(iter1->current(iter1))) { |
|
902 /* lead surrogate code point - make <d800 */ |
|
903 c1-=0x2800; |
|
904 } |
|
905 } else if(c1<=0xdfff) { |
|
906 int32_t idx=iter1->getIndex(iter1, UITER_CURRENT); |
|
907 iter1->previous(iter1); /* ==c1 */ |
|
908 if(!U16_IS_LEAD(iter1->previous(iter1))) { |
|
909 /* trail surrogate code point - make <d800 */ |
|
910 c1-=0x2800; |
|
911 } |
|
912 /* go back to behind where the difference is */ |
|
913 iter1->move(iter1, idx, UITER_ZERO); |
|
914 } else /* 0xe000<=c1<=0xffff */ { |
|
915 /* BMP code point - make <d800 */ |
|
916 c1-=0x2800; |
|
917 } |
|
918 } |
|
919 #endif |
|
920 |
|
921 U_CAPI int32_t U_EXPORT2 |
|
922 u_strCompare(const UChar *s1, int32_t length1, |
|
923 const UChar *s2, int32_t length2, |
|
924 UBool codePointOrder) { |
|
925 /* argument checking */ |
|
926 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { |
|
927 return 0; |
|
928 } |
|
929 return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); |
|
930 } |
|
931 |
|
932 /* String compare in code point order - u_strcmp() compares in code unit order. */ |
|
933 U_CAPI int32_t U_EXPORT2 |
|
934 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { |
|
935 return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); |
|
936 } |
|
937 |
|
938 U_CAPI int32_t U_EXPORT2 |
|
939 u_strncmp(const UChar *s1, |
|
940 const UChar *s2, |
|
941 int32_t n) |
|
942 { |
|
943 if(n > 0) { |
|
944 int32_t rc; |
|
945 for(;;) { |
|
946 rc = (int32_t)*s1 - (int32_t)*s2; |
|
947 if(rc != 0 || *s1 == 0 || --n == 0) { |
|
948 return rc; |
|
949 } |
|
950 ++s1; |
|
951 ++s2; |
|
952 } |
|
953 } else { |
|
954 return 0; |
|
955 } |
|
956 } |
|
957 |
|
958 U_CAPI int32_t U_EXPORT2 |
|
959 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { |
|
960 return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); |
|
961 } |
|
962 |
|
963 U_CAPI UChar* U_EXPORT2 |
|
964 u_strcpy(UChar *dst, |
|
965 const UChar *src) |
|
966 { |
|
967 UChar *anchor = dst; /* save a pointer to start of dst */ |
|
968 |
|
969 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ |
|
970 } |
|
971 |
|
972 return anchor; |
|
973 } |
|
974 |
|
975 U_CAPI UChar* U_EXPORT2 |
|
976 u_strncpy(UChar *dst, |
|
977 const UChar *src, |
|
978 int32_t n) |
|
979 { |
|
980 UChar *anchor = dst; /* save a pointer to start of dst */ |
|
981 |
|
982 /* copy string 2 over */ |
|
983 while(n > 0 && (*(dst++) = *(src++)) != 0) { |
|
984 --n; |
|
985 } |
|
986 |
|
987 return anchor; |
|
988 } |
|
989 |
|
990 U_CAPI int32_t U_EXPORT2 |
|
991 u_strlen(const UChar *s) |
|
992 { |
|
993 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR |
|
994 return (int32_t)uprv_wcslen(s); |
|
995 #else |
|
996 const UChar *t = s; |
|
997 while(*t != 0) { |
|
998 ++t; |
|
999 } |
|
1000 return t - s; |
|
1001 #endif |
|
1002 } |
|
1003 |
|
1004 U_CAPI int32_t U_EXPORT2 |
|
1005 u_countChar32(const UChar *s, int32_t length) { |
|
1006 int32_t count; |
|
1007 |
|
1008 if(s==NULL || length<-1) { |
|
1009 return 0; |
|
1010 } |
|
1011 |
|
1012 count=0; |
|
1013 if(length>=0) { |
|
1014 while(length>0) { |
|
1015 ++count; |
|
1016 if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) { |
|
1017 s+=2; |
|
1018 length-=2; |
|
1019 } else { |
|
1020 ++s; |
|
1021 --length; |
|
1022 } |
|
1023 } |
|
1024 } else /* length==-1 */ { |
|
1025 UChar c; |
|
1026 |
|
1027 for(;;) { |
|
1028 if((c=*s++)==0) { |
|
1029 break; |
|
1030 } |
|
1031 ++count; |
|
1032 |
|
1033 /* |
|
1034 * sufficient to look ahead one because of UTF-16; |
|
1035 * safe to look ahead one because at worst that would be the terminating NUL |
|
1036 */ |
|
1037 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { |
|
1038 ++s; |
|
1039 } |
|
1040 } |
|
1041 } |
|
1042 return count; |
|
1043 } |
|
1044 |
|
1045 U_CAPI UBool U_EXPORT2 |
|
1046 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { |
|
1047 |
|
1048 if(number<0) { |
|
1049 return TRUE; |
|
1050 } |
|
1051 if(s==NULL || length<-1) { |
|
1052 return FALSE; |
|
1053 } |
|
1054 |
|
1055 if(length==-1) { |
|
1056 /* s is NUL-terminated */ |
|
1057 UChar c; |
|
1058 |
|
1059 /* count code points until they exceed */ |
|
1060 for(;;) { |
|
1061 if((c=*s++)==0) { |
|
1062 return FALSE; |
|
1063 } |
|
1064 if(number==0) { |
|
1065 return TRUE; |
|
1066 } |
|
1067 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { |
|
1068 ++s; |
|
1069 } |
|
1070 --number; |
|
1071 } |
|
1072 } else { |
|
1073 /* length>=0 known */ |
|
1074 const UChar *limit; |
|
1075 int32_t maxSupplementary; |
|
1076 |
|
1077 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ |
|
1078 if(((length+1)/2)>number) { |
|
1079 return TRUE; |
|
1080 } |
|
1081 |
|
1082 /* check if s does not even contain enough UChars */ |
|
1083 maxSupplementary=length-number; |
|
1084 if(maxSupplementary<=0) { |
|
1085 return FALSE; |
|
1086 } |
|
1087 /* there are maxSupplementary=length-number more UChars than asked-for code points */ |
|
1088 |
|
1089 /* |
|
1090 * count code points until they exceed and also check that there are |
|
1091 * no more than maxSupplementary supplementary code points (UChar pairs) |
|
1092 */ |
|
1093 limit=s+length; |
|
1094 for(;;) { |
|
1095 if(s==limit) { |
|
1096 return FALSE; |
|
1097 } |
|
1098 if(number==0) { |
|
1099 return TRUE; |
|
1100 } |
|
1101 if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { |
|
1102 ++s; |
|
1103 if(--maxSupplementary<=0) { |
|
1104 /* too many pairs - too few code points */ |
|
1105 return FALSE; |
|
1106 } |
|
1107 } |
|
1108 --number; |
|
1109 } |
|
1110 } |
|
1111 } |
|
1112 |
|
1113 U_CAPI UChar * U_EXPORT2 |
|
1114 u_memcpy(UChar *dest, const UChar *src, int32_t count) { |
|
1115 if(count > 0) { |
|
1116 uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); |
|
1117 } |
|
1118 return dest; |
|
1119 } |
|
1120 |
|
1121 U_CAPI UChar * U_EXPORT2 |
|
1122 u_memmove(UChar *dest, const UChar *src, int32_t count) { |
|
1123 if(count > 0) { |
|
1124 uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); |
|
1125 } |
|
1126 return dest; |
|
1127 } |
|
1128 |
|
1129 U_CAPI UChar * U_EXPORT2 |
|
1130 u_memset(UChar *dest, UChar c, int32_t count) { |
|
1131 if(count > 0) { |
|
1132 UChar *ptr = dest; |
|
1133 UChar *limit = dest + count; |
|
1134 |
|
1135 while (ptr < limit) { |
|
1136 *(ptr++) = c; |
|
1137 } |
|
1138 } |
|
1139 return dest; |
|
1140 } |
|
1141 |
|
1142 U_CAPI int32_t U_EXPORT2 |
|
1143 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { |
|
1144 if(count > 0) { |
|
1145 const UChar *limit = buf1 + count; |
|
1146 int32_t result; |
|
1147 |
|
1148 while (buf1 < limit) { |
|
1149 result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; |
|
1150 if (result != 0) { |
|
1151 return result; |
|
1152 } |
|
1153 buf1++; |
|
1154 buf2++; |
|
1155 } |
|
1156 } |
|
1157 return 0; |
|
1158 } |
|
1159 |
|
1160 U_CAPI int32_t U_EXPORT2 |
|
1161 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { |
|
1162 return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); |
|
1163 } |
|
1164 |
|
1165 /* u_unescape & support fns ------------------------------------------------- */ |
|
1166 |
|
1167 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ |
|
1168 static const UChar UNESCAPE_MAP[] = { |
|
1169 /*" 0x22, 0x22 */ |
|
1170 /*' 0x27, 0x27 */ |
|
1171 /*? 0x3F, 0x3F */ |
|
1172 /*\ 0x5C, 0x5C */ |
|
1173 /*a*/ 0x61, 0x07, |
|
1174 /*b*/ 0x62, 0x08, |
|
1175 /*e*/ 0x65, 0x1b, |
|
1176 /*f*/ 0x66, 0x0c, |
|
1177 /*n*/ 0x6E, 0x0a, |
|
1178 /*r*/ 0x72, 0x0d, |
|
1179 /*t*/ 0x74, 0x09, |
|
1180 /*v*/ 0x76, 0x0b |
|
1181 }; |
|
1182 enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; |
|
1183 |
|
1184 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ |
|
1185 static int8_t _digit8(UChar c) { |
|
1186 if (c >= 0x0030 && c <= 0x0037) { |
|
1187 return (int8_t)(c - 0x0030); |
|
1188 } |
|
1189 return -1; |
|
1190 } |
|
1191 |
|
1192 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ |
|
1193 static int8_t _digit16(UChar c) { |
|
1194 if (c >= 0x0030 && c <= 0x0039) { |
|
1195 return (int8_t)(c - 0x0030); |
|
1196 } |
|
1197 if (c >= 0x0041 && c <= 0x0046) { |
|
1198 return (int8_t)(c - (0x0041 - 10)); |
|
1199 } |
|
1200 if (c >= 0x0061 && c <= 0x0066) { |
|
1201 return (int8_t)(c - (0x0061 - 10)); |
|
1202 } |
|
1203 return -1; |
|
1204 } |
|
1205 |
|
1206 /* Parse a single escape sequence. Although this method deals in |
|
1207 * UChars, it does not use C++ or UnicodeString. This allows it to |
|
1208 * be used from C contexts. */ |
|
1209 U_CAPI UChar32 U_EXPORT2 |
|
1210 u_unescapeAt(UNESCAPE_CHAR_AT charAt, |
|
1211 int32_t *offset, |
|
1212 int32_t length, |
|
1213 void *context) { |
|
1214 |
|
1215 int32_t start = *offset; |
|
1216 UChar c; |
|
1217 UChar32 result = 0; |
|
1218 int8_t n = 0; |
|
1219 int8_t minDig = 0; |
|
1220 int8_t maxDig = 0; |
|
1221 int8_t bitsPerDigit = 4; |
|
1222 int8_t dig; |
|
1223 int32_t i; |
|
1224 UBool braces = FALSE; |
|
1225 |
|
1226 /* Check that offset is in range */ |
|
1227 if (*offset < 0 || *offset >= length) { |
|
1228 goto err; |
|
1229 } |
|
1230 |
|
1231 /* Fetch first UChar after '\\' */ |
|
1232 c = charAt((*offset)++, context); |
|
1233 |
|
1234 /* Convert hexadecimal and octal escapes */ |
|
1235 switch (c) { |
|
1236 case 0x0075 /*'u'*/: |
|
1237 minDig = maxDig = 4; |
|
1238 break; |
|
1239 case 0x0055 /*'U'*/: |
|
1240 minDig = maxDig = 8; |
|
1241 break; |
|
1242 case 0x0078 /*'x'*/: |
|
1243 minDig = 1; |
|
1244 if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { |
|
1245 ++(*offset); |
|
1246 braces = TRUE; |
|
1247 maxDig = 8; |
|
1248 } else { |
|
1249 maxDig = 2; |
|
1250 } |
|
1251 break; |
|
1252 default: |
|
1253 dig = _digit8(c); |
|
1254 if (dig >= 0) { |
|
1255 minDig = 1; |
|
1256 maxDig = 3; |
|
1257 n = 1; /* Already have first octal digit */ |
|
1258 bitsPerDigit = 3; |
|
1259 result = dig; |
|
1260 } |
|
1261 break; |
|
1262 } |
|
1263 if (minDig != 0) { |
|
1264 while (*offset < length && n < maxDig) { |
|
1265 c = charAt(*offset, context); |
|
1266 dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); |
|
1267 if (dig < 0) { |
|
1268 break; |
|
1269 } |
|
1270 result = (result << bitsPerDigit) | dig; |
|
1271 ++(*offset); |
|
1272 ++n; |
|
1273 } |
|
1274 if (n < minDig) { |
|
1275 goto err; |
|
1276 } |
|
1277 if (braces) { |
|
1278 if (c != 0x7D /*}*/) { |
|
1279 goto err; |
|
1280 } |
|
1281 ++(*offset); |
|
1282 } |
|
1283 if (result < 0 || result >= 0x110000) { |
|
1284 goto err; |
|
1285 } |
|
1286 /* If an escape sequence specifies a lead surrogate, see if |
|
1287 * there is a trail surrogate after it, either as an escape or |
|
1288 * as a literal. If so, join them up into a supplementary. |
|
1289 */ |
|
1290 if (*offset < length && U16_IS_LEAD(result)) { |
|
1291 int32_t ahead = *offset + 1; |
|
1292 c = charAt(*offset, context); |
|
1293 if (c == 0x5C /*'\\'*/ && ahead < length) { |
|
1294 c = (UChar) u_unescapeAt(charAt, &ahead, length, context); |
|
1295 } |
|
1296 if (U16_IS_TRAIL(c)) { |
|
1297 *offset = ahead; |
|
1298 result = U16_GET_SUPPLEMENTARY(result, c); |
|
1299 } |
|
1300 } |
|
1301 return result; |
|
1302 } |
|
1303 |
|
1304 /* Convert C-style escapes in table */ |
|
1305 for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { |
|
1306 if (c == UNESCAPE_MAP[i]) { |
|
1307 return UNESCAPE_MAP[i+1]; |
|
1308 } else if (c < UNESCAPE_MAP[i]) { |
|
1309 break; |
|
1310 } |
|
1311 } |
|
1312 |
|
1313 /* Map \cX to control-X: X & 0x1F */ |
|
1314 if (c == 0x0063 /*'c'*/ && *offset < length) { |
|
1315 c = charAt((*offset)++, context); |
|
1316 if (U16_IS_LEAD(c) && *offset < length) { |
|
1317 UChar c2 = charAt(*offset, context); |
|
1318 if (U16_IS_TRAIL(c2)) { |
|
1319 ++(*offset); |
|
1320 c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */ |
|
1321 } |
|
1322 } |
|
1323 return 0x1F & c; |
|
1324 } |
|
1325 |
|
1326 /* If no special forms are recognized, then consider |
|
1327 * the backslash to generically escape the next character. |
|
1328 * Deal with surrogate pairs. */ |
|
1329 if (U16_IS_LEAD(c) && *offset < length) { |
|
1330 UChar c2 = charAt(*offset, context); |
|
1331 if (U16_IS_TRAIL(c2)) { |
|
1332 ++(*offset); |
|
1333 return U16_GET_SUPPLEMENTARY(c, c2); |
|
1334 } |
|
1335 } |
|
1336 return c; |
|
1337 |
|
1338 err: |
|
1339 /* Invalid escape sequence */ |
|
1340 *offset = start; /* Reset to initial value */ |
|
1341 return (UChar32)0xFFFFFFFF; |
|
1342 } |
|
1343 |
|
1344 /* u_unescapeAt() callback to return a UChar from a char* */ |
|
1345 static UChar U_CALLCONV |
|
1346 _charPtr_charAt(int32_t offset, void *context) { |
|
1347 UChar c16; |
|
1348 /* It would be more efficient to access the invariant tables |
|
1349 * directly but there is no API for that. */ |
|
1350 u_charsToUChars(((char*) context) + offset, &c16, 1); |
|
1351 return c16; |
|
1352 } |
|
1353 |
|
1354 /* Append an escape-free segment of the text; used by u_unescape() */ |
|
1355 static void _appendUChars(UChar *dest, int32_t destCapacity, |
|
1356 const char *src, int32_t srcLen) { |
|
1357 if (destCapacity < 0) { |
|
1358 destCapacity = 0; |
|
1359 } |
|
1360 if (srcLen > destCapacity) { |
|
1361 srcLen = destCapacity; |
|
1362 } |
|
1363 u_charsToUChars(src, dest, srcLen); |
|
1364 } |
|
1365 |
|
1366 /* Do an invariant conversion of char* -> UChar*, with escape parsing */ |
|
1367 U_CAPI int32_t U_EXPORT2 |
|
1368 u_unescape(const char *src, UChar *dest, int32_t destCapacity) { |
|
1369 const char *segment = src; |
|
1370 int32_t i = 0; |
|
1371 char c; |
|
1372 |
|
1373 while ((c=*src) != 0) { |
|
1374 /* '\\' intentionally written as compiler-specific |
|
1375 * character constant to correspond to compiler-specific |
|
1376 * char* constants. */ |
|
1377 if (c == '\\') { |
|
1378 int32_t lenParsed = 0; |
|
1379 UChar32 c32; |
|
1380 if (src != segment) { |
|
1381 if (dest != NULL) { |
|
1382 _appendUChars(dest + i, destCapacity - i, |
|
1383 segment, (int32_t)(src - segment)); |
|
1384 } |
|
1385 i += (int32_t)(src - segment); |
|
1386 } |
|
1387 ++src; /* advance past '\\' */ |
|
1388 c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); |
|
1389 if (lenParsed == 0) { |
|
1390 goto err; |
|
1391 } |
|
1392 src += lenParsed; /* advance past escape seq. */ |
|
1393 if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) { |
|
1394 U16_APPEND_UNSAFE(dest, i, c32); |
|
1395 } else { |
|
1396 i += U16_LENGTH(c32); |
|
1397 } |
|
1398 segment = src; |
|
1399 } else { |
|
1400 ++src; |
|
1401 } |
|
1402 } |
|
1403 if (src != segment) { |
|
1404 if (dest != NULL) { |
|
1405 _appendUChars(dest + i, destCapacity - i, |
|
1406 segment, (int32_t)(src - segment)); |
|
1407 } |
|
1408 i += (int32_t)(src - segment); |
|
1409 } |
|
1410 if (dest != NULL && i < destCapacity) { |
|
1411 dest[i] = 0; |
|
1412 } |
|
1413 return i; |
|
1414 |
|
1415 err: |
|
1416 if (dest != NULL && destCapacity > 0) { |
|
1417 *dest = 0; |
|
1418 } |
|
1419 return 0; |
|
1420 } |
|
1421 |
|
1422 /* NUL-termination of strings ----------------------------------------------- */ |
|
1423 |
|
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing if pErrorCode is NULL or already holds a failure, or if
 * length is negative. Otherwise:
 *   length <  destCapacity: writes dest[length]=0 and clears a pending
 *                           U_STRING_NOT_TERMINATED_WARNING
 *   length == destCapacity: sets U_STRING_NOT_TERMINATED_WARNING
 *   length >  destCapacity: sets U_BUFFER_OVERFLOW_ERROR
 *
 * Expands to a single statement (do { } while(0)), so it is safe inside an
 * unbraced if/else; arguments are parenthesized but may be evaluated more
 * than once, so they must be free of side effects.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
    do { \
        if((pErrorCode)!=NULL && U_SUCCESS(*(pErrorCode))) { \
            /* not a public function, so no complete argument checking */ \
            \
            if((length)<0) { \
                /* assume that the caller handles this */ \
            } else if((length)<(destCapacity)) { \
                /* NUL-terminate the string, the NUL fits */ \
                (dest)[(length)]=0; \
                /* unset the not-terminated warning but leave all others */ \
                if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) { \
                    *(pErrorCode)=U_ZERO_ERROR; \
                } \
            } else if((length)==(destCapacity)) { \
                /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
                *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING; \
            } else /* length>destCapacity */ { \
                /* even the string itself did not fit - set an error code */ \
                *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR; \
            } \
        } \
    } while(0)
|
1449 |
|
1450 U_CAPI int32_t U_EXPORT2 |
|
1451 u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
|
1452 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
|
1453 return length; |
|
1454 } |
|
1455 |
|
1456 U_CAPI int32_t U_EXPORT2 |
|
1457 u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
|
1458 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
|
1459 return length; |
|
1460 } |
|
1461 |
|
1462 U_CAPI int32_t U_EXPORT2 |
|
1463 u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
|
1464 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
|
1465 return length; |
|
1466 } |
|
1467 |
|
1468 U_CAPI int32_t U_EXPORT2 |
|
1469 u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { |
|
1470 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); |
|
1471 return length; |
|
1472 } |
|
1473 |
|
1474 // Compute the hash code for a string -------------------------------------- *** |
|
1475 |
|
1476 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend |
|
1477 // on UHashtable code. |
|
1478 |
|
/*
  Compute the hash by iterating sparsely over about 32 (up to 63)
  characters spaced evenly through the string.  For each character,
  multiply the previous hash value by a prime number and add the new
  character in, like a linear congruential random number generator,
  producing a pseudorandom deterministic value well distributed over
  the output range. [LIU]

  The macro forms a complete function body: it declares the locals
  `hash` and `p` and ends in a return statement, so the caller supplies
  the trailing semicolon. DEREF is an expression over `p` that yields the
  value of the current character. A NULL STR hashes to 0.

  The hash is accumulated in a uint32_t, where wrap-around is well defined
  (the same arithmetic on int32_t would be signed overflow), and converted
  to int32_t only for the return value.
*/

#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
    uint32_t hash = 0;                        \
    const TYPE *p = (const TYPE*) (STR);      \
    if (p != NULL) {                          \
        int32_t len = (int32_t)(STRLEN);      \
        /* step 1 up to 63 units, then 2, ... so ~32 samples are taken */ \
        int32_t inc = ((len - 32) / 32) + 1;  \
        const TYPE *limit = p + len;          \
        while (p<limit) {                     \
            hash = (hash * 37) + (uint32_t)(DEREF); \
            p += inc;                         \
        }                                     \
    }                                         \
    return (int32_t)hash
|
1501 |
|
1502 /* Used by UnicodeString to compute its hashcode - Not public API. */ |
|
1503 U_CAPI int32_t U_EXPORT2 |
|
1504 ustr_hashUCharsN(const UChar *str, int32_t length) { |
|
1505 STRING_HASH(UChar, str, length, *p); |
|
1506 } |
|
1507 |
|
1508 U_CAPI int32_t U_EXPORT2 |
|
1509 ustr_hashCharsN(const char *str, int32_t length) { |
|
1510 STRING_HASH(uint8_t, str, length, *p); |
|
1511 } |
|
1512 |
|
1513 U_CAPI int32_t U_EXPORT2 |
|
1514 ustr_hashICharsN(const char *str, int32_t length) { |
|
1515 STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p)); |
|
1516 } |