1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ustring.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1516 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 1998-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* 1.12 +* File ustring.cpp 1.13 +* 1.14 +* Modification History: 1.15 +* 1.16 +* Date Name Description 1.17 +* 12/07/98 bertrand Creation. 1.18 +****************************************************************************** 1.19 +*/ 1.20 + 1.21 +#include "unicode/utypes.h" 1.22 +#include "unicode/putil.h" 1.23 +#include "unicode/ustring.h" 1.24 +#include "unicode/utf16.h" 1.25 +#include "cstring.h" 1.26 +#include "cwchar.h" 1.27 +#include "cmemory.h" 1.28 +#include "ustr_imp.h" 1.29 + 1.30 +/* ANSI string.h - style functions ------------------------------------------ */ 1.31 + 1.32 +/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ 1.33 +#define U_BMP_MAX 0xffff 1.34 + 1.35 +/* Forward binary string search functions ----------------------------------- */ 1.36 + 1.37 +/* 1.38 + * Test if a substring match inside a string is at code point boundaries. 1.39 + * All pointers refer to the same buffer. 1.40 + * The limit pointer may be NULL, all others must be real pointers. 1.41 + */ 1.42 +static inline UBool 1.43 +isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { 1.44 + if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { 1.45 + /* the leading edge of the match is in the middle of a surrogate pair */ 1.46 + return FALSE; 1.47 + } 1.48 + if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { 1.49 + /* the trailing edge of the match is in the middle of a surrogate pair */ 1.50 + return FALSE; 1.51 + } 1.52 + return TRUE; 1.53 +} 1.54 + 1.55 +U_CAPI UChar * U_EXPORT2 1.56 +u_strFindFirst(const UChar *s, int32_t length, 1.57 + const UChar *sub, int32_t subLength) { 1.58 + const UChar *start, *p, *q, *subLimit; 1.59 + UChar c, cs, cq; 1.60 + 1.61 + if(sub==NULL || subLength<-1) { 1.62 + return (UChar *)s; 1.63 + } 1.64 + if(s==NULL || length<-1) { 1.65 + return NULL; 1.66 + } 1.67 + 1.68 + start=s; 1.69 + 1.70 + if(length<0 && subLength<0) { 1.71 + /* both strings are NUL-terminated */ 1.72 + if((cs=*sub++)==0) { 1.73 + return (UChar *)s; 1.74 + } 1.75 + if(*sub==0 && !U16_IS_SURROGATE(cs)) { 1.76 + /* the substring consists of a single, non-surrogate BMP code point */ 1.77 + return u_strchr(s, cs); 1.78 + } 1.79 + 1.80 + while((c=*s++)!=0) { 1.81 + if(c==cs) { 1.82 + /* found first substring UChar, compare rest */ 1.83 + p=s; 1.84 + q=sub; 1.85 + for(;;) { 1.86 + if((cq=*q)==0) { 1.87 + if(isMatchAtCPBoundary(start, s-1, p, NULL)) { 1.88 + return (UChar *)(s-1); /* well-formed match */ 1.89 + } else { 1.90 + break; /* no match because surrogate pair is split */ 1.91 + } 1.92 + } 1.93 + if((c=*p)==0) { 1.94 + return NULL; /* no match, and none possible after s */ 1.95 + } 1.96 + if(c!=cq) { 1.97 + break; /* no match */ 1.98 + } 1.99 + ++p; 1.100 + ++q; 1.101 + } 1.102 + } 1.103 + } 1.104 + 1.105 + /* not found */ 1.106 + return NULL; 1.107 + } 1.108 + 1.109 + if(subLength<0) { 1.110 + subLength=u_strlen(sub); 1.111 + } 1.112 + if(subLength==0) { 1.113 + return (UChar *)s; 1.114 + } 1.115 + 1.116 + /* get sub[0] to search for it fast */ 1.117 + cs=*sub++; 1.118 + --subLength; 1.119 + subLimit=sub+subLength; 1.120 + 1.121 + if(subLength==0 && !U16_IS_SURROGATE(cs)) { 1.122 + /* the substring consists of a single, non-surrogate BMP code point */ 1.123 + return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); 1.124 + } 1.125 + 1.126 + if(length<0) { 1.127 + /* s is NUL-terminated */ 1.128 + while((c=*s++)!=0) { 1.129 + if(c==cs) { 1.130 + /* found first substring UChar, compare rest */ 1.131 + p=s; 1.132 + q=sub; 1.133 + for(;;) { 1.134 + if(q==subLimit) { 1.135 + if(isMatchAtCPBoundary(start, s-1, p, NULL)) { 1.136 + return (UChar *)(s-1); /* well-formed match */ 1.137 + } else { 1.138 + break; /* no match because surrogate pair is split */ 1.139 + } 1.140 + } 1.141 + if((c=*p)==0) { 1.142 + return NULL; /* no match, and none possible after s */ 1.143 + } 1.144 + if(c!=*q) { 1.145 + break; /* no match */ 1.146 + } 1.147 + ++p; 1.148 + ++q; 1.149 + } 1.150 + } 1.151 + } 1.152 + } else { 1.153 + const UChar *limit, *preLimit; 1.154 + 1.155 + /* subLength was decremented above */ 1.156 + if(length<=subLength) { 1.157 + return NULL; /* s is shorter than sub */ 1.158 + } 1.159 + 1.160 + limit=s+length; 1.161 + 1.162 + /* the substring must start before preLimit */ 1.163 + preLimit=limit-subLength; 1.164 + 1.165 + while(s!=preLimit) { 1.166 + c=*s++; 1.167 + if(c==cs) { 1.168 + /* found first substring UChar, compare rest */ 1.169 + p=s; 1.170 + q=sub; 1.171 + for(;;) { 1.172 + if(q==subLimit) { 1.173 + if(isMatchAtCPBoundary(start, s-1, p, limit)) { 1.174 + return (UChar *)(s-1); /* well-formed match */ 1.175 + } else { 1.176 + break; /* no match because surrogate pair is split */ 1.177 + } 1.178 + } 1.179 + if(*p!=*q) { 1.180 + break; /* no match */ 1.181 + } 1.182 + ++p; 1.183 + ++q; 1.184 + } 1.185 + } 1.186 + } 1.187 + } 1.188 + 1.189 + /* not found */ 1.190 + return NULL; 1.191 +} 1.192 + 1.193 +U_CAPI UChar * U_EXPORT2 1.194 +u_strstr(const UChar *s, const UChar *substring) { 1.195 + return u_strFindFirst(s, -1, substring, -1); 1.196 +} 1.197 + 1.198 +U_CAPI UChar * U_EXPORT2 1.199 +u_strchr(const UChar *s, UChar c) { 1.200 + if(U16_IS_SURROGATE(c)) { 1.201 + /* make sure to not find half of a surrogate pair */ 1.202 + return u_strFindFirst(s, -1, &c, 1); 1.203 + } else { 1.204 + UChar cs; 1.205 + 1.206 + /* trivial search for a BMP code point */ 1.207 + for(;;) { 1.208 + if((cs=*s)==c) { 1.209 + return (UChar *)s; 1.210 + } 1.211 + if(cs==0) { 1.212 + return NULL; 1.213 + } 1.214 + ++s; 1.215 + } 1.216 + } 1.217 +} 1.218 + 1.219 +U_CAPI UChar * U_EXPORT2 1.220 +u_strchr32(const UChar *s, UChar32 c) { 1.221 + if((uint32_t)c<=U_BMP_MAX) { 1.222 + /* find BMP code point */ 1.223 + return u_strchr(s, (UChar)c); 1.224 + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 1.225 + /* find supplementary code point as surrogate pair */ 1.226 + UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); 1.227 + 1.228 + while((cs=*s++)!=0) { 1.229 + if(cs==lead && *s==trail) { 1.230 + return (UChar *)(s-1); 1.231 + } 1.232 + } 1.233 + return NULL; 1.234 + } else { 1.235 + /* not a Unicode code point, not findable */ 1.236 + return NULL; 1.237 + } 1.238 +} 1.239 + 1.240 +U_CAPI UChar * U_EXPORT2 1.241 +u_memchr(const UChar *s, UChar c, int32_t count) { 1.242 + if(count<=0) { 1.243 + return NULL; /* no string */ 1.244 + } else if(U16_IS_SURROGATE(c)) { 1.245 + /* make sure to not find half of a surrogate pair */ 1.246 + return u_strFindFirst(s, count, &c, 1); 1.247 + } else { 1.248 + /* trivial search for a BMP code point */ 1.249 + const UChar *limit=s+count; 1.250 + do { 1.251 + if(*s==c) { 1.252 + return (UChar *)s; 1.253 + } 1.254 + } while(++s!=limit); 1.255 + return NULL; 1.256 + } 1.257 +} 1.258 + 1.259 +U_CAPI UChar * U_EXPORT2 1.260 +u_memchr32(const UChar *s, UChar32 c, int32_t count) { 1.261 + if((uint32_t)c<=U_BMP_MAX) { 1.262 + /* find BMP code point */ 1.263 + return u_memchr(s, (UChar)c, count); 1.264 + } else if(count<2) { 1.265 + /* too short for a surrogate pair */ 1.266 + return NULL; 1.267 + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 1.268 + /* find supplementary code point as surrogate pair */ 1.269 + const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ 1.270 + UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); 1.271 + 1.272 + do { 1.273 + if(*s==lead && *(s+1)==trail) { 1.274 + return (UChar *)s; 1.275 + } 1.276 + } while(++s!=limit); 1.277 + return NULL; 1.278 + } else { 1.279 + /* not a Unicode code point, not findable */ 1.280 + return NULL; 1.281 + } 1.282 +} 1.283 + 1.284 +/* Backward binary string search functions ---------------------------------- */ 1.285 + 1.286 +U_CAPI UChar * U_EXPORT2 1.287 +u_strFindLast(const UChar *s, int32_t length, 1.288 + const UChar *sub, int32_t subLength) { 1.289 + const UChar *start, *limit, *p, *q, *subLimit; 1.290 + UChar c, cs; 1.291 + 1.292 + if(sub==NULL || subLength<-1) { 1.293 + return (UChar *)s; 1.294 + } 1.295 + if(s==NULL || length<-1) { 1.296 + return NULL; 1.297 + } 1.298 + 1.299 + /* 1.300 + * This implementation is more lazy than the one for u_strFindFirst(): 1.301 + * There is no special search code for NUL-terminated strings. 1.302 + * It does not seem to be worth it for searching substrings to 1.303 + * search forward and find all matches like in u_strrchr() and similar. 1.304 + * Therefore, we simply get both string lengths and search backward. 1.305 + * 1.306 + * markus 2002oct23 1.307 + */ 1.308 + 1.309 + if(subLength<0) { 1.310 + subLength=u_strlen(sub); 1.311 + } 1.312 + if(subLength==0) { 1.313 + return (UChar *)s; 1.314 + } 1.315 + 1.316 + /* get sub[subLength-1] to search for it fast */ 1.317 + subLimit=sub+subLength; 1.318 + cs=*(--subLimit); 1.319 + --subLength; 1.320 + 1.321 + if(subLength==0 && !U16_IS_SURROGATE(cs)) { 1.322 + /* the substring consists of a single, non-surrogate BMP code point */ 1.323 + return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); 1.324 + } 1.325 + 1.326 + if(length<0) { 1.327 + length=u_strlen(s); 1.328 + } 1.329 + 1.330 + /* subLength was decremented above */ 1.331 + if(length<=subLength) { 1.332 + return NULL; /* s is shorter than sub */ 1.333 + } 1.334 + 1.335 + start=s; 1.336 + limit=s+length; 1.337 + 1.338 + /* the substring must start no later than s+subLength */ 1.339 + s+=subLength; 1.340 + 1.341 + while(s!=limit) { 1.342 + c=*(--limit); 1.343 + if(c==cs) { 1.344 + /* found last substring UChar, compare rest */ 1.345 + p=limit; 1.346 + q=subLimit; 1.347 + for(;;) { 1.348 + if(q==sub) { 1.349 + if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { 1.350 + return (UChar *)p; /* well-formed match */ 1.351 + } else { 1.352 + break; /* no match because surrogate pair is split */ 1.353 + } 1.354 + } 1.355 + if(*(--p)!=*(--q)) { 1.356 + break; /* no match */ 1.357 + } 1.358 + } 1.359 + } 1.360 + } 1.361 + 1.362 + /* not found */ 1.363 + return NULL; 1.364 +} 1.365 + 1.366 +U_CAPI UChar * U_EXPORT2 1.367 +u_strrstr(const UChar *s, const UChar *substring) { 1.368 + return u_strFindLast(s, -1, substring, -1); 1.369 +} 1.370 + 1.371 +U_CAPI UChar * U_EXPORT2 1.372 +u_strrchr(const UChar *s, UChar c) { 1.373 + if(U16_IS_SURROGATE(c)) { 1.374 + /* make sure to not find half of a surrogate pair */ 1.375 + return u_strFindLast(s, -1, &c, 1); 1.376 + } else { 1.377 + const UChar *result=NULL; 1.378 + UChar cs; 1.379 + 1.380 + /* trivial search for a BMP code point */ 1.381 + for(;;) { 1.382 + if((cs=*s)==c) { 1.383 + result=s; 1.384 + } 1.385 + if(cs==0) { 1.386 + return (UChar *)result; 1.387 + } 1.388 + ++s; 1.389 + } 1.390 + } 1.391 +} 1.392 + 1.393 +U_CAPI UChar * U_EXPORT2 1.394 +u_strrchr32(const UChar *s, UChar32 c) { 1.395 + if((uint32_t)c<=U_BMP_MAX) { 1.396 + /* find BMP code point */ 1.397 + return u_strrchr(s, (UChar)c); 1.398 + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 1.399 + /* find supplementary code point as surrogate pair */ 1.400 + const UChar *result=NULL; 1.401 + UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); 1.402 + 1.403 + while((cs=*s++)!=0) { 1.404 + if(cs==lead && *s==trail) { 1.405 + result=s-1; 1.406 + } 1.407 + } 1.408 + return (UChar *)result; 1.409 + } else { 1.410 + /* not a Unicode code point, not findable */ 1.411 + return NULL; 1.412 + } 1.413 +} 1.414 + 1.415 +U_CAPI UChar * U_EXPORT2 1.416 +u_memrchr(const UChar *s, UChar c, int32_t count) { 1.417 + if(count<=0) { 1.418 + return NULL; /* no string */ 1.419 + } else if(U16_IS_SURROGATE(c)) { 1.420 + /* make sure to not find half of a surrogate pair */ 1.421 + return u_strFindLast(s, count, &c, 1); 1.422 + } else { 1.423 + /* trivial search for a BMP code point */ 1.424 + const UChar *limit=s+count; 1.425 + do { 1.426 + if(*(--limit)==c) { 1.427 + return (UChar *)limit; 1.428 + } 1.429 + } while(s!=limit); 1.430 + return NULL; 1.431 + } 1.432 +} 1.433 + 1.434 +U_CAPI UChar * U_EXPORT2 1.435 +u_memrchr32(const UChar *s, UChar32 c, int32_t count) { 1.436 + if((uint32_t)c<=U_BMP_MAX) { 1.437 + /* find BMP code point */ 1.438 + return u_memrchr(s, (UChar)c, count); 1.439 + } else if(count<2) { 1.440 + /* too short for a surrogate pair */ 1.441 + return NULL; 1.442 + } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 1.443 + /* find supplementary code point as surrogate pair */ 1.444 + const UChar *limit=s+count-1; 1.445 + UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); 1.446 + 1.447 + do { 1.448 + if(*limit==trail && *(limit-1)==lead) { 1.449 + return (UChar *)(limit-1); 1.450 + } 1.451 + } while(s!=--limit); 1.452 + return NULL; 1.453 + } else { 1.454 + /* not a Unicode code point, not findable */ 1.455 + return NULL; 1.456 + } 1.457 +} 1.458 + 1.459 +/* Tokenization functions --------------------------------------------------- */ 1.460 + 1.461 +/* 1.462 + * Match each code point in a string against each code point in the matchSet. 1.463 + * Return the index of the first string code point that 1.464 + * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. 1.465 + * Return -(string length)-1 if there is no such code point. 1.466 + */ 1.467 +static int32_t 1.468 +_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { 1.469 + int32_t matchLen, matchBMPLen, strItr, matchItr; 1.470 + UChar32 stringCh, matchCh; 1.471 + UChar c, c2; 1.472 + 1.473 + /* first part of matchSet contains only BMP code points */ 1.474 + matchBMPLen = 0; 1.475 + while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { 1.476 + ++matchBMPLen; 1.477 + } 1.478 + 1.479 + /* second part of matchSet contains BMP and supplementary code points */ 1.480 + matchLen = matchBMPLen; 1.481 + while(matchSet[matchLen] != 0) { 1.482 + ++matchLen; 1.483 + } 1.484 + 1.485 + for(strItr = 0; (c = string[strItr]) != 0;) { 1.486 + ++strItr; 1.487 + if(U16_IS_SINGLE(c)) { 1.488 + if(polarity) { 1.489 + for(matchItr = 0; matchItr < matchLen; ++matchItr) { 1.490 + if(c == matchSet[matchItr]) { 1.491 + return strItr - 1; /* one matches */ 1.492 + } 1.493 + } 1.494 + } else { 1.495 + for(matchItr = 0; matchItr < matchLen; ++matchItr) { 1.496 + if(c == matchSet[matchItr]) { 1.497 + goto endloop; 1.498 + } 1.499 + } 1.500 + return strItr - 1; /* none matches */ 1.501 + } 1.502 + } else { 1.503 + /* 1.504 + * No need to check for string length before U16_IS_TRAIL 1.505 + * because c2 could at worst be the terminating NUL. 1.506 + */ 1.507 + if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { 1.508 + ++strItr; 1.509 + stringCh = U16_GET_SUPPLEMENTARY(c, c2); 1.510 + } else { 1.511 + stringCh = c; /* unpaired trail surrogate */ 1.512 + } 1.513 + 1.514 + if(polarity) { 1.515 + for(matchItr = matchBMPLen; matchItr < matchLen;) { 1.516 + U16_NEXT(matchSet, matchItr, matchLen, matchCh); 1.517 + if(stringCh == matchCh) { 1.518 + return strItr - U16_LENGTH(stringCh); /* one matches */ 1.519 + } 1.520 + } 1.521 + } else { 1.522 + for(matchItr = matchBMPLen; matchItr < matchLen;) { 1.523 + U16_NEXT(matchSet, matchItr, matchLen, matchCh); 1.524 + if(stringCh == matchCh) { 1.525 + goto endloop; 1.526 + } 1.527 + } 1.528 + return strItr - U16_LENGTH(stringCh); /* none matches */ 1.529 + } 1.530 + } 1.531 +endloop: 1.532 + /* wish C had continue with labels like Java... */; 1.533 + } 1.534 + 1.535 + /* Didn't find it. */ 1.536 + return -strItr-1; 1.537 +} 1.538 + 1.539 +/* Search for a codepoint in a string that matches one of the matchSet codepoints. */ 1.540 +U_CAPI UChar * U_EXPORT2 1.541 +u_strpbrk(const UChar *string, const UChar *matchSet) 1.542 +{ 1.543 + int32_t idx = _matchFromSet(string, matchSet, TRUE); 1.544 + if(idx >= 0) { 1.545 + return (UChar *)string + idx; 1.546 + } else { 1.547 + return NULL; 1.548 + } 1.549 +} 1.550 + 1.551 +/* Search for a codepoint in a string that matches one of the matchSet codepoints. */ 1.552 +U_CAPI int32_t U_EXPORT2 1.553 +u_strcspn(const UChar *string, const UChar *matchSet) 1.554 +{ 1.555 + int32_t idx = _matchFromSet(string, matchSet, TRUE); 1.556 + if(idx >= 0) { 1.557 + return idx; 1.558 + } else { 1.559 + return -idx - 1; /* == u_strlen(string) */ 1.560 + } 1.561 +} 1.562 + 1.563 +/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ 1.564 +U_CAPI int32_t U_EXPORT2 1.565 +u_strspn(const UChar *string, const UChar *matchSet) 1.566 +{ 1.567 + int32_t idx = _matchFromSet(string, matchSet, FALSE); 1.568 + if(idx >= 0) { 1.569 + return idx; 1.570 + } else { 1.571 + return -idx - 1; /* == u_strlen(string) */ 1.572 + } 1.573 +} 1.574 + 1.575 +/* ----- Text manipulation functions --- */ 1.576 + 1.577 +U_CAPI UChar* U_EXPORT2 1.578 +u_strtok_r(UChar *src, 1.579 + const UChar *delim, 1.580 + UChar **saveState) 1.581 +{ 1.582 + UChar *tokSource; 1.583 + UChar *nextToken; 1.584 + uint32_t nonDelimIdx; 1.585 + 1.586 + /* If saveState is NULL, the user messed up. */ 1.587 + if (src != NULL) { 1.588 + tokSource = src; 1.589 + *saveState = src; /* Set to "src" in case there are no delimiters */ 1.590 + } 1.591 + else if (*saveState) { 1.592 + tokSource = *saveState; 1.593 + } 1.594 + else { 1.595 + /* src == NULL && *saveState == NULL */ 1.596 + /* This shouldn't happen. We already finished tokenizing. */ 1.597 + return NULL; 1.598 + } 1.599 + 1.600 + /* Skip initial delimiters */ 1.601 + nonDelimIdx = u_strspn(tokSource, delim); 1.602 + tokSource = &tokSource[nonDelimIdx]; 1.603 + 1.604 + if (*tokSource) { 1.605 + nextToken = u_strpbrk(tokSource, delim); 1.606 + if (nextToken != NULL) { 1.607 + /* Create a token */ 1.608 + *(nextToken++) = 0; 1.609 + *saveState = nextToken; 1.610 + return tokSource; 1.611 + } 1.612 + else if (*saveState) { 1.613 + /* Return the last token */ 1.614 + *saveState = NULL; 1.615 + return tokSource; 1.616 + } 1.617 + } 1.618 + else { 1.619 + /* No tokens were found. Only delimiters were left. */ 1.620 + *saveState = NULL; 1.621 + } 1.622 + return NULL; 1.623 +} 1.624 + 1.625 +/* Miscellaneous functions -------------------------------------------------- */ 1.626 + 1.627 +U_CAPI UChar* U_EXPORT2 1.628 +u_strcat(UChar *dst, 1.629 + const UChar *src) 1.630 +{ 1.631 + UChar *anchor = dst; /* save a pointer to start of dst */ 1.632 + 1.633 + while(*dst != 0) { /* To end of first string */ 1.634 + ++dst; 1.635 + } 1.636 + while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ 1.637 + } 1.638 + 1.639 + return anchor; 1.640 +} 1.641 + 1.642 +U_CAPI UChar* U_EXPORT2 1.643 +u_strncat(UChar *dst, 1.644 + const UChar *src, 1.645 + int32_t n ) 1.646 +{ 1.647 + if(n > 0) { 1.648 + UChar *anchor = dst; /* save a pointer to start of dst */ 1.649 + 1.650 + while(*dst != 0) { /* To end of first string */ 1.651 + ++dst; 1.652 + } 1.653 + while((*dst = *src) != 0) { /* copy string 2 over */ 1.654 + ++dst; 1.655 + if(--n == 0) { 1.656 + *dst = 0; 1.657 + break; 1.658 + } 1.659 + ++src; 1.660 + } 1.661 + 1.662 + return anchor; 1.663 + } else { 1.664 + return dst; 1.665 + } 1.666 +} 1.667 + 1.668 +/* ----- Text property functions --- */ 1.669 + 1.670 +U_CAPI int32_t U_EXPORT2 1.671 +u_strcmp(const UChar *s1, 1.672 + const UChar *s2) 1.673 +{ 1.674 + UChar c1, c2; 1.675 + 1.676 + for(;;) { 1.677 + c1=*s1++; 1.678 + c2=*s2++; 1.679 + if (c1 != c2 || c1 == 0) { 1.680 + break; 1.681 + } 1.682 + } 1.683 + return (int32_t)c1 - (int32_t)c2; 1.684 +} 1.685 + 1.686 +U_CFUNC int32_t U_EXPORT2 1.687 +uprv_strCompare(const UChar *s1, int32_t length1, 1.688 + const UChar *s2, int32_t length2, 1.689 + UBool strncmpStyle, UBool codePointOrder) { 1.690 + const UChar *start1, *start2, *limit1, *limit2; 1.691 + UChar c1, c2; 1.692 + 1.693 + /* setup for fix-up */ 1.694 + start1=s1; 1.695 + start2=s2; 1.696 + 1.697 + /* compare identical prefixes - they do not need to be fixed up */ 1.698 + if(length1<0 && length2<0) { 1.699 + /* strcmp style, both NUL-terminated */ 1.700 + if(s1==s2) { 1.701 + return 0; 1.702 + } 1.703 + 1.704 + for(;;) { 1.705 + c1=*s1; 1.706 + c2=*s2; 1.707 + if(c1!=c2) { 1.708 + break; 1.709 + } 1.710 + if(c1==0) { 1.711 + return 0; 1.712 + } 1.713 + ++s1; 1.714 + ++s2; 1.715 + } 1.716 + 1.717 + /* setup for fix-up */ 1.718 + limit1=limit2=NULL; 1.719 + } else if(strncmpStyle) { 1.720 + /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ 1.721 + if(s1==s2) { 1.722 + return 0; 1.723 + } 1.724 + 1.725 + limit1=start1+length1; 1.726 + 1.727 + for(;;) { 1.728 + /* both lengths are same, check only one limit */ 1.729 + if(s1==limit1) { 1.730 + return 0; 1.731 + } 1.732 + 1.733 + c1=*s1; 1.734 + c2=*s2; 1.735 + if(c1!=c2) { 1.736 + break; 1.737 + } 1.738 + if(c1==0) { 1.739 + return 0; 1.740 + } 1.741 + ++s1; 1.742 + ++s2; 1.743 + } 1.744 + 1.745 + /* setup for fix-up */ 1.746 + limit2=start2+length1; /* use length1 here, too, to enforce assumption */ 1.747 + } else { 1.748 + /* memcmp/UnicodeString style, both length-specified */ 1.749 + int32_t lengthResult; 1.750 + 1.751 + if(length1<0) { 1.752 + length1=u_strlen(s1); 1.753 + } 1.754 + if(length2<0) { 1.755 + length2=u_strlen(s2); 1.756 + } 1.757 + 1.758 + /* limit1=start1+min(lenght1, length2) */ 1.759 + if(length1<length2) { 1.760 + lengthResult=-1; 1.761 + limit1=start1+length1; 1.762 + } else if(length1==length2) { 1.763 + lengthResult=0; 1.764 + limit1=start1+length1; 1.765 + } else /* length1>length2 */ { 1.766 + lengthResult=1; 1.767 + limit1=start1+length2; 1.768 + } 1.769 + 1.770 + if(s1==s2) { 1.771 + return lengthResult; 1.772 + } 1.773 + 1.774 + for(;;) { 1.775 + /* check pseudo-limit */ 1.776 + if(s1==limit1) { 1.777 + return lengthResult; 1.778 + } 1.779 + 1.780 + c1=*s1; 1.781 + c2=*s2; 1.782 + if(c1!=c2) { 1.783 + break; 1.784 + } 1.785 + ++s1; 1.786 + ++s2; 1.787 + } 1.788 + 1.789 + /* setup for fix-up */ 1.790 + limit1=start1+length1; 1.791 + limit2=start2+length2; 1.792 + } 1.793 + 1.794 + /* if both values are in or above the surrogate range, fix them up */ 1.795 + if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { 1.796 + /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 1.797 + if( 1.798 + (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) || 1.799 + (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1))) 1.800 + ) { 1.801 + /* part of a surrogate pair, leave >=d800 */ 1.802 + } else { 1.803 + /* BMP code point - may be surrogate code point - make <d800 */ 1.804 + c1-=0x2800; 1.805 + } 1.806 + 1.807 + if( 1.808 + (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) || 1.809 + (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1))) 1.810 + ) { 1.811 + /* part of a surrogate pair, leave >=d800 */ 1.812 + } else { 1.813 + /* BMP code point - may be surrogate code point - make <d800 */ 1.814 + c2-=0x2800; 1.815 + } 1.816 + } 1.817 + 1.818 + /* now c1 and c2 are in the requested (code unit or code point) order */ 1.819 + return (int32_t)c1-(int32_t)c2; 1.820 +} 1.821 + 1.822 +/* 1.823 + * Compare two strings as presented by UCharIterators. 1.824 + * Use code unit or code point order. 1.825 + * When the function returns, it is undefined where the iterators 1.826 + * have stopped. 1.827 + */ 1.828 +U_CAPI int32_t U_EXPORT2 1.829 +u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { 1.830 + UChar32 c1, c2; 1.831 + 1.832 + /* argument checking */ 1.833 + if(iter1==NULL || iter2==NULL) { 1.834 + return 0; /* bad arguments */ 1.835 + } 1.836 + if(iter1==iter2) { 1.837 + return 0; /* identical iterators */ 1.838 + } 1.839 + 1.840 + /* reset iterators to start? */ 1.841 + iter1->move(iter1, 0, UITER_START); 1.842 + iter2->move(iter2, 0, UITER_START); 1.843 + 1.844 + /* compare identical prefixes - they do not need to be fixed up */ 1.845 + for(;;) { 1.846 + c1=iter1->next(iter1); 1.847 + c2=iter2->next(iter2); 1.848 + if(c1!=c2) { 1.849 + break; 1.850 + } 1.851 + if(c1==-1) { 1.852 + return 0; 1.853 + } 1.854 + } 1.855 + 1.856 + /* if both values are in or above the surrogate range, fix them up */ 1.857 + if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { 1.858 + /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 1.859 + if( 1.860 + (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) || 1.861 + (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1)))) 1.862 + ) { 1.863 + /* part of a surrogate pair, leave >=d800 */ 1.864 + } else { 1.865 + /* BMP code point - may be surrogate code point - make <d800 */ 1.866 + c1-=0x2800; 1.867 + } 1.868 + 1.869 + if( 1.870 + (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) || 1.871 + (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2)))) 1.872 + ) { 1.873 + /* part of a surrogate pair, leave >=d800 */ 1.874 + } else { 1.875 + /* BMP code point - may be surrogate code point - make <d800 */ 1.876 + c2-=0x2800; 1.877 + } 1.878 + } 1.879 + 1.880 + /* now c1 and c2 are in the requested (code unit or code point) order */ 1.881 + return (int32_t)c1-(int32_t)c2; 1.882 +} 1.883 + 1.884 +#if 0 1.885 +/* 1.886 + * u_strCompareIter() does not leave the iterators _on_ the different units. 1.887 + * This is possible but would cost a few extra indirect function calls to back 1.888 + * up if the last unit (c1 or c2 respectively) was >=0. 1.889 + * 1.890 + * Consistently leaving them _behind_ the different units is not an option 1.891 + * because the current "unit" is the end of the string if that is reached, 1.892 + * and in such a case the iterator does not move. 1.893 + * For example, when comparing "ab" with "abc", both iterators rest _on_ the end 1.894 + * of their strings. Calling previous() on each does not move them to where 1.895 + * the comparison fails. 1.896 + * 1.897 + * So the simplest semantics is to not define where the iterators end up. 1.898 + * 1.899 + * The following fragment is part of what would need to be done for backing up. 1.900 + */ 1.901 +void fragment { 1.902 + /* iff a surrogate is part of a surrogate pair, leave >=d800 */ 1.903 + if(c1<=0xdbff) { 1.904 + if(!U16_IS_TRAIL(iter1->current(iter1))) { 1.905 + /* lead surrogate code point - make <d800 */ 1.906 + c1-=0x2800; 1.907 + } 1.908 + } else if(c1<=0xdfff) { 1.909 + int32_t idx=iter1->getIndex(iter1, UITER_CURRENT); 1.910 + iter1->previous(iter1); /* ==c1 */ 1.911 + if(!U16_IS_LEAD(iter1->previous(iter1))) { 1.912 + /* trail surrogate code point - make <d800 */ 1.913 + c1-=0x2800; 1.914 + } 1.915 + /* go back to behind where the difference is */ 1.916 + iter1->move(iter1, idx, UITER_ZERO); 1.917 + } else /* 0xe000<=c1<=0xffff */ { 1.918 + /* BMP code point - make <d800 */ 1.919 + c1-=0x2800; 1.920 + } 1.921 +} 1.922 +#endif 1.923 + 1.924 +U_CAPI int32_t U_EXPORT2 1.925 +u_strCompare(const UChar *s1, int32_t length1, 1.926 + const UChar *s2, int32_t length2, 1.927 + UBool codePointOrder) { 1.928 + /* argument checking */ 1.929 + if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { 1.930 + return 0; 1.931 + } 1.932 + return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); 1.933 +} 1.934 + 1.935 +/* String compare in code point order - u_strcmp() compares in code unit order. */ 1.936 +U_CAPI int32_t U_EXPORT2 1.937 +u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { 1.938 + return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); 1.939 +} 1.940 + 1.941 +U_CAPI int32_t U_EXPORT2 1.942 +u_strncmp(const UChar *s1, 1.943 + const UChar *s2, 1.944 + int32_t n) 1.945 +{ 1.946 + if(n > 0) { 1.947 + int32_t rc; 1.948 + for(;;) { 1.949 + rc = (int32_t)*s1 - (int32_t)*s2; 1.950 + if(rc != 0 || *s1 == 0 || --n == 0) { 1.951 + return rc; 1.952 + } 1.953 + ++s1; 1.954 + ++s2; 1.955 + } 1.956 + } else { 1.957 + return 0; 1.958 + } 1.959 +} 1.960 + 1.961 +U_CAPI int32_t U_EXPORT2 1.962 +u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { 1.963 + return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); 1.964 +} 1.965 + 1.966 +U_CAPI UChar* U_EXPORT2 1.967 +u_strcpy(UChar *dst, 1.968 + const UChar *src) 1.969 +{ 1.970 + UChar *anchor = dst; /* save a pointer to start of dst */ 1.971 + 1.972 + while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ 1.973 + } 1.974 + 1.975 + return anchor; 1.976 +} 1.977 + 1.978 +U_CAPI UChar* U_EXPORT2 1.979 +u_strncpy(UChar *dst, 1.980 + const UChar *src, 1.981 + int32_t n) 1.982 +{ 1.983 + UChar *anchor = dst; /* save a pointer to start of dst */ 1.984 + 1.985 + /* copy string 2 over */ 1.986 + while(n > 0 && (*(dst++) = *(src++)) != 0) { 1.987 + --n; 1.988 + } 1.989 + 1.990 + return anchor; 1.991 +} 1.992 + 1.993 +U_CAPI int32_t U_EXPORT2 1.994 +u_strlen(const UChar *s) 1.995 +{ 1.996 +#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR 1.997 + return (int32_t)uprv_wcslen(s); 1.998 +#else 1.999 + const UChar *t = s; 1.1000 + while(*t != 0) { 1.1001 + ++t; 1.1002 + } 1.1003 + return t - s; 1.1004 +#endif 1.1005 +} 1.1006 + 1.1007 +U_CAPI int32_t U_EXPORT2 1.1008 +u_countChar32(const UChar *s, int32_t length) { 1.1009 + int32_t count; 1.1010 + 1.1011 + if(s==NULL || length<-1) { 1.1012 + return 0; 1.1013 + } 1.1014 + 1.1015 + count=0; 1.1016 + if(length>=0) { 1.1017 + while(length>0) { 1.1018 + ++count; 1.1019 + if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) { 1.1020 + s+=2; 1.1021 + length-=2; 1.1022 + } else { 1.1023 + ++s; 1.1024 + --length; 1.1025 + } 1.1026 + } 1.1027 + } else /* length==-1 */ { 1.1028 + UChar c; 1.1029 + 1.1030 + for(;;) { 1.1031 + if((c=*s++)==0) { 1.1032 + break; 1.1033 + } 1.1034 + ++count; 1.1035 + 1.1036 + /* 1.1037 + * sufficient to look ahead one because of UTF-16; 1.1038 + * safe to look ahead one because at worst that would be the terminating NUL 1.1039 + */ 1.1040 + if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { 1.1041 + ++s; 1.1042 + } 1.1043 + } 1.1044 + } 1.1045 + return count; 1.1046 +} 1.1047 + 1.1048 +U_CAPI UBool U_EXPORT2 1.1049 +u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { 1.1050 + 1.1051 + if(number<0) { 1.1052 + return TRUE; 1.1053 + } 1.1054 + if(s==NULL || length<-1) { 1.1055 + return FALSE; 1.1056 + } 1.1057 + 1.1058 + if(length==-1) { 1.1059 + /* s is NUL-terminated */ 1.1060 + UChar c; 1.1061 + 1.1062 + /* count code points until they exceed */ 1.1063 + for(;;) { 1.1064 + if((c=*s++)==0) { 1.1065 + return FALSE; 1.1066 + } 1.1067 + if(number==0) { 1.1068 + return TRUE; 1.1069 + } 1.1070 + if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { 1.1071 + ++s; 1.1072 + } 1.1073 + --number; 1.1074 + } 1.1075 + } else { 1.1076 + /* length>=0 known */ 1.1077 + const UChar *limit; 1.1078 + int32_t maxSupplementary; 1.1079 + 1.1080 + /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ 1.1081 + if(((length+1)/2)>number) { 1.1082 + return TRUE; 1.1083 + } 1.1084 + 1.1085 + /* check if s does not even contain enough UChars */ 1.1086 + maxSupplementary=length-number; 1.1087 + if(maxSupplementary<=0) { 1.1088 + return FALSE; 1.1089 + } 1.1090 + /* there are maxSupplementary=length-number more UChars than asked-for code points */ 1.1091 + 1.1092 + /* 1.1093 + * count code points until they exceed and also check that there are 1.1094 + * no more than maxSupplementary supplementary code points (UChar pairs) 1.1095 + */ 1.1096 + limit=s+length; 1.1097 + for(;;) { 1.1098 + if(s==limit) { 1.1099 + return FALSE; 1.1100 + } 1.1101 + if(number==0) { 1.1102 + return TRUE; 1.1103 + } 1.1104 + if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { 1.1105 + ++s; 1.1106 + if(--maxSupplementary<=0) { 1.1107 + /* too many pairs - too few code points */ 1.1108 + return FALSE; 1.1109 + } 1.1110 + } 1.1111 + --number; 1.1112 + } 1.1113 + } 1.1114 +} 1.1115 + 1.1116 +U_CAPI UChar * U_EXPORT2 1.1117 +u_memcpy(UChar *dest, const UChar *src, int32_t count) { 1.1118 + if(count > 0) { 1.1119 + uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); 1.1120 + } 1.1121 + return dest; 1.1122 +} 1.1123 + 1.1124 +U_CAPI UChar * U_EXPORT2 1.1125 +u_memmove(UChar *dest, const UChar *src, int32_t count) { 1.1126 + if(count > 0) { 1.1127 + uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); 1.1128 + } 1.1129 + return dest; 1.1130 +} 1.1131 + 1.1132 +U_CAPI UChar * U_EXPORT2 1.1133 +u_memset(UChar *dest, UChar c, int32_t count) { 1.1134 + if(count > 0) { 1.1135 + UChar *ptr = dest; 1.1136 + UChar *limit = dest + count; 1.1137 + 1.1138 + while (ptr < limit) { 1.1139 + *(ptr++) = c; 1.1140 + } 1.1141 + } 1.1142 + return dest; 1.1143 +} 1.1144 + 1.1145 +U_CAPI int32_t U_EXPORT2 1.1146 +u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { 1.1147 + if(count > 0) { 1.1148 + const UChar *limit = buf1 + count; 1.1149 + int32_t result; 1.1150 + 1.1151 + while (buf1 < limit) { 1.1152 + result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; 1.1153 + if (result != 0) { 1.1154 + return result; 1.1155 + } 1.1156 + buf1++; 1.1157 + buf2++; 1.1158 + } 1.1159 + } 1.1160 + return 0; 1.1161 +} 1.1162 + 1.1163 +U_CAPI int32_t U_EXPORT2 1.1164 +u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { 1.1165 + return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); 1.1166 +} 1.1167 + 1.1168 +/* u_unescape & support fns ------------------------------------------------- */ 1.1169 + 1.1170 +/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 1.1171 +static const UChar UNESCAPE_MAP[] = { 1.1172 + /*" 0x22, 0x22 */ 1.1173 + /*' 0x27, 0x27 */ 1.1174 + /*? 0x3F, 0x3F */ 1.1175 + /*\ 0x5C, 0x5C */ 1.1176 + /*a*/ 0x61, 0x07, 1.1177 + /*b*/ 0x62, 0x08, 1.1178 + /*e*/ 0x65, 0x1b, 1.1179 + /*f*/ 0x66, 0x0c, 1.1180 + /*n*/ 0x6E, 0x0a, 1.1181 + /*r*/ 0x72, 0x0d, 1.1182 + /*t*/ 0x74, 0x09, 1.1183 + /*v*/ 0x76, 0x0b 1.1184 +}; 1.1185 +enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; 1.1186 + 1.1187 +/* Convert one octal digit to a numeric value 0..7, or -1 on failure */ 1.1188 +static int8_t _digit8(UChar c) { 1.1189 + if (c >= 0x0030 && c <= 0x0037) { 1.1190 + return (int8_t)(c - 0x0030); 1.1191 + } 1.1192 + return -1; 1.1193 +} 1.1194 + 1.1195 +/* Convert one hex digit to a numeric value 0..F, or -1 on failure */ 1.1196 +static int8_t _digit16(UChar c) { 1.1197 + if (c >= 0x0030 && c <= 0x0039) { 1.1198 + return (int8_t)(c - 0x0030); 1.1199 + } 1.1200 + if (c >= 0x0041 && c <= 0x0046) { 1.1201 + return (int8_t)(c - (0x0041 - 10)); 1.1202 + } 1.1203 + if (c >= 0x0061 && c <= 0x0066) { 1.1204 + return (int8_t)(c - (0x0061 - 10)); 1.1205 + } 1.1206 + return -1; 1.1207 +} 1.1208 + 1.1209 +/* Parse a single escape sequence. Although this method deals in 1.1210 + * UChars, it does not use C++ or UnicodeString. This allows it to 1.1211 + * be used from C contexts. */ 1.1212 +U_CAPI UChar32 U_EXPORT2 1.1213 +u_unescapeAt(UNESCAPE_CHAR_AT charAt, 1.1214 + int32_t *offset, 1.1215 + int32_t length, 1.1216 + void *context) { 1.1217 + 1.1218 + int32_t start = *offset; 1.1219 + UChar c; 1.1220 + UChar32 result = 0; 1.1221 + int8_t n = 0; 1.1222 + int8_t minDig = 0; 1.1223 + int8_t maxDig = 0; 1.1224 + int8_t bitsPerDigit = 4; 1.1225 + int8_t dig; 1.1226 + int32_t i; 1.1227 + UBool braces = FALSE; 1.1228 + 1.1229 + /* Check that offset is in range */ 1.1230 + if (*offset < 0 || *offset >= length) { 1.1231 + goto err; 1.1232 + } 1.1233 + 1.1234 + /* Fetch first UChar after '\\' */ 1.1235 + c = charAt((*offset)++, context); 1.1236 + 1.1237 + /* Convert hexadecimal and octal escapes */ 1.1238 + switch (c) { 1.1239 + case 0x0075 /*'u'*/: 1.1240 + minDig = maxDig = 4; 1.1241 + break; 1.1242 + case 0x0055 /*'U'*/: 1.1243 + minDig = maxDig = 8; 1.1244 + break; 1.1245 + case 0x0078 /*'x'*/: 1.1246 + minDig = 1; 1.1247 + if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { 1.1248 + ++(*offset); 1.1249 + braces = TRUE; 1.1250 + maxDig = 8; 1.1251 + } else { 1.1252 + maxDig = 2; 1.1253 + } 1.1254 + break; 1.1255 + default: 1.1256 + dig = _digit8(c); 1.1257 + if (dig >= 0) { 1.1258 + minDig = 1; 1.1259 + maxDig = 3; 1.1260 + n = 1; /* Already have first octal digit */ 1.1261 + bitsPerDigit = 3; 1.1262 + result = dig; 1.1263 + } 1.1264 + break; 1.1265 + } 1.1266 + if (minDig != 0) { 1.1267 + while (*offset < length && n < maxDig) { 1.1268 + c = charAt(*offset, context); 1.1269 + dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); 1.1270 + if (dig < 0) { 1.1271 + break; 1.1272 + } 1.1273 + result = (result << bitsPerDigit) | dig; 1.1274 + ++(*offset); 1.1275 + ++n; 1.1276 + } 1.1277 + if (n < minDig) { 1.1278 + goto err; 1.1279 + } 1.1280 + if (braces) { 1.1281 + if (c != 0x7D /*}*/) { 1.1282 + goto err; 1.1283 + } 1.1284 + ++(*offset); 1.1285 + } 1.1286 + if (result < 0 || result >= 0x110000) { 1.1287 + goto err; 1.1288 + } 1.1289 + /* If an escape sequence specifies a lead surrogate, see if 1.1290 + * there is a trail surrogate after it, either as an escape or 1.1291 + * as a literal. If so, join them up into a supplementary. 1.1292 + */ 1.1293 + if (*offset < length && U16_IS_LEAD(result)) { 1.1294 + int32_t ahead = *offset + 1; 1.1295 + c = charAt(*offset, context); 1.1296 + if (c == 0x5C /*'\\'*/ && ahead < length) { 1.1297 + c = (UChar) u_unescapeAt(charAt, &ahead, length, context); 1.1298 + } 1.1299 + if (U16_IS_TRAIL(c)) { 1.1300 + *offset = ahead; 1.1301 + result = U16_GET_SUPPLEMENTARY(result, c); 1.1302 + } 1.1303 + } 1.1304 + return result; 1.1305 + } 1.1306 + 1.1307 + /* Convert C-style escapes in table */ 1.1308 + for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { 1.1309 + if (c == UNESCAPE_MAP[i]) { 1.1310 + return UNESCAPE_MAP[i+1]; 1.1311 + } else if (c < UNESCAPE_MAP[i]) { 1.1312 + break; 1.1313 + } 1.1314 + } 1.1315 + 1.1316 + /* Map \cX to control-X: X & 0x1F */ 1.1317 + if (c == 0x0063 /*'c'*/ && *offset < length) { 1.1318 + c = charAt((*offset)++, context); 1.1319 + if (U16_IS_LEAD(c) && *offset < length) { 1.1320 + UChar c2 = charAt(*offset, context); 1.1321 + if (U16_IS_TRAIL(c2)) { 1.1322 + ++(*offset); 1.1323 + c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */ 1.1324 + } 1.1325 + } 1.1326 + return 0x1F & c; 1.1327 + } 1.1328 + 1.1329 + /* If no special forms are recognized, then consider 1.1330 + * the backslash to generically escape the next character. 1.1331 + * Deal with surrogate pairs. */ 1.1332 + if (U16_IS_LEAD(c) && *offset < length) { 1.1333 + UChar c2 = charAt(*offset, context); 1.1334 + if (U16_IS_TRAIL(c2)) { 1.1335 + ++(*offset); 1.1336 + return U16_GET_SUPPLEMENTARY(c, c2); 1.1337 + } 1.1338 + } 1.1339 + return c; 1.1340 + 1.1341 + err: 1.1342 + /* Invalid escape sequence */ 1.1343 + *offset = start; /* Reset to initial value */ 1.1344 + return (UChar32)0xFFFFFFFF; 1.1345 +} 1.1346 + 1.1347 +/* u_unescapeAt() callback to return a UChar from a char* */ 1.1348 +static UChar U_CALLCONV 1.1349 +_charPtr_charAt(int32_t offset, void *context) { 1.1350 + UChar c16; 1.1351 + /* It would be more efficient to access the invariant tables 1.1352 + * directly but there is no API for that. */ 1.1353 + u_charsToUChars(((char*) context) + offset, &c16, 1); 1.1354 + return c16; 1.1355 +} 1.1356 + 1.1357 +/* Append an escape-free segment of the text; used by u_unescape() */ 1.1358 +static void _appendUChars(UChar *dest, int32_t destCapacity, 1.1359 + const char *src, int32_t srcLen) { 1.1360 + if (destCapacity < 0) { 1.1361 + destCapacity = 0; 1.1362 + } 1.1363 + if (srcLen > destCapacity) { 1.1364 + srcLen = destCapacity; 1.1365 + } 1.1366 + u_charsToUChars(src, dest, srcLen); 1.1367 +} 1.1368 + 1.1369 +/* Do an invariant conversion of char* -> UChar*, with escape parsing */ 1.1370 +U_CAPI int32_t U_EXPORT2 1.1371 +u_unescape(const char *src, UChar *dest, int32_t destCapacity) { 1.1372 + const char *segment = src; 1.1373 + int32_t i = 0; 1.1374 + char c; 1.1375 + 1.1376 + while ((c=*src) != 0) { 1.1377 + /* '\\' intentionally written as compiler-specific 1.1378 + * character constant to correspond to compiler-specific 1.1379 + * char* constants. */ 1.1380 + if (c == '\\') { 1.1381 + int32_t lenParsed = 0; 1.1382 + UChar32 c32; 1.1383 + if (src != segment) { 1.1384 + if (dest != NULL) { 1.1385 + _appendUChars(dest + i, destCapacity - i, 1.1386 + segment, (int32_t)(src - segment)); 1.1387 + } 1.1388 + i += (int32_t)(src - segment); 1.1389 + } 1.1390 + ++src; /* advance past '\\' */ 1.1391 + c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); 1.1392 + if (lenParsed == 0) { 1.1393 + goto err; 1.1394 + } 1.1395 + src += lenParsed; /* advance past escape seq. */ 1.1396 + if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) { 1.1397 + U16_APPEND_UNSAFE(dest, i, c32); 1.1398 + } else { 1.1399 + i += U16_LENGTH(c32); 1.1400 + } 1.1401 + segment = src; 1.1402 + } else { 1.1403 + ++src; 1.1404 + } 1.1405 + } 1.1406 + if (src != segment) { 1.1407 + if (dest != NULL) { 1.1408 + _appendUChars(dest + i, destCapacity - i, 1.1409 + segment, (int32_t)(src - segment)); 1.1410 + } 1.1411 + i += (int32_t)(src - segment); 1.1412 + } 1.1413 + if (dest != NULL && i < destCapacity) { 1.1414 + dest[i] = 0; 1.1415 + } 1.1416 + return i; 1.1417 + 1.1418 + err: 1.1419 + if (dest != NULL && destCapacity > 0) { 1.1420 + *dest = 0; 1.1421 + } 1.1422 + return 0; 1.1423 +} 1.1424 + 1.1425 +/* NUL-termination of strings ----------------------------------------------- */ 1.1426 + 1.1427 +/** 1.1428 + * NUL-terminate a string no matter what its type. 1.1429 + * Set warning and error codes accordingly. 1.1430 + */ 1.1431 +#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \ 1.1432 + if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \ 1.1433 + /* not a public function, so no complete argument checking */ \ 1.1434 + \ 1.1435 + if(length<0) { \ 1.1436 + /* assume that the caller handles this */ \ 1.1437 + } else if(length<destCapacity) { \ 1.1438 + /* NUL-terminate the string, the NUL fits */ \ 1.1439 + dest[length]=0; \ 1.1440 + /* unset the not-terminated warning but leave all others */ \ 1.1441 + if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \ 1.1442 + *pErrorCode=U_ZERO_ERROR; \ 1.1443 + } \ 1.1444 + } else if(length==destCapacity) { \ 1.1445 + /* unable to NUL-terminate, but the string itself fit - set a warning code */ \ 1.1446 + *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \ 1.1447 + } else /* length>destCapacity */ { \ 1.1448 + /* even the string itself did not fit - set an error code */ \ 1.1449 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \ 1.1450 + } \ 1.1451 + } 1.1452 + 1.1453 +U_CAPI int32_t U_EXPORT2 1.1454 +u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1.1455 + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1.1456 + return length; 1.1457 +} 1.1458 + 1.1459 +U_CAPI int32_t U_EXPORT2 1.1460 +u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1.1461 + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1.1462 + return length; 1.1463 +} 1.1464 + 1.1465 +U_CAPI int32_t U_EXPORT2 1.1466 +u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1.1467 + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1.1468 + return length; 1.1469 +} 1.1470 + 1.1471 +U_CAPI int32_t U_EXPORT2 1.1472 +u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1.1473 + __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1.1474 + return length; 1.1475 +} 1.1476 + 1.1477 +// Compute the hash code for a string -------------------------------------- *** 1.1478 + 1.1479 +// Moved here from uhash.c so that UnicodeString::hashCode() does not depend 1.1480 +// on UHashtable code. 1.1481 + 1.1482 +/* 1.1483 + Compute the hash by iterating sparsely over about 32 (up to 63) 1.1484 + characters spaced evenly through the string. For each character, 1.1485 + multiply the previous hash value by a prime number and add the new 1.1486 + character in, like a linear congruential random number generator, 1.1487 + producing a pseudorandom deterministic value well distributed over 1.1488 + the output range. [LIU] 1.1489 +*/ 1.1490 + 1.1491 +#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \ 1.1492 + int32_t hash = 0; \ 1.1493 + const TYPE *p = (const TYPE*) STR; \ 1.1494 + if (p != NULL) { \ 1.1495 + int32_t len = (int32_t)(STRLEN); \ 1.1496 + int32_t inc = ((len - 32) / 32) + 1; \ 1.1497 + const TYPE *limit = p + len; \ 1.1498 + while (p<limit) { \ 1.1499 + hash = (hash * 37) + DEREF; \ 1.1500 + p += inc; \ 1.1501 + } \ 1.1502 + } \ 1.1503 + return hash 1.1504 + 1.1505 +/* Used by UnicodeString to compute its hashcode - Not public API. */ 1.1506 +U_CAPI int32_t U_EXPORT2 1.1507 +ustr_hashUCharsN(const UChar *str, int32_t length) { 1.1508 + STRING_HASH(UChar, str, length, *p); 1.1509 +} 1.1510 + 1.1511 +U_CAPI int32_t U_EXPORT2 1.1512 +ustr_hashCharsN(const char *str, int32_t length) { 1.1513 + STRING_HASH(uint8_t, str, length, *p); 1.1514 +} 1.1515 + 1.1516 +U_CAPI int32_t U_EXPORT2 1.1517 +ustr_hashICharsN(const char *str, int32_t length) { 1.1518 + STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p)); 1.1519 +}