The Tor Browser: diff intl/icu/source/common/ustring.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ustring.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1516 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 1998-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*
    1.12 +* File ustring.cpp
    1.13 +*
    1.14 +* Modification History:
    1.15 +*
    1.16 +*   Date        Name        Description
    1.17 +*   12/07/98    bertrand    Creation.
    1.18 +******************************************************************************
    1.19 +*/
    1.20 +
    1.21 +#include "unicode/utypes.h"
    1.22 +#include "unicode/putil.h"
    1.23 +#include "unicode/ustring.h"
    1.24 +#include "unicode/utf16.h"
    1.25 +#include "cstring.h"
    1.26 +#include "cwchar.h"
    1.27 +#include "cmemory.h"
    1.28 +#include "ustr_imp.h"
    1.29 +
    1.30 +/* ANSI string.h - style functions ------------------------------------------ */
    1.31 +
    1.32 +/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
    1.33 +#define U_BMP_MAX 0xffff
    1.34 +
    1.35 +/* Forward binary string search functions ----------------------------------- */
    1.36 +
    1.37 +/*
    1.38 + * Test if a substring match inside a string is at code point boundaries.
    1.39 + * All pointers refer to the same buffer.
    1.40 + * The limit pointer may be NULL, all others must be real pointers.
    1.41 + */
    1.42 +static inline UBool
    1.43 +isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
    1.44 +    if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
    1.45 +        /* the leading edge of the match is in the middle of a surrogate pair */
    1.46 +        return FALSE;
    1.47 +    }
    1.48 +    if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
    1.49 +        /* the trailing edge of the match is in the middle of a surrogate pair */
    1.50 +        return FALSE;
    1.51 +    }
    1.52 +    return TRUE;
    1.53 +}
    1.54 +
          /*
           * Find the first occurrence of the substring sub in the string s.
           *
           *   s, length      - string to search; length==-1 means NUL-terminated
           *   sub, subLength - substring to find; subLength==-1 means NUL-terminated
           *
           * Returns:
           *   - s itself if sub is NULL, subLength is less than -1, or sub is empty
           *   - NULL if s is NULL or length is less than -1 (checked after the sub checks)
           *   - a pointer to the first match otherwise, or NULL if there is none
           *
           * A candidate match is rejected when isMatchAtCPBoundary() reports that one
           * of its edges splits a surrogate pair; the search then continues.
           * A substring that is a single non-surrogate code unit is delegated to
           * u_strchr() / u_memchr().
           */
    1.55 +U_CAPI UChar * U_EXPORT2
    1.56 +u_strFindFirst(const UChar *s, int32_t length,
    1.57 +               const UChar *sub, int32_t subLength) {
    1.58 +    const UChar *start, *p, *q, *subLimit;
    1.59 +    UChar c, cs, cq;
    1.60 +
    1.61 +    if(sub==NULL || subLength<-1) { /* unusable substring arguments: hand back s unchanged */
    1.62 +        return (UChar *)s;
    1.63 +    }
    1.64 +    if(s==NULL || length<-1) {
    1.65 +        return NULL;
    1.66 +    }
    1.67 +
    1.68 +    start=s; /* remembered for the leading-edge surrogate check */
    1.69 +
    1.70 +    if(length<0 && subLength<0) {
    1.71 +        /* both strings are NUL-terminated */
    1.72 +        if((cs=*sub++)==0) { /* empty substring; from here on sub points at sub[1] */
    1.73 +            return (UChar *)s;
    1.74 +        }
    1.75 +        if(*sub==0 && !U16_IS_SURROGATE(cs)) {
    1.76 +            /* the substring consists of a single, non-surrogate BMP code point */
    1.77 +            return u_strchr(s, cs);
    1.78 +        }
    1.79 +
    1.80 +        while((c=*s++)!=0) {
    1.81 +            if(c==cs) {
    1.82 +                /* found first substring UChar, compare rest */
    1.83 +                p=s;
    1.84 +                q=sub;
    1.85 +                for(;;) {
    1.86 +                    if((cq=*q)==0) { /* reached the end of sub: all units matched */
    1.87 +                        if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
    1.88 +                            return (UChar *)(s-1); /* well-formed match */
    1.89 +                        } else {
    1.90 +                            break; /* no match because surrogate pair is split */
    1.91 +                        }
    1.92 +                    }
    1.93 +                    if((c=*p)==0) {
    1.94 +                        return NULL; /* no match, and none possible after s */
    1.95 +                    }
    1.96 +                    if(c!=cq) {
    1.97 +                        break; /* no match */
    1.98 +                    }
    1.99 +                    ++p;
   1.100 +                    ++q;
   1.101 +                }
   1.102 +            }
   1.103 +        }
   1.104 +
   1.105 +        /* not found */
   1.106 +        return NULL;
   1.107 +    }
   1.108 +
   1.109 +    if(subLength<0) {
   1.110 +        subLength=u_strlen(sub);
   1.111 +    }
   1.112 +    if(subLength==0) {
   1.113 +        return (UChar *)s;
   1.114 +    }
   1.115 +
   1.116 +    /* get sub[0] to search for it fast */
   1.117 +    cs=*sub++;
   1.118 +    --subLength; /* subLength now counts the units after sub[0] */
   1.119 +    subLimit=sub+subLength;
   1.120 +
   1.121 +    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
   1.122 +        /* the substring consists of a single, non-surrogate BMP code point */
   1.123 +        return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
   1.124 +    }
   1.125 +
   1.126 +    if(length<0) {
   1.127 +        /* s is NUL-terminated */
   1.128 +        while((c=*s++)!=0) {
   1.129 +            if(c==cs) {
   1.130 +                /* found first substring UChar, compare rest */
   1.131 +                p=s;
   1.132 +                q=sub;
   1.133 +                for(;;) {
   1.134 +                    if(q==subLimit) {
   1.135 +                        if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
   1.136 +                            return (UChar *)(s-1); /* well-formed match */
   1.137 +                        } else {
   1.138 +                            break; /* no match because surrogate pair is split */
   1.139 +                        }
   1.140 +                    }
   1.141 +                    if((c=*p)==0) {
   1.142 +                        return NULL; /* no match, and none possible after s */
   1.143 +                    }
   1.144 +                    if(c!=*q) {
   1.145 +                        break; /* no match */
   1.146 +                    }
   1.147 +                    ++p;
   1.148 +                    ++q;
   1.149 +                }
   1.150 +            }
   1.151 +        }
   1.152 +    } else {
   1.153 +        const UChar *limit, *preLimit;
   1.154 +
   1.155 +        /* subLength was decremented above */
   1.156 +        if(length<=subLength) {
   1.157 +            return NULL; /* s is shorter than sub */
   1.158 +        }
   1.159 +
   1.160 +        limit=s+length;
   1.161 +
   1.162 +        /* the substring must start before preLimit */
   1.163 +        preLimit=limit-subLength;
   1.164 +
   1.165 +        while(s!=preLimit) {
   1.166 +            c=*s++;
   1.167 +            if(c==cs) {
   1.168 +                /* found first substring UChar, compare rest */
   1.169 +                p=s;
   1.170 +                q=sub;
   1.171 +                for(;;) { /* p cannot pass limit: the match started before preLimit */
   1.172 +                    if(q==subLimit) {
   1.173 +                        if(isMatchAtCPBoundary(start, s-1, p, limit)) {
   1.174 +                            return (UChar *)(s-1); /* well-formed match */
   1.175 +                        } else {
   1.176 +                            break; /* no match because surrogate pair is split */
   1.177 +                        }
   1.178 +                    }
   1.179 +                    if(*p!=*q) {
   1.180 +                        break; /* no match */
   1.181 +                    }
   1.182 +                    ++p;
   1.183 +                    ++q;
   1.184 +                }
   1.185 +            }
   1.186 +        }
   1.187 +    }
   1.188 +
   1.189 +    /* not found */
   1.190 +    return NULL;
   1.191 +}
   1.192 +
   1.193 +U_CAPI UChar * U_EXPORT2
   1.194 +u_strstr(const UChar *s, const UChar *substring) {
   1.195 +    return u_strFindFirst(s, -1, substring, -1);
   1.196 +}
   1.197 +
   1.198 +U_CAPI UChar * U_EXPORT2
   1.199 +u_strchr(const UChar *s, UChar c) {
   1.200 +    if(U16_IS_SURROGATE(c)) {
   1.201 +        /* make sure to not find half of a surrogate pair */
   1.202 +        return u_strFindFirst(s, -1, &c, 1);
   1.203 +    } else {
   1.204 +        UChar cs;
   1.205 +
   1.206 +        /* trivial search for a BMP code point */
   1.207 +        for(;;) {
   1.208 +            if((cs=*s)==c) {
   1.209 +                return (UChar *)s;
   1.210 +            }
   1.211 +            if(cs==0) {
   1.212 +                return NULL;
   1.213 +            }
   1.214 +            ++s;
   1.215 +        }
   1.216 +    }
   1.217 +}
   1.218 +
   1.219 +U_CAPI UChar * U_EXPORT2
   1.220 +u_strchr32(const UChar *s, UChar32 c) {
   1.221 +    if((uint32_t)c<=U_BMP_MAX) {
   1.222 +        /* find BMP code point */
   1.223 +        return u_strchr(s, (UChar)c);
   1.224 +    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
   1.225 +        /* find supplementary code point as surrogate pair */
   1.226 +        UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
   1.227 +
   1.228 +        while((cs=*s++)!=0) {
   1.229 +            if(cs==lead && *s==trail) {
   1.230 +                return (UChar *)(s-1);
   1.231 +            }
   1.232 +        }
   1.233 +        return NULL;
   1.234 +    } else {
   1.235 +        /* not a Unicode code point, not findable */
   1.236 +        return NULL;
   1.237 +    }
   1.238 +}
   1.239 +
   1.240 +U_CAPI UChar * U_EXPORT2
   1.241 +u_memchr(const UChar *s, UChar c, int32_t count) {
   1.242 +    if(count<=0) {
   1.243 +        return NULL; /* no string */
   1.244 +    } else if(U16_IS_SURROGATE(c)) {
   1.245 +        /* make sure to not find half of a surrogate pair */
   1.246 +        return u_strFindFirst(s, count, &c, 1);
   1.247 +    } else {
   1.248 +        /* trivial search for a BMP code point */
   1.249 +        const UChar *limit=s+count;
   1.250 +        do {
   1.251 +            if(*s==c) {
   1.252 +                return (UChar *)s;
   1.253 +            }
   1.254 +        } while(++s!=limit);
   1.255 +        return NULL;
   1.256 +    }
   1.257 +}
   1.258 +
   1.259 +U_CAPI UChar * U_EXPORT2
   1.260 +u_memchr32(const UChar *s, UChar32 c, int32_t count) {
   1.261 +    if((uint32_t)c<=U_BMP_MAX) {
   1.262 +        /* find BMP code point */
   1.263 +        return u_memchr(s, (UChar)c, count);
   1.264 +    } else if(count<2) {
   1.265 +        /* too short for a surrogate pair */
   1.266 +        return NULL;
   1.267 +    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
   1.268 +        /* find supplementary code point as surrogate pair */
   1.269 +        const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
   1.270 +        UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
   1.271 +
   1.272 +        do {
   1.273 +            if(*s==lead && *(s+1)==trail) {
   1.274 +                return (UChar *)s;
   1.275 +            }
   1.276 +        } while(++s!=limit);
   1.277 +        return NULL;
   1.278 +    } else {
   1.279 +        /* not a Unicode code point, not findable */
   1.280 +        return NULL;
   1.281 +    }
   1.282 +}
   1.283 +
   1.284 +/* Backward binary string search functions ---------------------------------- */
   1.285 +
          /*
           * Find the last occurrence of the substring sub in the string s.
           *
           *   s, length      - string to search; length==-1 means NUL-terminated
           *   sub, subLength - substring to find; subLength==-1 means NUL-terminated
           *
           * Returns:
           *   - s itself if sub is NULL, subLength is less than -1, or sub is empty
           *   - NULL if s is NULL or length is less than -1 (checked after the sub checks)
           *   - a pointer to the start of the last match otherwise, or NULL if none
           *
           * A candidate match is rejected when isMatchAtCPBoundary() reports that one
           * of its edges splits a surrogate pair; the search then continues backward.
           * A substring that is a single non-surrogate code unit is delegated to
           * u_strrchr() / u_memrchr().
           */
   1.286 +U_CAPI UChar * U_EXPORT2
   1.287 +u_strFindLast(const UChar *s, int32_t length,
   1.288 +              const UChar *sub, int32_t subLength) {
   1.289 +    const UChar *start, *limit, *p, *q, *subLimit;
   1.290 +    UChar c, cs;
   1.291 +
   1.292 +    if(sub==NULL || subLength<-1) { /* unusable substring arguments: hand back s unchanged */
   1.293 +        return (UChar *)s;
   1.294 +    }
   1.295 +    if(s==NULL || length<-1) {
   1.296 +        return NULL;
   1.297 +    }
   1.298 +
   1.299 +    /*
   1.300 +     * This implementation is more lazy than the one for u_strFindFirst():
   1.301 +     * There is no special search code for NUL-terminated strings.
   1.302 +     * It does not seem to be worth it for searching substrings to
   1.303 +     * search forward and find all matches like in u_strrchr() and similar.
   1.304 +     * Therefore, we simply get both string lengths and search backward.
   1.305 +     *
   1.306 +     * markus 2002oct23
   1.307 +     */
   1.308 +
   1.309 +    if(subLength<0) {
   1.310 +        subLength=u_strlen(sub);
   1.311 +    }
   1.312 +    if(subLength==0) {
   1.313 +        return (UChar *)s;
   1.314 +    }
   1.315 +
   1.316 +    /* get sub[subLength-1] to search for it fast */
   1.317 +    subLimit=sub+subLength;
   1.318 +    cs=*(--subLimit); /* subLimit now points at the last unit of sub */
   1.319 +    --subLength; /* subLength now counts the units before that last unit */
   1.320 +
   1.321 +    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
   1.322 +        /* the substring consists of a single, non-surrogate BMP code point */
   1.323 +        return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
   1.324 +    }
   1.325 +
   1.326 +    if(length<0) {
   1.327 +        length=u_strlen(s);
   1.328 +    }
   1.329 +
   1.330 +    /* subLength was decremented above */
   1.331 +    if(length<=subLength) {
   1.332 +        return NULL; /* s is shorter than sub */
   1.333 +    }
   1.334 +
   1.335 +    start=s;
   1.336 +    limit=s+length;
   1.337 +
   1.338 +    /* the last unit of a match cannot lie before s+subLength, so stop scanning there */
   1.339 +    s+=subLength;
   1.340 +
   1.341 +    while(s!=limit) {
   1.342 +        c=*(--limit); /* limit walks backward over candidate positions for the last unit */
   1.343 +        if(c==cs) {
   1.344 +            /* found last substring UChar, compare rest */
   1.345 +            p=limit;
   1.346 +            q=subLimit;
   1.347 +            for(;;) {
   1.348 +                if(q==sub) { /* all of sub matched; p is the start of the match */
   1.349 +                    if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
   1.350 +                        return (UChar *)p; /* well-formed match */
   1.351 +                    } else {
   1.352 +                        break; /* no match because surrogate pair is split */
   1.353 +                    }
   1.354 +                }
   1.355 +                if(*(--p)!=*(--q)) {
   1.356 +                    break; /* no match */
   1.357 +                }
   1.358 +            }
   1.359 +        }
   1.360 +    }
   1.361 +
   1.362 +    /* not found */
   1.363 +    return NULL;
   1.364 +}
   1.365 +
   1.366 +U_CAPI UChar * U_EXPORT2
   1.367 +u_strrstr(const UChar *s, const UChar *substring) {
   1.368 +    return u_strFindLast(s, -1, substring, -1);
   1.369 +}
   1.370 +
   1.371 +U_CAPI UChar * U_EXPORT2
   1.372 +u_strrchr(const UChar *s, UChar c) {
   1.373 +    if(U16_IS_SURROGATE(c)) {
   1.374 +        /* make sure to not find half of a surrogate pair */
   1.375 +        return u_strFindLast(s, -1, &c, 1);
   1.376 +    } else {
   1.377 +        const UChar *result=NULL;
   1.378 +        UChar cs;
   1.379 +
   1.380 +        /* trivial search for a BMP code point */
   1.381 +        for(;;) {
   1.382 +            if((cs=*s)==c) {
   1.383 +                result=s;
   1.384 +            }
   1.385 +            if(cs==0) {
   1.386 +                return (UChar *)result;
   1.387 +            }
   1.388 +            ++s;
   1.389 +        }
   1.390 +    }
   1.391 +}
   1.392 +
   1.393 +U_CAPI UChar * U_EXPORT2
   1.394 +u_strrchr32(const UChar *s, UChar32 c) {
   1.395 +    if((uint32_t)c<=U_BMP_MAX) {
   1.396 +        /* find BMP code point */
   1.397 +        return u_strrchr(s, (UChar)c);
   1.398 +    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
   1.399 +        /* find supplementary code point as surrogate pair */
   1.400 +        const UChar *result=NULL;
   1.401 +        UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
   1.402 +
   1.403 +        while((cs=*s++)!=0) {
   1.404 +            if(cs==lead && *s==trail) {
   1.405 +                result=s-1;
   1.406 +            }
   1.407 +        }
   1.408 +        return (UChar *)result;
   1.409 +    } else {
   1.410 +        /* not a Unicode code point, not findable */
   1.411 +        return NULL;
   1.412 +    }
   1.413 +}
   1.414 +
   1.415 +U_CAPI UChar * U_EXPORT2
   1.416 +u_memrchr(const UChar *s, UChar c, int32_t count) {
   1.417 +    if(count<=0) {
   1.418 +        return NULL; /* no string */
   1.419 +    } else if(U16_IS_SURROGATE(c)) {
   1.420 +        /* make sure to not find half of a surrogate pair */
   1.421 +        return u_strFindLast(s, count, &c, 1);
   1.422 +    } else {
   1.423 +        /* trivial search for a BMP code point */
   1.424 +        const UChar *limit=s+count;
   1.425 +        do {
   1.426 +            if(*(--limit)==c) {
   1.427 +                return (UChar *)limit;
   1.428 +            }
   1.429 +        } while(s!=limit);
   1.430 +        return NULL;
   1.431 +    }
   1.432 +}
   1.433 +
   1.434 +U_CAPI UChar * U_EXPORT2
   1.435 +u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
   1.436 +    if((uint32_t)c<=U_BMP_MAX) {
   1.437 +        /* find BMP code point */
   1.438 +        return u_memrchr(s, (UChar)c, count);
   1.439 +    } else if(count<2) {
   1.440 +        /* too short for a surrogate pair */
   1.441 +        return NULL;
   1.442 +    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
   1.443 +        /* find supplementary code point as surrogate pair */
   1.444 +        const UChar *limit=s+count-1;
   1.445 +        UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
   1.446 +
   1.447 +        do {
   1.448 +            if(*limit==trail && *(limit-1)==lead) {
   1.449 +                return (UChar *)(limit-1);
   1.450 +            }
   1.451 +        } while(s!=--limit);
   1.452 +        return NULL;
   1.453 +    } else {
   1.454 +        /* not a Unicode code point, not findable */
   1.455 +        return NULL;
   1.456 +    }
   1.457 +}
   1.458 +
   1.459 +/* Tokenization functions --------------------------------------------------- */
   1.460 +
   1.461 +/*
   1.462 + * Match each code point in a string against each code point in the matchSet.
   1.463 + * Return the index of the first string code point that
   1.464 + * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
   1.465 + * Return -(string length)-1 if there is no such code point.
          *
          * Both string and matchSet are read up to their terminating NUL.
          * The returned index is a code unit index into string; for a
          * supplementary code point it is the index of the lead surrogate.
          *
          * matchSet is treated as two parts: a leading run of non-surrogate code
          * units (length matchBMPLen), followed by the remainder which may contain
          * surrogates. Surrogate code units and supplementary code points from
          * string are only compared against that remainder.
   1.466 + */
   1.467 +static int32_t
   1.468 +_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
   1.469 +    int32_t matchLen, matchBMPLen, strItr, matchItr;
   1.470 +    UChar32 stringCh, matchCh;
   1.471 +    UChar c, c2;
   1.472 +
   1.473 +    /* first part of matchSet contains only BMP code points */
   1.474 +    matchBMPLen = 0;
   1.475 +    while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
   1.476 +        ++matchBMPLen;
   1.477 +    }
   1.478 +
   1.479 +    /* second part of matchSet contains BMP and supplementary code points */
   1.480 +    matchLen = matchBMPLen;
   1.481 +    while(matchSet[matchLen] != 0) {
   1.482 +        ++matchLen;
   1.483 +    }
   1.484 +
   1.485 +    for(strItr = 0; (c = string[strItr]) != 0;) {
   1.486 +        ++strItr; /* strItr now indexes the unit after c */
   1.487 +        if(U16_IS_SINGLE(c)) {
   1.488 +            /* c is not a surrogate: compare it unit-by-unit against all of matchSet */
   1.489 +            if(polarity) {
   1.490 +                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
   1.491 +                    if(c == matchSet[matchItr]) {
   1.492 +                        return strItr - 1; /* one matches */
   1.493 +                    }
   1.494 +                }
   1.495 +            } else {
   1.496 +                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
   1.497 +                    if(c == matchSet[matchItr]) {
   1.498 +                        goto endloop;
   1.499 +                    }
   1.500 +                }
   1.501 +                return strItr - 1; /* none matches */
   1.502 +            }
   1.503 +        } else {
   1.504 +            /*
   1.505 +             * No need to check for string length before U16_IS_TRAIL
   1.506 +             * because c2 could at worst be the terminating NUL.
   1.507 +             */
   1.508 +            if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
   1.509 +                ++strItr;
   1.510 +                stringCh = U16_GET_SUPPLEMENTARY(c, c2);
   1.511 +            } else {
   1.512 +                stringCh = c; /* unpaired surrogate (a lone lead or a lone trail) */
   1.513 +            }
   1.514 +
   1.515 +            if(polarity) {
   1.516 +                for(matchItr = matchBMPLen; matchItr < matchLen;) {
   1.517 +                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
   1.518 +                    if(stringCh == matchCh) {
   1.519 +                        return strItr - U16_LENGTH(stringCh); /* one matches */
   1.520 +                    }
   1.521 +                }
   1.522 +            } else {
   1.523 +                for(matchItr = matchBMPLen; matchItr < matchLen;) {
   1.524 +                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
   1.525 +                    if(stringCh == matchCh) {
   1.526 +                        goto endloop;
   1.527 +                    }
   1.528 +                }
   1.529 +                return strItr - U16_LENGTH(stringCh); /* none matches */
   1.530 +            }
   1.531 +        }
   1.532 +endloop:
   1.533 +        /* wish C had continue with labels like Java... */;
   1.534 +    }
   1.535 +
   1.536 +    /* Didn't find it. */
   1.537 +    return -strItr-1; /* strItr equals the string length here */
   1.538 +}
   1.538 +
   1.539 +/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
   1.540 +U_CAPI UChar * U_EXPORT2
   1.541 +u_strpbrk(const UChar *string, const UChar *matchSet)
   1.542 +{
   1.543 +    int32_t idx = _matchFromSet(string, matchSet, TRUE);
   1.544 +    if(idx >= 0) {
   1.545 +        return (UChar *)string + idx;
   1.546 +    } else {
   1.547 +        return NULL;
   1.548 +    }
   1.549 +}
   1.550 +
   1.551 +/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
   1.552 +U_CAPI int32_t U_EXPORT2
   1.553 +u_strcspn(const UChar *string, const UChar *matchSet)
   1.554 +{
   1.555 +    int32_t idx = _matchFromSet(string, matchSet, TRUE);
   1.556 +    if(idx >= 0) {
   1.557 +        return idx;
   1.558 +    } else {
   1.559 +        return -idx - 1; /* == u_strlen(string) */
   1.560 +    }
   1.561 +}
   1.562 +
   1.563 +/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
   1.564 +U_CAPI int32_t U_EXPORT2
   1.565 +u_strspn(const UChar *string, const UChar *matchSet)
   1.566 +{
   1.567 +    int32_t idx = _matchFromSet(string, matchSet, FALSE);
   1.568 +    if(idx >= 0) {
   1.569 +        return idx;
   1.570 +    } else {
   1.571 +        return -idx - 1; /* == u_strlen(string) */
   1.572 +    }
   1.573 +}
   1.574 +
   1.575 +/* ----- Text manipulation functions --- */
   1.576 +
   1.577 +U_CAPI UChar* U_EXPORT2
   1.578 +u_strtok_r(UChar    *src, 
   1.579 +     const UChar    *delim,
   1.580 +           UChar   **saveState)
   1.581 +{
   1.582 +    UChar *tokSource;
   1.583 +    UChar *nextToken;
   1.584 +    uint32_t nonDelimIdx;
   1.585 +
   1.586 +    /* If saveState is NULL, the user messed up. */
   1.587 +    if (src != NULL) {
   1.588 +        tokSource = src;
   1.589 +        *saveState = src; /* Set to "src" in case there are no delimiters */
   1.590 +    }
   1.591 +    else if (*saveState) {
   1.592 +        tokSource = *saveState;
   1.593 +    }
   1.594 +    else {
   1.595 +        /* src == NULL && *saveState == NULL */
   1.596 +        /* This shouldn't happen. We already finished tokenizing. */
   1.597 +        return NULL;
   1.598 +    }
   1.599 +
   1.600 +    /* Skip initial delimiters */
   1.601 +    nonDelimIdx = u_strspn(tokSource, delim);
   1.602 +    tokSource = &tokSource[nonDelimIdx];
   1.603 +
   1.604 +    if (*tokSource) {
   1.605 +        nextToken = u_strpbrk(tokSource, delim);
   1.606 +        if (nextToken != NULL) {
   1.607 +            /* Create a token */
   1.608 +            *(nextToken++) = 0;
   1.609 +            *saveState = nextToken;
   1.610 +            return tokSource;
   1.611 +        }
   1.612 +        else if (*saveState) {
   1.613 +            /* Return the last token */
   1.614 +            *saveState = NULL;
   1.615 +            return tokSource;
   1.616 +        }
   1.617 +    }
   1.618 +    else {
   1.619 +        /* No tokens were found. Only delimiters were left. */
   1.620 +        *saveState = NULL;
   1.621 +    }
   1.622 +    return NULL;
   1.623 +}
   1.624 +
   1.625 +/* Miscellaneous functions -------------------------------------------------- */
   1.626 +
   1.627 +U_CAPI UChar* U_EXPORT2
   1.628 +u_strcat(UChar     *dst, 
   1.629 +    const UChar     *src)
   1.630 +{
   1.631 +    UChar *anchor = dst;            /* save a pointer to start of dst */
   1.632 +
   1.633 +    while(*dst != 0) {              /* To end of first string          */
   1.634 +        ++dst;
   1.635 +    }
   1.636 +    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
   1.637 +    }
   1.638 +
   1.639 +    return anchor;
   1.640 +}
   1.641 +
   1.642 +U_CAPI UChar*  U_EXPORT2
   1.643 +u_strncat(UChar     *dst, 
   1.644 +     const UChar     *src, 
   1.645 +     int32_t     n ) 
   1.646 +{
   1.647 +    if(n > 0) {
   1.648 +        UChar *anchor = dst;            /* save a pointer to start of dst */
   1.649 +
   1.650 +        while(*dst != 0) {              /* To end of first string          */
   1.651 +            ++dst;
   1.652 +        }
   1.653 +        while((*dst = *src) != 0) {     /* copy string 2 over              */
   1.654 +            ++dst;
   1.655 +            if(--n == 0) {
   1.656 +                *dst = 0;
   1.657 +                break;
   1.658 +            }
   1.659 +            ++src;
   1.660 +        }
   1.661 +
   1.662 +        return anchor;
   1.663 +    } else {
   1.664 +        return dst;
   1.665 +    }
   1.666 +}
   1.667 +
   1.668 +/* ----- Text property functions --- */
   1.669 +
   1.670 +U_CAPI int32_t   U_EXPORT2
   1.671 +u_strcmp(const UChar *s1, 
   1.672 +    const UChar *s2) 
   1.673 +{
   1.674 +    UChar  c1, c2;
   1.675 +
   1.676 +    for(;;) {
   1.677 +        c1=*s1++;
   1.678 +        c2=*s2++;
   1.679 +        if (c1 != c2 || c1 == 0) {
   1.680 +            break;
   1.681 +        }
   1.682 +    }
   1.683 +    return (int32_t)c1 - (int32_t)c2;
   1.684 +}
   1.685 +
          /*
           * Compare two strings in code unit order or in code point order.
           *
           *   s1, length1    - first string; a negative length means NUL-terminated
           *   s2, length2    - second string; a negative length means NUL-terminated
           *   strncmpStyle   - only consulted when not both lengths are negative:
           *                    compare at most length1 units of both strings and
           *                    also stop at a NUL (length1==length2>=0 is assumed)
           *   codePointOrder - if TRUE, a differing pair of units that are both
           *                    >=0xd800 is adjusted so that the result follows
           *                    code point order instead of code unit order
           *
           * Three modes are used for skipping the identical prefix:
           *   1. both lengths negative: strcmp style
           *   2. strncmpStyle: strncmp style
           *   3. otherwise: memcmp style; a negative length is replaced by u_strlen()
           *      and, if one string is a prefix of the other, the result is
           *      -1, 0 or 1 according to the lengths
           *
           * Returns 0 for equal strings, otherwise the (possibly fixed-up)
           * difference c1-c2 of the first differing code units, or the length
           * result described above.
           */
   1.686 +U_CFUNC int32_t U_EXPORT2
   1.687 +uprv_strCompare(const UChar *s1, int32_t length1,
   1.688 +                const UChar *s2, int32_t length2,
   1.689 +                UBool strncmpStyle, UBool codePointOrder) {
   1.690 +    const UChar *start1, *start2, *limit1, *limit2;
   1.691 +    UChar c1, c2;
   1.692 +
   1.693 +    /* setup for fix-up */
   1.694 +    start1=s1;
   1.695 +    start2=s2;
   1.696 +
   1.697 +    /* compare identical prefixes - they do not need to be fixed up */
   1.698 +    if(length1<0 && length2<0) {
   1.699 +        /* strcmp style, both NUL-terminated */
   1.700 +        if(s1==s2) {
   1.701 +            return 0;
   1.702 +        }
   1.703 +
   1.704 +        for(;;) {
   1.705 +            c1=*s1;
   1.706 +            c2=*s2;
   1.707 +            if(c1!=c2) {
   1.708 +                break;
   1.709 +            }
   1.710 +            if(c1==0) {
   1.711 +                return 0;
   1.712 +            }
   1.713 +            ++s1;
   1.714 +            ++s2;
   1.715 +        }
   1.716 +
   1.717 +        /* setup for fix-up */
   1.718 +        limit1=limit2=NULL; /* NULL never equals s+1, so the fix-up looks at the next unit (at worst the NUL) */
   1.719 +    } else if(strncmpStyle) {
   1.720 +        /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
   1.721 +        if(s1==s2) {
   1.722 +            return 0;
   1.723 +        }
   1.724 +
   1.725 +        limit1=start1+length1;
   1.726 +
   1.727 +        for(;;) {
   1.728 +            /* both lengths are same, check only one limit */
   1.729 +            if(s1==limit1) {
   1.730 +                return 0;
   1.731 +            }
   1.732 +
   1.733 +            c1=*s1;
   1.734 +            c2=*s2;
   1.735 +            if(c1!=c2) {
   1.736 +                break;
   1.737 +            }
   1.738 +            if(c1==0) {
   1.739 +                return 0;
   1.740 +            }
   1.741 +            ++s1;
   1.742 +            ++s2;
   1.743 +        }
   1.744 +
   1.745 +        /* setup for fix-up */
   1.746 +        limit2=start2+length1; /* use length1 here, too, to enforce assumption */
   1.747 +    } else {
   1.748 +        /* memcmp/UnicodeString style, both length-specified */
   1.749 +        int32_t lengthResult;
   1.750 +
   1.751 +        if(length1<0) {
   1.752 +            length1=u_strlen(s1);
   1.753 +        }
   1.754 +        if(length2<0) {
   1.755 +            length2=u_strlen(s2);
   1.756 +        }
   1.757 +
   1.758 +        /* limit1=start1+min(length1, length2) */
   1.759 +        if(length1<length2) {
   1.760 +            lengthResult=-1;
   1.761 +            limit1=start1+length1;
   1.762 +        } else if(length1==length2) {
   1.763 +            lengthResult=0;
   1.764 +            limit1=start1+length1;
   1.765 +        } else /* length1>length2 */ {
   1.766 +            lengthResult=1;
   1.767 +            limit1=start1+length2;
   1.768 +        }
   1.769 +
   1.770 +        if(s1==s2) {
   1.771 +            return lengthResult;
   1.772 +        }
   1.773 +
   1.774 +        for(;;) {
   1.775 +            /* check pseudo-limit */
   1.776 +            if(s1==limit1) {
   1.777 +                return lengthResult;
   1.778 +            }
   1.779 +
   1.780 +            c1=*s1;
   1.781 +            c2=*s2;
   1.782 +            if(c1!=c2) {
   1.783 +                break;
   1.784 +            }
   1.785 +            ++s1;
   1.786 +            ++s2;
   1.787 +        }
   1.788 +
   1.789 +        /* setup for fix-up */
   1.790 +        limit1=start1+length1; /* replace the pseudo-limit with the real end of s1 */
   1.791 +        limit2=start2+length2;
   1.792 +    }
   1.793 +
   1.794 +    /* if both values are in or above the surrogate range, fix them up */
   1.795 +    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
   1.796 +        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
   1.797 +        if(
   1.798 +            (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
   1.799 +            (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
   1.800 +        ) {
   1.801 +            /* part of a surrogate pair, leave >=d800 */
   1.802 +        } else {
   1.803 +            /* BMP code point - may be surrogate code point - make <d800 */
   1.804 +            c1-=0x2800;
   1.805 +        }
   1.806 +
   1.807 +        if(
   1.808 +            (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
   1.809 +            (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
   1.810 +        ) {
   1.811 +            /* part of a surrogate pair, leave >=d800 */
   1.812 +        } else {
   1.813 +            /* BMP code point - may be surrogate code point - make <d800 */
   1.814 +            c2-=0x2800;
   1.815 +        }
   1.816 +    }
   1.817 +
   1.818 +    /* now c1 and c2 are in the requested (code unit or code point) order */
   1.819 +    return (int32_t)c1-(int32_t)c2;
   1.820 +}
   1.821 +
   1.822 +/*
   1.823 + * Compare two strings as presented by UCharIterators.
   1.824 + * Use code unit or code point order.
   1.825 + * When the function returns, it is undefined where the iterators
   1.826 + * have stopped.
          *
          *   iter1, iter2   - iterators over the two strings; both are moved to
          *                    their start before comparing
          *   codePointOrder - if TRUE, a differing pair of units that are both
          *                    >=0xd800 is adjusted so that the result follows
          *                    code point order instead of code unit order
          *
          * Returns 0 if either iterator is NULL, if both arguments are the same
          * iterator, or if the strings are equal; otherwise the (possibly
          * fixed-up) difference c1-c2 of the first differing values.
   1.827 + */
   1.828 +U_CAPI int32_t U_EXPORT2
   1.829 +u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
   1.830 +    UChar32 c1, c2;
   1.831 +
   1.832 +    /* argument checking */
   1.833 +    if(iter1==NULL || iter2==NULL) {
   1.834 +        return 0; /* bad arguments */
   1.835 +    }
   1.836 +    if(iter1==iter2) {
   1.837 +        return 0; /* identical iterators */
   1.838 +    }
   1.839 +
   1.840 +    /* reset iterators to start? */
   1.841 +    iter1->move(iter1, 0, UITER_START);
   1.842 +    iter2->move(iter2, 0, UITER_START);
   1.843 +
   1.844 +    /* compare identical prefixes - they do not need to be fixed up */
   1.845 +    for(;;) {
   1.846 +        c1=iter1->next(iter1);
   1.847 +        c2=iter2->next(iter2);
   1.848 +        if(c1!=c2) {
   1.849 +            break;
   1.850 +        }
   1.851 +        if(c1==-1) { /* presumably the iterator's end-of-text value: both ended together */
   1.852 +            return 0;
   1.853 +        }
   1.854 +    }
   1.855 +
   1.856 +    /* if both values are in or above the surrogate range, fix them up */
   1.857 +    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
   1.858 +        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
   1.859 +        if(
   1.860 +            (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
   1.861 +            (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
   1.862 +        ) {
   1.863 +            /* part of a surrogate pair, leave >=d800 */
   1.864 +        } else {
   1.865 +            /* BMP code point - may be surrogate code point - make <d800 */
   1.866 +            c1-=0x2800;
   1.867 +        }
   1.868 +
   1.869 +        if(
   1.870 +            (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
   1.871 +            (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
   1.872 +        ) {
   1.873 +            /* part of a surrogate pair, leave >=d800 */
   1.874 +        } else {
   1.875 +            /* BMP code point - may be surrogate code point - make <d800 */
   1.876 +            c2-=0x2800;
   1.877 +        }
   1.878 +    }
   1.879 +
   1.880 +    /* now c1 and c2 are in the requested (code unit or code point) order */
   1.881 +    return (int32_t)c1-(int32_t)c2;
   1.882 +}
   1.883 +
   1.884 +#if 0
   1.885 +/*
   1.886 + * u_strCompareIter() does not leave the iterators _on_ the different units.
   1.887 + * This is possible but would cost a few extra indirect function calls to back
   1.888 + * up if the last unit (c1 or c2 respectively) was >=0.
   1.889 + *
   1.890 + * Consistently leaving them _behind_ the different units is not an option
   1.891 + * because the current "unit" is the end of the string if that is reached,
   1.892 + * and in such a case the iterator does not move.
   1.893 + * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
   1.894 + * of their strings. Calling previous() on each does not move them to where
   1.895 + * the comparison fails.
   1.896 + *
   1.897 + * So the simplest semantics is to not define where the iterators end up.
   1.898 + *
   1.899 + * The following fragment is part of what would need to be done for backing up.
   1.900 + */
   1.901 +void fragment {
   1.902 +        /* iff a surrogate is part of a surrogate pair, leave >=d800 */
   1.903 +        if(c1<=0xdbff) {
   1.904 +            if(!U16_IS_TRAIL(iter1->current(iter1))) {
   1.905 +                /* lead surrogate code point - make <d800 */
   1.906 +                c1-=0x2800;
   1.907 +            }
   1.908 +        } else if(c1<=0xdfff) {
   1.909 +            int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
   1.910 +            iter1->previous(iter1); /* ==c1 */
   1.911 +            if(!U16_IS_LEAD(iter1->previous(iter1))) {
   1.912 +                /* trail surrogate code point - make <d800 */
   1.913 +                c1-=0x2800;
   1.914 +            }
   1.915 +            /* go back to behind where the difference is */
   1.916 +            iter1->move(iter1, idx, UITER_ZERO);
   1.917 +        } else /* 0xe000<=c1<=0xffff */ {
   1.918 +            /* BMP code point - make <d800 */
   1.919 +            c1-=0x2800;
   1.920 +        }
   1.921 +}
   1.922 +#endif
   1.923 +
   1.924 +/* Compare two strings in code unit or code point order; arguments are checked here. */
   1.925 +U_CAPI int32_t U_EXPORT2
   1.926 +u_strCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2,
   1.927 +             UBool codePointOrder) {
   1.928 +    /* a missing string or a length below -1 is reported as "equal" (0) */
   1.929 +    if(s1==NULL || s2==NULL) { return 0; }
   1.930 +    if(length1<-1 || length2<-1) { return 0; }
   1.931 +    /* the comparison itself is done by uprv_strCompare() */
   1.932 +    return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
   1.933 +}
   1.934 +
   1.935 +/* Compare two strings in code point order by delegating to uprv_strCompare(); u_strcmp() compares in code unit order instead. */
   1.936 +U_CAPI int32_t U_EXPORT2
   1.937 +u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
   1.938 +    return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); /* lengths of -1: presumably "NUL-terminated", as in u_countChar32(); the last argument is the code point order flag */
   1.939 +}
   1.940 +
   1.941 +/*
   1.942 + * Compare at most n code units of two NUL-terminated strings in code unit order.
   1.943 + * Returns the difference of the first two units that differ, or 0 if the strings
   1.944 + * agree up to a common NUL or for n units. Nothing is compared for n<=0.
   1.945 + */
   1.946 +U_CAPI int32_t U_EXPORT2
   1.947 +u_strncmp(const UChar *s1, const UChar *s2, int32_t n) {
   1.948 +    if(n <= 0) {
   1.949 +        return 0;
   1.950 +    }
   1.951 +    for(;; ++s1, ++s2) {
   1.952 +        const int32_t diff = (int32_t)*s1 - (int32_t)*s2;
   1.953 +        if(diff != 0) {
   1.954 +            return diff;
   1.955 +        }
   1.956 +        /* equal so far: done at the shared terminator or after n units */
   1.957 +        if(*s1 == 0 || --n == 0) { return 0; }
   1.958 +    }
   1.959 +}
   1.960 +/* Compare two strings in code point order, passing n as the length of both; delegates to uprv_strCompare(). */
   1.961 +U_CAPI int32_t U_EXPORT2
   1.962 +u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
   1.963 +    return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); /* fifth argument TRUE, unlike the sibling wrappers - presumably "strncmp-style", i.e. also stop at a NUL; TODO confirm */
   1.964 +}
   1.965 +
   1.966 +/* Copy src, including its terminating NUL, to dst. Returns dst. */
   1.967 +U_CAPI UChar* U_EXPORT2
   1.968 +u_strcpy(UChar *dst, const UChar *src) {
   1.969 +    UChar *out = dst;   /* write cursor; dst itself stays the return value */
   1.970 +    UChar unit;
   1.971 +    do {
   1.972 +        unit = *src++;
   1.973 +        *out++ = unit;
   1.974 +    } while(unit != 0); /* the NUL has been copied when the loop ends */
   1.975 +    return dst;
   1.976 +}
   1.977 +
   1.978 +/*
   1.979 + * Copy at most n code units from src to dst, stopping after a NUL has been copied.
   1.980 + * dst is neither padded nor NUL-terminated if src has no NUL in its first n units. Returns dst.
   1.981 + */
   1.982 +U_CAPI UChar* U_EXPORT2
   1.983 +u_strncpy(UChar *dst, const UChar *src, int32_t n) {
   1.984 +    UChar *out = dst;   /* write cursor */
   1.985 +    for(; n > 0; --n) {
   1.986 +        if((*out++ = *src++) == 0) {
   1.987 +            break;      /* terminator copied */
   1.988 +        }
   1.989 +    }
   1.990 +    return dst;
   1.991 +}
   1.992 +
   1.993 +/* Return the number of UChars in s before its terminating NUL. */
   1.994 +U_CAPI int32_t U_EXPORT2
   1.995 +u_strlen(const UChar *s) {
   1.996 +#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
   1.997 +    return (int32_t)uprv_wcslen(s);
   1.998 +#else
   1.999 +    const UChar *t = s;
  1.1000 +    while(*t != 0) {
  1.1001 +        ++t;
  1.1002 +    }
  1.1003 +    return (int32_t)(t - s);    /* narrow the pointer difference explicitly, like the branch above */
  1.1004 +#endif
  1.1005 +}
  1.1006 +
  1.1007 +/*
  1.1008 + * Count the code points in s. A lead surrogate that is directly followed by a
  1.1009 + * trail surrogate counts as one code point together with it; every other code
  1.1010 + * unit, unpaired surrogates included, counts as one code point.
  1.1011 + * length is the number of UChars in s, or -1 if s is NUL-terminated.
  1.1012 + * Returns 0 if s is NULL or length<-1.
  1.1013 + */
  1.1014 +U_CAPI int32_t U_EXPORT2
  1.1015 +u_countChar32(const UChar *s, int32_t length) {
  1.1016 +    int32_t total=0;
  1.1017 +
  1.1018 +    if(s==NULL || length<-1) {
  1.1019 +        return 0;
  1.1020 +    }
  1.1021 +
  1.1022 +    if(length==-1) {
  1.1023 +        /* NUL-terminated: looking ahead one unit is safe, at worst it is the NUL */
  1.1024 +        UChar unit;
  1.1025 +
  1.1026 +        while((unit=*s++)!=0) {
  1.1027 +            ++total;
  1.1028 +            if(U16_IS_LEAD(unit) && U16_IS_TRAIL(*s)) {
  1.1029 +                ++s; /* the trail surrogate belongs to the same code point */
  1.1030 +            }
  1.1031 +        }
  1.1032 +    } else {
  1.1033 +        /* counted string: look ahead only while at least two units remain */
  1.1034 +        const UChar *limit=s+length;
  1.1035 +
  1.1036 +        while(s!=limit) {
  1.1037 +            ++total;
  1.1038 +            if(U16_IS_LEAD(*s) && (limit-s)>=2 && U16_IS_TRAIL(s[1])) {
  1.1039 +                s+=2;
  1.1040 +            } else {
  1.1041 +                ++s;
  1.1042 +            }
  1.1043 +        }
  1.1044 +    }
  1.1045 +    return total;
  1.1046 +}
  1.1047 +
  1.1048 +/*
  1.1049 + * Check whether s contains more than `number` code points, stopping as soon as
  1.1050 + * the answer is known.
  1.1051 + * length is the number of UChars in s, or -1 if s is NUL-terminated.
  1.1052 + * Returns TRUE if number<0; otherwise FALSE if s is NULL or length<-1.
  1.1053 + */
  1.1054 +U_CAPI UBool U_EXPORT2
  1.1055 +u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
  1.1056 +    if(number<0) {
  1.1057 +        return TRUE;
  1.1058 +    }
  1.1059 +    if(s==NULL || length<-1) {
  1.1060 +        return FALSE;
  1.1061 +    }
  1.1062 +
  1.1063 +    if(length==-1) { /* s is NUL-terminated */
  1.1064 +        UChar c;
  1.1065 +        for(;;) { /* count code points until they exceed */
  1.1066 +            if((c=*s++)==0) {
  1.1067 +                return FALSE;
  1.1068 +            }
  1.1069 +            if(number==0) {
  1.1070 +                return TRUE;
  1.1071 +            }
  1.1072 +            /* safe to look ahead one unit: at worst it is the terminating NUL */
  1.1073 +            if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
  1.1074 +                ++s;
  1.1075 +            }
  1.1076 +            --number;
  1.1077 +        }
  1.1078 +    } else { /* length>=0 known */
  1.1079 +        const UChar *limit;
  1.1080 +        int32_t maxSupplementary;
  1.1081 +
  1.1082 +        /* s contains at least ceil(length/2) code points: <=2 UChars per cp; */
  1.1083 +        /* written as length-length/2 because length+1 overflows for INT32_MAX */
  1.1084 +        if((length-length/2)>number) {
  1.1085 +            return TRUE;
  1.1086 +        }
  1.1087 +        /* check if s does not even contain enough UChars */
  1.1088 +        maxSupplementary=length-number;
  1.1089 +        if(maxSupplementary<=0) {
  1.1090 +            return FALSE;
  1.1091 +        }
  1.1092 +        /* there are maxSupplementary=length-number more UChars than asked-for code points */
  1.1093 +
  1.1094 +        /* count code points until they exceed and also check that there are */
  1.1095 +        /* no more than maxSupplementary supplementary code points (UChar pairs) */
  1.1096 +        limit=s+length;
  1.1097 +        for(;;) {
  1.1098 +            if(s==limit) {
  1.1099 +                return FALSE;
  1.1100 +            }
  1.1101 +            if(number==0) {
  1.1102 +                return TRUE;
  1.1103 +            }
  1.1104 +            if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
  1.1105 +                ++s;
  1.1106 +                if(--maxSupplementary<=0) {
  1.1107 +                    /* too many pairs - too few code points */
  1.1108 +                    return FALSE;
  1.1109 +                }
  1.1110 +            }
  1.1111 +            --number;
  1.1112 +        }
  1.1113 +    }
  1.1114 +}
  1.1115 +/* Copy count UChars from src to dest with uprv_memcpy(); does nothing for count<=0. Returns dest. */
  1.1116 +U_CAPI UChar * U_EXPORT2
  1.1117 +u_memcpy(UChar *dest, const UChar *src, int32_t count) {
  1.1118 +    if(count > 0) {
  1.1119 +        uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR); /* size_t math: the byte count cannot overflow a signed 32-bit int */
  1.1120 +    }
  1.1121 +    return dest;
  1.1122 +}
  1.1123 +/* Move count UChars from src to dest with uprv_memmove(); does nothing for count<=0. Returns dest. */
  1.1124 +U_CAPI UChar * U_EXPORT2
  1.1125 +u_memmove(UChar *dest, const UChar *src, int32_t count) {
  1.1126 +    if(count > 0) {
  1.1127 +        uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR); /* size_t math: the byte count cannot overflow a signed 32-bit int */
  1.1128 +    }
  1.1129 +    return dest;
  1.1130 +}
  1.1131 +
  1.1132 +/*
  1.1133 + * Set each of the first count UChars of dest to c; count<=0 changes nothing.
  1.1134 + * Returns dest.
  1.1135 + */
  1.1136 +U_CAPI UChar * U_EXPORT2
  1.1137 +u_memset(UChar *dest, UChar c, int32_t count) {
  1.1138 +    int32_t i;
  1.1139 +    for(i = 0; i < count; ++i) {
  1.1140 +        dest[i] = c;
  1.1141 +    }
  1.1142 +    return dest;
  1.1143 +}
  1.1144 +
  1.1145 +/*
  1.1146 + * Compare the first count UChars of buf1 and buf2 in code unit order.
  1.1147 + * Returns the difference of the first two units that differ, or 0 if there
  1.1148 + * are none, which includes count<=0.
  1.1149 + */
  1.1150 +U_CAPI int32_t U_EXPORT2
  1.1151 +u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
  1.1152 +    int32_t i;
  1.1153 +
  1.1154 +    for(i = 0; i < count; ++i) {
  1.1155 +        const int32_t diff = (int32_t)(uint16_t)buf1[i] - (int32_t)(uint16_t)buf2[i];
  1.1156 +        if(diff != 0) {
  1.1157 +            return diff;
  1.1158 +        }
  1.1159 +    }
  1.1160 +    return 0;
  1.1161 +}
  1.1162 +/* Compare two buffers of count UChars each in code point order; delegates to uprv_strCompare(). */
  1.1163 +U_CAPI int32_t U_EXPORT2
  1.1164 +u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
  1.1165 +    return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); /* count is the length of both buffers; the last argument is the code point order flag */
  1.1166 +}
  1.1167 +
  1.1168 +/* u_unescape & support fns ------------------------------------------------- */
  1.1169 +
  1.1170 +/* Pairs of (escape letter, resulting code unit) for u_unescapeAt(). This map must be in ASCENDING ORDER OF THE ESCAPE CODE because the lookup stops at the first entry that is larger than the letter it looks for. */
  1.1171 +static const UChar UNESCAPE_MAP[] = {
  1.1172 +    /*"   0x22, 0x22 */
  1.1173 +    /*'   0x27, 0x27 */
  1.1174 +    /*?   0x3F, 0x3F */
  1.1175 +    /*\   0x5C, 0x5C */ /* the four entries above are disabled: these characters map to themselves, which u_unescapeAt() does for any unlisted character */
  1.1176 +    /*a*/ 0x61, 0x07,
  1.1177 +    /*b*/ 0x62, 0x08,
  1.1178 +    /*e*/ 0x65, 0x1b,
  1.1179 +    /*f*/ 0x66, 0x0c,
  1.1180 +    /*n*/ 0x6E, 0x0a,
  1.1181 +    /*r*/ 0x72, 0x0d,
  1.1182 +    /*t*/ 0x74, 0x09,
  1.1183 +    /*v*/ 0x76, 0x0b
  1.1184 +};
  1.1185 +enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; /* number of UChars in the map, i.e. two per escape letter */
  1.1186 +
  1.1187 +/* Map an octal digit U+0030..U+0037 to its numeric value 0..7; anything else yields -1. */
  1.1188 +static int8_t _digit8(UChar c) {
  1.1189 +    if (c < 0x0030 || c > 0x0037) {
  1.1190 +        return -1; /* not an octal digit */
  1.1191 +    }
  1.1192 +    return (int8_t)(c - 0x0030);
  1.1193 +}
  1.1194 +
  1.1195 +/*
  1.1196 + * Map a hex digit (U+0030..U+0039, U+0041..U+0046 or U+0061..U+0066) to its
  1.1197 + * numeric value 0..15; anything else yields -1.
  1.1198 + */
  1.1199 +static int8_t _digit16(UChar c) {
  1.1200 +    int32_t base;   /* the code unit value that corresponds to the digit value 0 */
  1.1201 +
  1.1202 +    if (c >= 0x0030 && c <= 0x0039) { base = 0x0030; }
  1.1203 +    else if (c >= 0x0041 && c <= 0x0046) { base = 0x0041 - 10; }
  1.1204 +    else if (c >= 0x0061 && c <= 0x0066) { base = 0x0061 - 10; }
  1.1205 +    else { return -1; }
  1.1206 +    return (int8_t)(c - base);
  1.1207 +}
  1.1208 +
  1.1209 +/*
  1.1210 + * Parse a single escape sequence. Although this function deals in UChars, it
  1.1211 + * does not use C++ or UnicodeString, which allows it to be used from C contexts.
  1.1212 + * charAt(offset, context) returns the UChar at an offset below length.
  1.1213 + * *offset: in, index of the first UChar after the backslash; out, index behind the
  1.1214 + * escape sequence, or the initial value again on failure.
  1.1215 + * Handles u+4 hex digits, U+8 hex digits, x+1-2 hex digits, x{1-8 hex digits},
  1.1216 + * 1-3 octal digits, the letters in UNESCAPE_MAP and cX; any other UChar stands for itself.
  1.1217 + * Returns the code point, or (UChar32)0xFFFFFFFF for an invalid escape sequence.
  1.1218 + */
  1.1219 +U_CAPI UChar32 U_EXPORT2
  1.1220 +u_unescapeAt(UNESCAPE_CHAR_AT charAt, int32_t *offset, int32_t length, void *context) {
  1.1221 +    int32_t start = *offset;
  1.1222 +    UChar c;
  1.1223 +    UChar32 result = 0;
  1.1224 +    int8_t n = 0, minDig = 0, maxDig = 0;
  1.1225 +    int8_t bitsPerDigit = 4, dig;
  1.1226 +    int32_t i;
  1.1227 +    UBool braces = FALSE;
  1.1228 +
  1.1229 +    /* Check that offset is in range */
  1.1230 +    if (*offset < 0 || *offset >= length) {
  1.1231 +        goto err;
  1.1232 +    }
  1.1233 +    /* Fetch first UChar after '\\' */
  1.1234 +    c = charAt((*offset)++, context);
  1.1235 +    /* Convert hexadecimal and octal escapes */
  1.1236 +    switch (c) {
  1.1237 +    case 0x0075 /*'u'*/:
  1.1238 +        minDig = maxDig = 4;
  1.1239 +        break;
  1.1240 +    case 0x0055 /*'U'*/:
  1.1241 +        minDig = maxDig = 8;
  1.1242 +        break;
  1.1243 +    case 0x0078 /*'x'*/:
  1.1244 +        minDig = 1;
  1.1245 +        if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
  1.1246 +            ++(*offset);
  1.1247 +            braces = TRUE;
  1.1248 +            maxDig = 8;
  1.1249 +        } else {
  1.1250 +            maxDig = 2;
  1.1251 +        }
  1.1252 +        break;
  1.1253 +    default:
  1.1254 +        dig = _digit8(c);
  1.1255 +        if (dig >= 0) {
  1.1256 +            minDig = 1;
  1.1257 +            maxDig = 3;
  1.1258 +            n = 1; /* Already have first octal digit */
  1.1259 +            bitsPerDigit = 3;
  1.1260 +            result = dig;
  1.1261 +        }
  1.1262 +        break;
  1.1263 +    }
  1.1264 +    if (minDig != 0) {
  1.1265 +        while (*offset < length && n < maxDig) {
  1.1266 +            c = charAt(*offset, context);
  1.1267 +            dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
  1.1268 +            if (dig < 0) {
  1.1269 +                break;
  1.1270 +            }
  1.1271 +            /* Shift as unsigned: an 8th hex digit may push bits into the sign bit */
  1.1272 +            result = (UChar32)(((uint32_t)result << bitsPerDigit) | (uint32_t)dig);
  1.1273 +            ++(*offset);
  1.1274 +            ++n;
  1.1275 +        }
  1.1276 +        if (n < minDig) {
  1.1277 +            goto err;
  1.1278 +        }
  1.1279 +        if (braces) {
  1.1280 +            /* Look at the text again: after all 8 digits, c still holds the last digit */
  1.1281 +            if (*offset >= length || charAt(*offset, context) != 0x7D /*}*/) {
  1.1282 +                goto err;
  1.1283 +            }
  1.1284 +            ++(*offset);
  1.1285 +        }
  1.1286 +        if (result < 0 || result >= 0x110000) {
  1.1287 +            goto err;
  1.1288 +        }
  1.1289 +        /* If an escape sequence specifies a lead surrogate, see if there is a trail surrogate
  1.1290 +         * after it, either as an escape or as a literal.  If so, join them up into a supplementary. */
  1.1291 +        if (*offset < length && U16_IS_LEAD(result)) {
  1.1292 +            int32_t ahead = *offset + 1;
  1.1293 +            /* UChar32, so that a supplementary result is not cut down to a trail surrogate */
  1.1294 +            UChar32 next = charAt(*offset, context);
  1.1295 +            if (next == 0x5C /*'\\'*/ && ahead < length) {
  1.1296 +                /* Let the nested call see at most 11 UChars ("x{0000DFFF}"), so that a run
  1.1297 +                 * of escaped lead surrogates cannot recurse once per escape sequence. */
  1.1298 +                int32_t tailLimit = (length - ahead > 11) ? ahead + 11 : length;
  1.1299 +                next = u_unescapeAt(charAt, &ahead, tailLimit, context);
  1.1300 +            }
  1.1301 +            if (U16_IS_TRAIL(next)) {
  1.1302 +                *offset = ahead;
  1.1303 +                result = U16_GET_SUPPLEMENTARY(result, next);
  1.1304 +            }
  1.1305 +        }
  1.1306 +        return result;
  1.1307 +    }
  1.1308 +
  1.1309 +    /* Convert C-style escapes in table */
  1.1310 +    for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
  1.1311 +        if (c == UNESCAPE_MAP[i]) {
  1.1312 +            return UNESCAPE_MAP[i+1];
  1.1313 +        } else if (c < UNESCAPE_MAP[i]) {
  1.1314 +            break;
  1.1315 +        }
  1.1316 +    }
  1.1317 +
  1.1318 +    /* Map \cX to control-X: X & 0x1F */
  1.1319 +    if (c == 0x0063 /*'c'*/ && *offset < length) {
  1.1320 +        c = charAt((*offset)++, context);
  1.1321 +        if (U16_IS_LEAD(c) && *offset < length) {
  1.1322 +            UChar c2 = charAt(*offset, context);
  1.1323 +            if (U16_IS_TRAIL(c2)) {
  1.1324 +                ++(*offset);
  1.1325 +                c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
  1.1326 +            }
  1.1327 +        }
  1.1328 +        return 0x1F & c;
  1.1329 +    }
  1.1330 +
  1.1331 +    /* If no special forms are recognized, then consider the backslash to
  1.1332 +     * generically escape the next character. Deal with surrogate pairs. */
  1.1333 +    if (U16_IS_LEAD(c) && *offset < length) {
  1.1334 +        UChar c2 = charAt(*offset, context);
  1.1335 +        if (U16_IS_TRAIL(c2)) {
  1.1336 +            ++(*offset);
  1.1337 +            return U16_GET_SUPPLEMENTARY(c, c2);
  1.1338 +        }
  1.1339 +    }
  1.1340 +    return c;
  1.1341 +
  1.1342 + err:
  1.1343 +    *offset = start; /* Invalid escape sequence: reset to initial value */
  1.1344 +    return (UChar32)0xFFFFFFFF;
  1.1345 +}
  1.1346 +
  1.1347 +/* u_unescapeAt() callback to return a UChar from a char*: context is used as the char* text and offset as the index into it */
  1.1348 +static UChar U_CALLCONV
  1.1349 +_charPtr_charAt(int32_t offset, void *context) {
  1.1350 +    UChar c16;
  1.1351 +    /* Convert exactly one char with u_charsToUChars(). It would be more efficient
  1.1352 +     * to access the invariant tables directly but there is no API for that. */
  1.1353 +    u_charsToUChars(((char*) context) + offset, &c16, 1);
  1.1354 +    return c16;
  1.1355 +}
  1.1356 +
  1.1357 +/*
  1.1358 + * Append an escape-free segment of the text; used by u_unescape().
  1.1359 + * Converts at most destCapacity chars (none if destCapacity is negative).
  1.1360 + */
  1.1361 +static void _appendUChars(UChar *dest, int32_t destCapacity,
  1.1362 +                          const char *src, int32_t srcLen) {
  1.1363 +    /* a negative capacity means that there is no room at all */
  1.1364 +    const int32_t room = (destCapacity < 0) ? 0 : destCapacity;
  1.1365 +    const int32_t count = (srcLen > room) ? room : srcLen;
  1.1366 +    u_charsToUChars(src, dest, count);
  1.1367 +}
  1.1368 +
  1.1369 +/*
  1.1370 + * Do an invariant conversion of char* -> UChar*, with escape parsing.
  1.1371 + * Returns the length of the whole result even if it does not fit or dest is NULL; a NUL
  1.1372 + * is appended only if it fits. An invalid escape yields 0 and, if possible, an empty dest.
  1.1373 + */
  1.1374 +U_CAPI int32_t U_EXPORT2
  1.1375 +u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
  1.1376 +    const char *segment = src;
  1.1377 +    const char *srcLimit = NULL; /* end of src; located once, on the first escape, instead of once per escape */
  1.1378 +    int32_t i = 0;
  1.1379 +    char c;
  1.1380 +    while ((c=*src) != 0) {
  1.1381 +        /* '\\' intentionally written as compiler-specific character constant to correspond to compiler-specific char* constants. */
  1.1382 +        if (c == '\\') {
  1.1383 +            int32_t lenParsed = 0;
  1.1384 +            UChar32 c32;
  1.1385 +            if (src != segment) {
  1.1386 +                if (dest != NULL && i < destCapacity) { /* never form dest+i beyond the buffer */
  1.1387 +                    _appendUChars(dest + i, destCapacity - i, segment, (int32_t)(src - segment));
  1.1388 +                }
  1.1389 +                i += (int32_t)(src - segment);
  1.1390 +            }
  1.1391 +            ++src; /* advance past '\\' */
  1.1392 +            if (srcLimit == NULL) { srcLimit = src + uprv_strlen(src); }
  1.1393 +            c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)(srcLimit - src), (void*)src);
  1.1394 +            if (lenParsed == 0) {
  1.1395 +                goto err;
  1.1396 +            }
  1.1397 +            src += lenParsed; /* advance past escape seq. */
  1.1398 +            if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
  1.1399 +                U16_APPEND_UNSAFE(dest, i, c32);
  1.1400 +            } else {
  1.1401 +                i += U16_LENGTH(c32);
  1.1402 +            }
  1.1403 +            segment = src;
  1.1404 +        } else {
  1.1405 +            ++src;
  1.1406 +        }
  1.1407 +    }
  1.1408 +    if (src != segment) {
  1.1409 +        if (dest != NULL && i < destCapacity) {
  1.1410 +            _appendUChars(dest + i, destCapacity - i, segment, (int32_t)(src - segment));
  1.1411 +        }
  1.1412 +        i += (int32_t)(src - segment);
  1.1413 +    }
  1.1414 +    if (dest != NULL && i < destCapacity) {
  1.1415 +        dest[i] = 0;
  1.1416 +    }
  1.1417 +    return i;
  1.1418 + err:
  1.1419 +    if (dest != NULL && destCapacity > 0) {
  1.1420 +        *dest = 0;
  1.1421 +    }
  1.1422 +    return 0;
  1.1423 +}
  1.1424 +
  1.1425 +/* NUL-termination of strings ----------------------------------------------- */
  1.1426 +
  1.1427 +/**
  1.1428 + * NUL-terminate a string no matter what its type.
  1.1429 + * Set warning and error codes accordingly.
  1.1430 + */
  1.1431 +#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
  1.1432 +    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {                    \
  1.1433 +        /* not a public function, so no complete argument checking */   \
  1.1434 +                                                                        \
  1.1435 +        if(length<0) {                                                  \
  1.1436 +            /* assume that the caller handles this */                   \
  1.1437 +        } else if(length<destCapacity) {                                \
  1.1438 +            /* NUL-terminate the string, the NUL fits */                \
  1.1439 +            dest[length]=0;                                             \
  1.1440 +            /* unset the not-terminated warning but leave all others */ \
  1.1441 +            if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {          \
  1.1442 +                *pErrorCode=U_ZERO_ERROR;                               \
  1.1443 +            }                                                           \
  1.1444 +        } else if(length==destCapacity) {                               \
  1.1445 +            /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
  1.1446 +            *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;                \
  1.1447 +        } else /* length>destCapacity */ {                              \
  1.1448 +            /* even the string itself did not fit - set an error code */ \
  1.1449 +            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;                        \
  1.1450 +        }                                                               \
  1.1451 +    }
  1.1452 +/* NUL-terminate a UChar string via __TERMINATE_STRING, which also updates *pErrorCode; returns length unchanged. */
  1.1453 +U_CAPI int32_t U_EXPORT2
  1.1454 +u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1.1455 +    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1.1456 +    return length;
  1.1457 +}
  1.1458 +/* NUL-terminate a char string via __TERMINATE_STRING, which also updates *pErrorCode; returns length unchanged. */
  1.1459 +U_CAPI int32_t U_EXPORT2
  1.1460 +u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1.1461 +    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1.1462 +    return length;
  1.1463 +}
  1.1464 +/* NUL-terminate a UChar32 string via __TERMINATE_STRING, which also updates *pErrorCode; returns length unchanged. */
  1.1465 +U_CAPI int32_t U_EXPORT2
  1.1466 +u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1.1467 +    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1.1468 +    return length;
  1.1469 +}
  1.1470 +/* NUL-terminate a wchar_t string via __TERMINATE_STRING, which also updates *pErrorCode; returns length unchanged. */
  1.1471 +U_CAPI int32_t U_EXPORT2
  1.1472 +u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
  1.1473 +    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
  1.1474 +    return length;
  1.1475 +}
  1.1476 +
  1.1477 +// Compute the hash code for a string -------------------------------------- ***
  1.1478 +
  1.1479 +// Moved here from uhash.c so that UnicodeString::hashCode() does not depend
  1.1480 +// on UHashtable code.
  1.1481 +
  1.1482 +/*
  1.1483 +  Compute the hash by iterating sparsely over about 32 (up to 63)
  1.1484 +  characters spaced evenly through the string.  For each character,
  1.1485 +  multiply the previous hash value by a prime number and add the new
  1.1486 +  character in, like a linear congruential random number generator,
  1.1487 +  producing a pseudorandom deterministic value well distributed over
  1.1488 +  the output range. [LIU]
  1.1489 +  Expands to a whole function body that returns the hash (0 for a NULL STR). The hash is accumulated as uint32_t because hash*37 is meant to wrap around, which is undefined for a signed int32_t.
  1.1490 +*/
  1.1491 +#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
  1.1492 +    uint32_t hash = 0;                        \
  1.1493 +    const TYPE *p = (const TYPE*) STR;        \
  1.1494 +    if (p != NULL) {                          \
  1.1495 +        int32_t len = (int32_t)(STRLEN);      \
  1.1496 +        int32_t inc = ((len - 32) / 32) + 1;  \
  1.1497 +        const TYPE *limit = p + len;          \
  1.1498 +        while (p<limit) {                     \
  1.1499 +            hash = (hash * 37) + (uint32_t)(DEREF); \
  1.1500 +            p += inc;                         \
  1.1501 +        }                                     \
  1.1502 +    }                                         \
  1.1503 +    return (int32_t)hash
  1.1504 +
  1.1505 +/* Hash the first length UChars of str with STRING_HASH (0 for a NULL str). Used by UnicodeString to compute its hashcode - Not public API. */
  1.1506 +U_CAPI int32_t U_EXPORT2
  1.1507 +ustr_hashUCharsN(const UChar *str, int32_t length) {
  1.1508 +    STRING_HASH(UChar, str, length, *p);
  1.1509 +}
  1.1510 +/* Hash the first length chars of str with STRING_HASH, reading each char as a uint8_t (0 for a NULL str). */
  1.1511 +U_CAPI int32_t U_EXPORT2
  1.1512 +ustr_hashCharsN(const char *str, int32_t length) {
  1.1513 +    STRING_HASH(uint8_t, str, length, *p);
  1.1514 +}
  1.1515 +/* Hash the first length chars of str with STRING_HASH after passing each char through uprv_tolower(), so the hash ignores whatever case differences uprv_tolower() removes (0 for a NULL str). */
  1.1516 +U_CAPI int32_t U_EXPORT2
  1.1517 +ustr_hashICharsN(const char *str, int32_t length) {
  1.1518 +    STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
  1.1519 +}
The Tor Browser / file diff

diff: intl/icu/source/common/ustring.cpp

intl/icu/source/common/ustring.cpp