intl/icu/source/common/punycode.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/punycode.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,587 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2002-2011, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  punycode.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2002jan31
    1.17 +*   created by: Markus W. Scherer
    1.18 +*/
    1.19 +
    1.20 +
    1.21 +/* This ICU code derived from: */
    1.22 +/*
    1.23 +punycode.c 0.4.0 (2001-Nov-17-Sat)
    1.24 +http://www.cs.berkeley.edu/~amc/idn/
    1.25 +Adam M. Costello
    1.26 +http://www.nicemice.net/amc/
    1.27 +
    1.28 +Disclaimer and license
    1.29 +
    1.30 +    Regarding this entire document or any portion of it (including
    1.31 +    the pseudocode and C code), the author makes no guarantees and
    1.32 +    is not responsible for any damage resulting from its use.  The
    1.33 +    author grants irrevocable permission to anyone to use, modify,
    1.34 +    and distribute it in any way that does not diminish the rights
    1.35 +    of anyone else to use, modify, and distribute it, provided that
    1.36 +    redistributed derivative works do not contain misleading author or
    1.37 +    version information.  Derivative works need not be licensed under
    1.38 +    similar terms.
    1.39 +*/
    1.40 +/*
    1.41 + * ICU modifications:
    1.42 + * - ICU data types and coding conventions
    1.43 + * - ICU string buffer handling with implicit source lengths
    1.44 + *   and destination preflighting
    1.45 + * - UTF-16 handling
    1.46 + */
    1.47 +
    1.48 +#include "unicode/utypes.h"
    1.49 +
    1.50 +#if !UCONFIG_NO_IDNA
    1.51 +
    1.52 +#include "unicode/ustring.h"
    1.53 +#include "unicode/utf.h"
    1.54 +#include "unicode/utf16.h"
    1.55 +#include "ustr_imp.h"
    1.56 +#include "cstring.h"
    1.57 +#include "cmemory.h"
    1.58 +#include "punycode.h"
    1.59 +#include "uassert.h"
    1.60 +
    1.61 +
    1.62 +/* Punycode ----------------------------------------------------------------- */
    1.63 +
    1.64 +/* Punycode parameters for Bootstring */
    1.65 +#define BASE            36
    1.66 +#define TMIN            1
    1.67 +#define TMAX            26
    1.68 +#define SKEW            38
    1.69 +#define DAMP            700
    1.70 +#define INITIAL_BIAS    72
    1.71 +#define INITIAL_N       0x80
    1.72 +
    1.73 +/* "Basic" Unicode/ASCII code points */
    1.74 +#define _HYPHEN         0X2d
    1.75 +#define DELIMITER       _HYPHEN
    1.76 +
    1.77 +#define _ZERO_          0X30
    1.78 +#define _NINE           0x39
    1.79 +
    1.80 +#define _SMALL_A        0X61
    1.81 +#define _SMALL_Z        0X7a
    1.82 +
    1.83 +#define _CAPITAL_A      0X41
    1.84 +#define _CAPITAL_Z      0X5a
    1.85 +
    1.86 +#define IS_BASIC(c) ((c)<0x80)
    1.87 +#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
    1.88 +
    1.89 +/**
    1.90 + * digitToBasic() returns the basic code point whose value
    1.91 + * (when used for representing integers) is d, which must be in the
    1.92 + * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
    1.93 + * nonzero, in which case the uppercase form is used.
    1.94 + */
    1.95 +static inline char
    1.96 +digitToBasic(int32_t digit, UBool uppercase) {
    1.97 +    /*  0..25 map to ASCII a..z or A..Z */
    1.98 +    /* 26..35 map to ASCII 0..9         */
    1.99 +    if(digit<26) {
   1.100 +        if(uppercase) {
   1.101 +            return (char)(_CAPITAL_A+digit);
   1.102 +        } else {
   1.103 +            return (char)(_SMALL_A+digit);
   1.104 +        }
   1.105 +    } else {
   1.106 +        return (char)((_ZERO_-26)+digit);
   1.107 +    }
   1.108 +}
   1.109 +
/**
 * basicToDigit[] contains the numeric value of a basic code
 * point (for use in representing integers) in the range 0 to
 * BASE-1, or -1 if the code point does not represent a value.
 *
 * The table is indexed by an 8-bit value: '0'..'9' map to 26..35,
 * and both 'A'..'Z' and 'a'..'z' map to 0..25. Every other entry,
 * including all of 0x80..0xff, is -1.
 */
static const int8_t
basicToDigit[256]={
    /* 0x00..0x1f */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    /* 0x20..0x3f: digits '0'..'9' start at 0x30 */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

    /* 0x40..0x5f: capital letters 'A'..'Z' start at 0x41 */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    /* 0x60..0x7f: small letters 'a'..'z' start at 0x61 */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    /* 0x80..0xff: never a digit */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
   1.141 +
   1.142 +static inline char
   1.143 +asciiCaseMap(char b, UBool uppercase) {
   1.144 +    if(uppercase) {
   1.145 +        if(_SMALL_A<=b && b<=_SMALL_Z) {
   1.146 +            b-=(_SMALL_A-_CAPITAL_A);
   1.147 +        }
   1.148 +    } else {
   1.149 +        if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
   1.150 +            b+=(_SMALL_A-_CAPITAL_A);
   1.151 +        }
   1.152 +    }
   1.153 +    return b;
   1.154 +}
   1.155 +
   1.156 +/* Punycode-specific Bootstring code ---------------------------------------- */
   1.157 +
   1.158 +/*
   1.159 + * The following code omits the {parts} of the pseudo-algorithm in the spec
   1.160 + * that are not used with the Punycode parameter set.
   1.161 + */
   1.162 +
   1.163 +/* Bias adaptation function. */
   1.164 +static int32_t
   1.165 +adaptBias(int32_t delta, int32_t length, UBool firstTime) {
   1.166 +    int32_t count;
   1.167 +
   1.168 +    if(firstTime) {
   1.169 +        delta/=DAMP;
   1.170 +    } else {
   1.171 +        delta/=2;
   1.172 +    }
   1.173 +
   1.174 +    delta+=delta/length;
   1.175 +    for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
   1.176 +        delta/=(BASE-TMIN);
   1.177 +    }
   1.178 +
   1.179 +    return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
   1.180 +}
   1.181 +
/* Maximum number of input code points u_strToPunycode() accepts (size of its stack buffer). */
#define MAX_CP_COUNT    200

/**
 * Encodes a UTF-16 string as Punycode.
 *
 * Output is counted in destLength even when it no longer fits into dest,
 * so the function can be called with destCapacity==0 to preflight the
 * required length.
 *
 * @param src          UTF-16 input; must not be NULL
 * @param srcLength    number of code units in src, or -1 if src is NUL-terminated
 * @param dest         output buffer; may be NULL only if destCapacity is 0
 * @param destCapacity number of UChars available in dest
 * @param caseFlags    optional array indexed by src code unit index. For a basic
 *                     code point the flag selects the case it is written in
 *                     (via asciiCaseMap); for a non-basic code point the flag
 *                     selects the case of the last digit of its encoded delta.
 *                     May be NULL, in which case basic code points are copied as-is.
 * @param pErrorCode   in/out error code. Set to
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                     U_INDEX_OUTOFBOUNDS_ERROR if the input has more than
 *                     MAX_CP_COUNT code points,
 *                     U_INVALID_CHAR_FOUND for an unpaired surrogate,
 *                     U_INTERNAL_PROGRAM_ERROR if delta would overflow.
 * @return 0 on any of the errors above; otherwise the value returned by
 *         u_terminateUChars() for destLength (presumably the output length,
 *         with overflow reported through pErrorCode -- see ustr_imp.h)
 */
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    /* One entry per input code point: 0 for basic ones, else the code point with the case flag in bit 31. */
    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                /* basic code points are written out directly; 0 is just a placeholder */
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
                    /* reading src[j+1] is safe here: at worst it is the terminating NUL */
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                /* basic code points are written out directly; 0 is just a placeholder */
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                /* already-handled (or basic) code point: one more position to skip */
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(t>TMAX) {
                        t=TMAX;
                    }
                    */

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        break;
                    }

                    /* non-final digits are always written in lowercase */
                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                /* the final digit carries the case flag stored in the sign bit of cpBuffer[j] */
                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
   1.376 +
   1.377 +U_CFUNC int32_t
   1.378 +u_strFromPunycode(const UChar *src, int32_t srcLength,
   1.379 +                  UChar *dest, int32_t destCapacity,
   1.380 +                  UBool *caseFlags,
   1.381 +                  UErrorCode *pErrorCode) {
   1.382 +    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
   1.383 +            destCPCount, firstSupplementaryIndex, cpLength;
   1.384 +    UChar b;
   1.385 +
   1.386 +    /* argument checking */
   1.387 +    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1.388 +        return 0;
   1.389 +    }
   1.390 +
   1.391 +    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
   1.392 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1.393 +        return 0;
   1.394 +    }
   1.395 +
   1.396 +    if(srcLength==-1) {
   1.397 +        srcLength=u_strlen(src);
   1.398 +    }
   1.399 +
   1.400 +    /*
   1.401 +     * Handle the basic code points:
   1.402 +     * Let basicLength be the number of input code points
   1.403 +     * before the last delimiter, or 0 if there is none,
   1.404 +     * then copy the first basicLength code points to the output.
   1.405 +     *
   1.406 +     * The two following loops iterate backward.
   1.407 +     */
   1.408 +    for(j=srcLength; j>0;) {
   1.409 +        if(src[--j]==DELIMITER) {
   1.410 +            break;
   1.411 +        }
   1.412 +    }
   1.413 +    destLength=basicLength=destCPCount=j;
   1.414 +    U_ASSERT(destLength>=0);
   1.415 +
   1.416 +    while(j>0) {
   1.417 +        b=src[--j];
   1.418 +        if(!IS_BASIC(b)) {
   1.419 +            *pErrorCode=U_INVALID_CHAR_FOUND;
   1.420 +            return 0;
   1.421 +        }
   1.422 +
   1.423 +        if(j<destCapacity) {
   1.424 +            dest[j]=(UChar)b;
   1.425 +
   1.426 +            if(caseFlags!=NULL) {
   1.427 +                caseFlags[j]=IS_BASIC_UPPERCASE(b);
   1.428 +            }
   1.429 +        }
   1.430 +    }
   1.431 +
   1.432 +    /* Initialize the state: */
   1.433 +    n=INITIAL_N;
   1.434 +    i=0;
   1.435 +    bias=INITIAL_BIAS;
   1.436 +    firstSupplementaryIndex=1000000000;
   1.437 +
   1.438 +    /*
   1.439 +     * Main decoding loop:
   1.440 +     * Start just after the last delimiter if any
   1.441 +     * basic code points were copied; start at the beginning otherwise.
   1.442 +     */
   1.443 +    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
   1.444 +        /*
   1.445 +         * in is the index of the next character to be consumed, and
   1.446 +         * destCPCount is the number of code points in the output array.
   1.447 +         *
   1.448 +         * Decode a generalized variable-length integer into delta,
   1.449 +         * which gets added to i.  The overflow checking is easier
   1.450 +         * if we increase i as we go, then subtract off its starting
   1.451 +         * value at the end to obtain delta.
   1.452 +         */
   1.453 +        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
   1.454 +            if(in>=srcLength) {
   1.455 +                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.456 +                return 0;
   1.457 +            }
   1.458 +
   1.459 +            digit=basicToDigit[(uint8_t)src[in++]];
   1.460 +            if(digit<0) {
   1.461 +                *pErrorCode=U_INVALID_CHAR_FOUND;
   1.462 +                return 0;
   1.463 +            }
   1.464 +            if(digit>(0x7fffffff-i)/w) {
   1.465 +                /* integer overflow */
   1.466 +                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.467 +                return 0;
   1.468 +            }
   1.469 +
   1.470 +            i+=digit*w;
   1.471 +            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt  
   1.472 +            t=k-bias;
   1.473 +            if(t<TMIN) {
   1.474 +                t=TMIN;
   1.475 +            } else if(t>TMAX) {
   1.476 +                t=TMAX;
   1.477 +            }
   1.478 +            */
   1.479 +            t=k-bias;
   1.480 +            if(t<TMIN) {
   1.481 +                t=TMIN;
   1.482 +            } else if(k>=(bias+TMAX)) {
   1.483 +                t=TMAX;
   1.484 +            }
   1.485 +            if(digit<t) {
   1.486 +                break;
   1.487 +            }
   1.488 +
   1.489 +            if(w>0x7fffffff/(BASE-t)) {
   1.490 +                /* integer overflow */
   1.491 +                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.492 +                return 0;
   1.493 +            }
   1.494 +            w*=BASE-t;
   1.495 +        }
   1.496 +
   1.497 +        /*
   1.498 +         * Modification from sample code:
   1.499 +         * Increments destCPCount here,
   1.500 +         * where needed instead of in for() loop tail.
   1.501 +         */
   1.502 +        ++destCPCount;
   1.503 +        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
   1.504 +
   1.505 +        /*
   1.506 +         * i was supposed to wrap around from (incremented) destCPCount to 0,
   1.507 +         * incrementing n each time, so we'll fix that now:
   1.508 +         */
   1.509 +        if(i/destCPCount>(0x7fffffff-n)) {
   1.510 +            /* integer overflow */
   1.511 +            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.512 +            return 0;
   1.513 +        }
   1.514 +
   1.515 +        n+=i/destCPCount;
   1.516 +        i%=destCPCount;
   1.517 +        /* not needed for Punycode: */
   1.518 +        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
   1.519 +
   1.520 +        if(n>0x10ffff || U_IS_SURROGATE(n)) {
   1.521 +            /* Unicode code point overflow */
   1.522 +            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.523 +            return 0;
   1.524 +        }
   1.525 +
   1.526 +        /* Insert n at position i of the output: */
   1.527 +        cpLength=U16_LENGTH(n);
   1.528 +        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
   1.529 +            int32_t codeUnitIndex;
   1.530 +
   1.531 +            /*
   1.532 +             * Handle indexes when supplementary code points are present.
   1.533 +             *
   1.534 +             * In almost all cases, there will be only BMP code points before i
   1.535 +             * and even in the entire string.
   1.536 +             * This is handled with the same efficiency as with UTF-32.
   1.537 +             *
   1.538 +             * Only the rare cases with supplementary code points are handled
   1.539 +             * more slowly - but not too bad since this is an insertion anyway.
   1.540 +             */
   1.541 +            if(i<=firstSupplementaryIndex) {
   1.542 +                codeUnitIndex=i;
   1.543 +                if(cpLength>1) {
   1.544 +                    firstSupplementaryIndex=codeUnitIndex;
   1.545 +                } else {
   1.546 +                    ++firstSupplementaryIndex;
   1.547 +                }
   1.548 +            } else {
   1.549 +                codeUnitIndex=firstSupplementaryIndex;
   1.550 +                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
   1.551 +            }
   1.552 +
   1.553 +            /* use the UChar index codeUnitIndex instead of the code point index i */
   1.554 +            if(codeUnitIndex<destLength) {
   1.555 +                uprv_memmove(dest+codeUnitIndex+cpLength,
   1.556 +                             dest+codeUnitIndex,
   1.557 +                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
   1.558 +                if(caseFlags!=NULL) {
   1.559 +                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
   1.560 +                                 caseFlags+codeUnitIndex,
   1.561 +                                 destLength-codeUnitIndex);
   1.562 +                }
   1.563 +            }
   1.564 +            if(cpLength==1) {
   1.565 +                /* BMP, insert one code unit */
   1.566 +                dest[codeUnitIndex]=(UChar)n;
   1.567 +            } else {
   1.568 +                /* supplementary character, insert two code units */
   1.569 +                dest[codeUnitIndex]=U16_LEAD(n);
   1.570 +                dest[codeUnitIndex+1]=U16_TRAIL(n);
   1.571 +            }
   1.572 +            if(caseFlags!=NULL) {
   1.573 +                /* Case of last character determines uppercase flag: */
   1.574 +                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
   1.575 +                if(cpLength==2) {
   1.576 +                    caseFlags[codeUnitIndex+1]=FALSE;
   1.577 +                }
   1.578 +            }
   1.579 +        }
   1.580 +        destLength+=cpLength;
   1.581 +        U_ASSERT(destLength>=0);
   1.582 +        ++i;
   1.583 +    }
   1.584 +
   1.585 +    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
   1.586 +}
   1.587 +
   1.588 +/* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
   1.589 +
   1.590 +#endif /* #if !UCONFIG_NO_IDNA */

mercurial