intl/icu/source/common/punycode.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2002-2011, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: punycode.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2002jan31
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17
michael@0 18 /* This ICU code derived from: */
michael@0 19 /*
michael@0 20 punycode.c 0.4.0 (2001-Nov-17-Sat)
michael@0 21 http://www.cs.berkeley.edu/~amc/idn/
michael@0 22 Adam M. Costello
michael@0 23 http://www.nicemice.net/amc/
michael@0 24
michael@0 25 Disclaimer and license
michael@0 26
michael@0 27 Regarding this entire document or any portion of it (including
michael@0 28 the pseudocode and C code), the author makes no guarantees and
michael@0 29 is not responsible for any damage resulting from its use. The
michael@0 30 author grants irrevocable permission to anyone to use, modify,
michael@0 31 and distribute it in any way that does not diminish the rights
michael@0 32 of anyone else to use, modify, and distribute it, provided that
michael@0 33 redistributed derivative works do not contain misleading author or
michael@0 34 version information. Derivative works need not be licensed under
michael@0 35 similar terms.
michael@0 36 */
michael@0 37 /*
michael@0 38 * ICU modifications:
michael@0 39 * - ICU data types and coding conventions
michael@0 40 * - ICU string buffer handling with implicit source lengths
michael@0 41 * and destination preflighting
michael@0 42 * - UTF-16 handling
michael@0 43 */
michael@0 44
michael@0 45 #include "unicode/utypes.h"
michael@0 46
michael@0 47 #if !UCONFIG_NO_IDNA
michael@0 48
michael@0 49 #include "unicode/ustring.h"
michael@0 50 #include "unicode/utf.h"
michael@0 51 #include "unicode/utf16.h"
michael@0 52 #include "ustr_imp.h"
michael@0 53 #include "cstring.h"
michael@0 54 #include "cmemory.h"
michael@0 55 #include "punycode.h"
michael@0 56 #include "uassert.h"
michael@0 57
michael@0 58
/* Punycode ----------------------------------------------------------------- */

/* Bootstring parameter values used by Punycode */
#define BASE            36
#define TMIN            1
#define TMAX            26
#define SKEW            38
#define DAMP            700
#define INITIAL_BIAS    72
#define INITIAL_N       0x80

/* Numeric values of the "basic" Unicode/ASCII code points used below */
#define _HYPHEN         0x2d
#define DELIMITER       _HYPHEN

#define _ZERO_          0x30
#define _NINE           0x39

#define _SMALL_A        0x61
#define _SMALL_Z        0x7a

#define _CAPITAL_A      0x41
#define _CAPITAL_Z      0x5a

/* Nonzero if c is below 0x80. */
#define IS_BASIC(c) ((c)<0x80)
/* Nonzero if c is in the range _CAPITAL_A.._CAPITAL_Z; evaluates c twice. */
#define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
michael@0 85
michael@0 86 /**
michael@0 87 * digitToBasic() returns the basic code point whose value
michael@0 88 * (when used for representing integers) is d, which must be in the
michael@0 89 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
michael@0 90 * nonzero, in which case the uppercase form is used.
michael@0 91 */
michael@0 92 static inline char
michael@0 93 digitToBasic(int32_t digit, UBool uppercase) {
michael@0 94 /* 0..25 map to ASCII a..z or A..Z */
michael@0 95 /* 26..35 map to ASCII 0..9 */
michael@0 96 if(digit<26) {
michael@0 97 if(uppercase) {
michael@0 98 return (char)(_CAPITAL_A+digit);
michael@0 99 } else {
michael@0 100 return (char)(_SMALL_A+digit);
michael@0 101 }
michael@0 102 } else {
michael@0 103 return (char)((_ZERO_-26)+digit);
michael@0 104 }
michael@0 105 }
michael@0 106
/**
 * basicToDigit[] maps a byte value b to the numeric value (for use in
 * representing integers) of the basic code point b, in the range 0 to
 * BASE-1, or to -1 if b does not represent a digit value.
 * Uppercase and lowercase letters map to the same values 0..25;
 * the decimal digits map to 26..35.
 */
static const int8_t
basicToDigit[256]={
    /* 0x00..0x0f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x10..0x1f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    /* 0x20..0x2f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x30..0x3f */ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

    /* 0x40..0x4f */ -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x50..0x5f */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    /* 0x60..0x6f */ -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x70..0x7f */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    /* 0x80..0x8f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x90..0x9f */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    /* 0xa0..0xaf */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xb0..0xbf */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    /* 0xc0..0xcf */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xd0..0xdf */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    /* 0xe0..0xef */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xf0..0xff */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
michael@0 138
michael@0 139 static inline char
michael@0 140 asciiCaseMap(char b, UBool uppercase) {
michael@0 141 if(uppercase) {
michael@0 142 if(_SMALL_A<=b && b<=_SMALL_Z) {
michael@0 143 b-=(_SMALL_A-_CAPITAL_A);
michael@0 144 }
michael@0 145 } else {
michael@0 146 if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
michael@0 147 b+=(_SMALL_A-_CAPITAL_A);
michael@0 148 }
michael@0 149 }
michael@0 150 return b;
michael@0 151 }
michael@0 152
michael@0 153 /* Punycode-specific Bootstring code ---------------------------------------- */
michael@0 154
michael@0 155 /*
michael@0 156 * The following code omits the {parts} of the pseudo-algorithm in the spec
michael@0 157 * that are not used with the Punycode parameter set.
michael@0 158 */
michael@0 159
michael@0 160 /* Bias adaptation function. */
michael@0 161 static int32_t
michael@0 162 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
michael@0 163 int32_t count;
michael@0 164
michael@0 165 if(firstTime) {
michael@0 166 delta/=DAMP;
michael@0 167 } else {
michael@0 168 delta/=2;
michael@0 169 }
michael@0 170
michael@0 171 delta+=delta/length;
michael@0 172 for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
michael@0 173 delta/=(BASE-TMIN);
michael@0 174 }
michael@0 175
michael@0 176 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
michael@0 177 }
michael@0 178
michael@0 179 #define MAX_CP_COUNT 200
michael@0 180
michael@0 181 U_CFUNC int32_t
michael@0 182 u_strToPunycode(const UChar *src, int32_t srcLength,
michael@0 183 UChar *dest, int32_t destCapacity,
michael@0 184 const UBool *caseFlags,
michael@0 185 UErrorCode *pErrorCode) {
michael@0 186
michael@0 187 int32_t cpBuffer[MAX_CP_COUNT];
michael@0 188 int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
michael@0 189 UChar c, c2;
michael@0 190
michael@0 191 /* argument checking */
michael@0 192 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 193 return 0;
michael@0 194 }
michael@0 195
michael@0 196 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
michael@0 197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 198 return 0;
michael@0 199 }
michael@0 200
michael@0 201 /*
michael@0 202 * Handle the basic code points and
michael@0 203 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
michael@0 204 */
michael@0 205 srcCPCount=destLength=0;
michael@0 206 if(srcLength==-1) {
michael@0 207 /* NUL-terminated input */
michael@0 208 for(j=0; /* no condition */; ++j) {
michael@0 209 if((c=src[j])==0) {
michael@0 210 break;
michael@0 211 }
michael@0 212 if(srcCPCount==MAX_CP_COUNT) {
michael@0 213 /* too many input code points */
michael@0 214 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 215 return 0;
michael@0 216 }
michael@0 217 if(IS_BASIC(c)) {
michael@0 218 cpBuffer[srcCPCount++]=0;
michael@0 219 if(destLength<destCapacity) {
michael@0 220 dest[destLength]=
michael@0 221 caseFlags!=NULL ?
michael@0 222 asciiCaseMap((char)c, caseFlags[j]) :
michael@0 223 (char)c;
michael@0 224 }
michael@0 225 ++destLength;
michael@0 226 } else {
michael@0 227 n=(caseFlags!=NULL && caseFlags[j])<<31L;
michael@0 228 if(U16_IS_SINGLE(c)) {
michael@0 229 n|=c;
michael@0 230 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
michael@0 231 ++j;
michael@0 232 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
michael@0 233 } else {
michael@0 234 /* error: unmatched surrogate */
michael@0 235 *pErrorCode=U_INVALID_CHAR_FOUND;
michael@0 236 return 0;
michael@0 237 }
michael@0 238 cpBuffer[srcCPCount++]=n;
michael@0 239 }
michael@0 240 }
michael@0 241 } else {
michael@0 242 /* length-specified input */
michael@0 243 for(j=0; j<srcLength; ++j) {
michael@0 244 if(srcCPCount==MAX_CP_COUNT) {
michael@0 245 /* too many input code points */
michael@0 246 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 247 return 0;
michael@0 248 }
michael@0 249 c=src[j];
michael@0 250 if(IS_BASIC(c)) {
michael@0 251 cpBuffer[srcCPCount++]=0;
michael@0 252 if(destLength<destCapacity) {
michael@0 253 dest[destLength]=
michael@0 254 caseFlags!=NULL ?
michael@0 255 asciiCaseMap((char)c, caseFlags[j]) :
michael@0 256 (char)c;
michael@0 257 }
michael@0 258 ++destLength;
michael@0 259 } else {
michael@0 260 n=(caseFlags!=NULL && caseFlags[j])<<31L;
michael@0 261 if(U16_IS_SINGLE(c)) {
michael@0 262 n|=c;
michael@0 263 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
michael@0 264 ++j;
michael@0 265 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
michael@0 266 } else {
michael@0 267 /* error: unmatched surrogate */
michael@0 268 *pErrorCode=U_INVALID_CHAR_FOUND;
michael@0 269 return 0;
michael@0 270 }
michael@0 271 cpBuffer[srcCPCount++]=n;
michael@0 272 }
michael@0 273 }
michael@0 274 }
michael@0 275
michael@0 276 /* Finish the basic string - if it is not empty - with a delimiter. */
michael@0 277 basicLength=destLength;
michael@0 278 if(basicLength>0) {
michael@0 279 if(destLength<destCapacity) {
michael@0 280 dest[destLength]=DELIMITER;
michael@0 281 }
michael@0 282 ++destLength;
michael@0 283 }
michael@0 284
michael@0 285 /*
michael@0 286 * handledCPCount is the number of code points that have been handled
michael@0 287 * basicLength is the number of basic code points
michael@0 288 * destLength is the number of chars that have been output
michael@0 289 */
michael@0 290
michael@0 291 /* Initialize the state: */
michael@0 292 n=INITIAL_N;
michael@0 293 delta=0;
michael@0 294 bias=INITIAL_BIAS;
michael@0 295
michael@0 296 /* Main encoding loop: */
michael@0 297 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
michael@0 298 /*
michael@0 299 * All non-basic code points < n have been handled already.
michael@0 300 * Find the next larger one:
michael@0 301 */
michael@0 302 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
michael@0 303 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
michael@0 304 if(n<=q && q<m) {
michael@0 305 m=q;
michael@0 306 }
michael@0 307 }
michael@0 308
michael@0 309 /*
michael@0 310 * Increase delta enough to advance the decoder's
michael@0 311 * <n,i> state to <m,0>, but guard against overflow:
michael@0 312 */
michael@0 313 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
michael@0 314 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
michael@0 315 return 0;
michael@0 316 }
michael@0 317 delta+=(m-n)*(handledCPCount+1);
michael@0 318 n=m;
michael@0 319
michael@0 320 /* Encode a sequence of same code points n */
michael@0 321 for(j=0; j<srcCPCount; ++j) {
michael@0 322 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
michael@0 323 if(q<n) {
michael@0 324 ++delta;
michael@0 325 } else if(q==n) {
michael@0 326 /* Represent delta as a generalized variable-length integer: */
michael@0 327 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
michael@0 328
michael@0 329 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
michael@0 330
michael@0 331 t=k-bias;
michael@0 332 if(t<TMIN) {
michael@0 333 t=TMIN;
michael@0 334 } else if(t>TMAX) {
michael@0 335 t=TMAX;
michael@0 336 }
michael@0 337 */
michael@0 338
michael@0 339 t=k-bias;
michael@0 340 if(t<TMIN) {
michael@0 341 t=TMIN;
michael@0 342 } else if(k>=(bias+TMAX)) {
michael@0 343 t=TMAX;
michael@0 344 }
michael@0 345
michael@0 346 if(q<t) {
michael@0 347 break;
michael@0 348 }
michael@0 349
michael@0 350 if(destLength<destCapacity) {
michael@0 351 dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
michael@0 352 }
michael@0 353 ++destLength;
michael@0 354 q=(q-t)/(BASE-t);
michael@0 355 }
michael@0 356
michael@0 357 if(destLength<destCapacity) {
michael@0 358 dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
michael@0 359 }
michael@0 360 ++destLength;
michael@0 361 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
michael@0 362 delta=0;
michael@0 363 ++handledCPCount;
michael@0 364 }
michael@0 365 }
michael@0 366
michael@0 367 ++delta;
michael@0 368 ++n;
michael@0 369 }
michael@0 370
michael@0 371 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
michael@0 372 }
michael@0 373
michael@0 374 U_CFUNC int32_t
michael@0 375 u_strFromPunycode(const UChar *src, int32_t srcLength,
michael@0 376 UChar *dest, int32_t destCapacity,
michael@0 377 UBool *caseFlags,
michael@0 378 UErrorCode *pErrorCode) {
michael@0 379 int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
michael@0 380 destCPCount, firstSupplementaryIndex, cpLength;
michael@0 381 UChar b;
michael@0 382
michael@0 383 /* argument checking */
michael@0 384 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
michael@0 385 return 0;
michael@0 386 }
michael@0 387
michael@0 388 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
michael@0 389 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 390 return 0;
michael@0 391 }
michael@0 392
michael@0 393 if(srcLength==-1) {
michael@0 394 srcLength=u_strlen(src);
michael@0 395 }
michael@0 396
michael@0 397 /*
michael@0 398 * Handle the basic code points:
michael@0 399 * Let basicLength be the number of input code points
michael@0 400 * before the last delimiter, or 0 if there is none,
michael@0 401 * then copy the first basicLength code points to the output.
michael@0 402 *
michael@0 403 * The two following loops iterate backward.
michael@0 404 */
michael@0 405 for(j=srcLength; j>0;) {
michael@0 406 if(src[--j]==DELIMITER) {
michael@0 407 break;
michael@0 408 }
michael@0 409 }
michael@0 410 destLength=basicLength=destCPCount=j;
michael@0 411 U_ASSERT(destLength>=0);
michael@0 412
michael@0 413 while(j>0) {
michael@0 414 b=src[--j];
michael@0 415 if(!IS_BASIC(b)) {
michael@0 416 *pErrorCode=U_INVALID_CHAR_FOUND;
michael@0 417 return 0;
michael@0 418 }
michael@0 419
michael@0 420 if(j<destCapacity) {
michael@0 421 dest[j]=(UChar)b;
michael@0 422
michael@0 423 if(caseFlags!=NULL) {
michael@0 424 caseFlags[j]=IS_BASIC_UPPERCASE(b);
michael@0 425 }
michael@0 426 }
michael@0 427 }
michael@0 428
michael@0 429 /* Initialize the state: */
michael@0 430 n=INITIAL_N;
michael@0 431 i=0;
michael@0 432 bias=INITIAL_BIAS;
michael@0 433 firstSupplementaryIndex=1000000000;
michael@0 434
michael@0 435 /*
michael@0 436 * Main decoding loop:
michael@0 437 * Start just after the last delimiter if any
michael@0 438 * basic code points were copied; start at the beginning otherwise.
michael@0 439 */
michael@0 440 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
michael@0 441 /*
michael@0 442 * in is the index of the next character to be consumed, and
michael@0 443 * destCPCount is the number of code points in the output array.
michael@0 444 *
michael@0 445 * Decode a generalized variable-length integer into delta,
michael@0 446 * which gets added to i. The overflow checking is easier
michael@0 447 * if we increase i as we go, then subtract off its starting
michael@0 448 * value at the end to obtain delta.
michael@0 449 */
michael@0 450 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
michael@0 451 if(in>=srcLength) {
michael@0 452 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 453 return 0;
michael@0 454 }
michael@0 455
michael@0 456 digit=basicToDigit[(uint8_t)src[in++]];
michael@0 457 if(digit<0) {
michael@0 458 *pErrorCode=U_INVALID_CHAR_FOUND;
michael@0 459 return 0;
michael@0 460 }
michael@0 461 if(digit>(0x7fffffff-i)/w) {
michael@0 462 /* integer overflow */
michael@0 463 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 464 return 0;
michael@0 465 }
michael@0 466
michael@0 467 i+=digit*w;
michael@0 468 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
michael@0 469 t=k-bias;
michael@0 470 if(t<TMIN) {
michael@0 471 t=TMIN;
michael@0 472 } else if(t>TMAX) {
michael@0 473 t=TMAX;
michael@0 474 }
michael@0 475 */
michael@0 476 t=k-bias;
michael@0 477 if(t<TMIN) {
michael@0 478 t=TMIN;
michael@0 479 } else if(k>=(bias+TMAX)) {
michael@0 480 t=TMAX;
michael@0 481 }
michael@0 482 if(digit<t) {
michael@0 483 break;
michael@0 484 }
michael@0 485
michael@0 486 if(w>0x7fffffff/(BASE-t)) {
michael@0 487 /* integer overflow */
michael@0 488 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 489 return 0;
michael@0 490 }
michael@0 491 w*=BASE-t;
michael@0 492 }
michael@0 493
michael@0 494 /*
michael@0 495 * Modification from sample code:
michael@0 496 * Increments destCPCount here,
michael@0 497 * where needed instead of in for() loop tail.
michael@0 498 */
michael@0 499 ++destCPCount;
michael@0 500 bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
michael@0 501
michael@0 502 /*
michael@0 503 * i was supposed to wrap around from (incremented) destCPCount to 0,
michael@0 504 * incrementing n each time, so we'll fix that now:
michael@0 505 */
michael@0 506 if(i/destCPCount>(0x7fffffff-n)) {
michael@0 507 /* integer overflow */
michael@0 508 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 509 return 0;
michael@0 510 }
michael@0 511
michael@0 512 n+=i/destCPCount;
michael@0 513 i%=destCPCount;
michael@0 514 /* not needed for Punycode: */
michael@0 515 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
michael@0 516
michael@0 517 if(n>0x10ffff || U_IS_SURROGATE(n)) {
michael@0 518 /* Unicode code point overflow */
michael@0 519 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 520 return 0;
michael@0 521 }
michael@0 522
michael@0 523 /* Insert n at position i of the output: */
michael@0 524 cpLength=U16_LENGTH(n);
michael@0 525 if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
michael@0 526 int32_t codeUnitIndex;
michael@0 527
michael@0 528 /*
michael@0 529 * Handle indexes when supplementary code points are present.
michael@0 530 *
michael@0 531 * In almost all cases, there will be only BMP code points before i
michael@0 532 * and even in the entire string.
michael@0 533 * This is handled with the same efficiency as with UTF-32.
michael@0 534 *
michael@0 535 * Only the rare cases with supplementary code points are handled
michael@0 536 * more slowly - but not too bad since this is an insertion anyway.
michael@0 537 */
michael@0 538 if(i<=firstSupplementaryIndex) {
michael@0 539 codeUnitIndex=i;
michael@0 540 if(cpLength>1) {
michael@0 541 firstSupplementaryIndex=codeUnitIndex;
michael@0 542 } else {
michael@0 543 ++firstSupplementaryIndex;
michael@0 544 }
michael@0 545 } else {
michael@0 546 codeUnitIndex=firstSupplementaryIndex;
michael@0 547 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
michael@0 548 }
michael@0 549
michael@0 550 /* use the UChar index codeUnitIndex instead of the code point index i */
michael@0 551 if(codeUnitIndex<destLength) {
michael@0 552 uprv_memmove(dest+codeUnitIndex+cpLength,
michael@0 553 dest+codeUnitIndex,
michael@0 554 (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
michael@0 555 if(caseFlags!=NULL) {
michael@0 556 uprv_memmove(caseFlags+codeUnitIndex+cpLength,
michael@0 557 caseFlags+codeUnitIndex,
michael@0 558 destLength-codeUnitIndex);
michael@0 559 }
michael@0 560 }
michael@0 561 if(cpLength==1) {
michael@0 562 /* BMP, insert one code unit */
michael@0 563 dest[codeUnitIndex]=(UChar)n;
michael@0 564 } else {
michael@0 565 /* supplementary character, insert two code units */
michael@0 566 dest[codeUnitIndex]=U16_LEAD(n);
michael@0 567 dest[codeUnitIndex+1]=U16_TRAIL(n);
michael@0 568 }
michael@0 569 if(caseFlags!=NULL) {
michael@0 570 /* Case of last character determines uppercase flag: */
michael@0 571 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
michael@0 572 if(cpLength==2) {
michael@0 573 caseFlags[codeUnitIndex+1]=FALSE;
michael@0 574 }
michael@0 575 }
michael@0 576 }
michael@0 577 destLength+=cpLength;
michael@0 578 U_ASSERT(destLength>=0);
michael@0 579 ++i;
michael@0 580 }
michael@0 581
michael@0 582 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
michael@0 583 }
michael@0 584
michael@0 585 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
michael@0 586
michael@0 587 #endif /* #if !UCONFIG_NO_IDNA */

mercurial