intl/icu/source/common/uts46.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2010-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * file name: uts46.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2010mar09
michael@0 12 * created by: Markus W. Scherer
michael@0 13 */
michael@0 14
michael@0 15 #include "unicode/utypes.h"
michael@0 16
michael@0 17 #if !UCONFIG_NO_IDNA
michael@0 18
michael@0 19 #include "unicode/idna.h"
michael@0 20 #include "unicode/normalizer2.h"
michael@0 21 #include "unicode/uscript.h"
michael@0 22 #include "unicode/ustring.h"
michael@0 23 #include "unicode/utf16.h"
michael@0 24 #include "cmemory.h"
michael@0 25 #include "cstring.h"
michael@0 26 #include "punycode.h"
michael@0 27 #include "ubidi_props.h"
michael@0 28 #include "ustr_imp.h"
michael@0 29
// Number of elements in a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that the cast binds as one unit
// when the macro is used next to higher-precedence operators
// (for example "sizeof LENGTHOF(a)" or "LENGTHOF(a)[p]").
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0 31
michael@0 32 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:
michael@0 33 //
michael@0 34 // The domain name length limit is 255 octets in an internal DNS representation
michael@0 35 // where the last ("root") label is the empty label
michael@0 36 // represented by length byte 0 alone.
michael@0 37 // In a conventional string, this translates to 253 characters, or 254
michael@0 38 // if there is a trailing dot for the root label.
michael@0 39
michael@0 40 U_NAMESPACE_BEGIN
michael@0 41
michael@0 42 // Severe errors which usually result in a U+FFFD replacement character in the result string.
michael@0 43 const uint32_t severeErrors=
michael@0 44 UIDNA_ERROR_LEADING_COMBINING_MARK|
michael@0 45 UIDNA_ERROR_DISALLOWED|
michael@0 46 UIDNA_ERROR_PUNYCODE|
michael@0 47 UIDNA_ERROR_LABEL_HAS_DOT|
michael@0 48 UIDNA_ERROR_INVALID_ACE_LABEL;
michael@0 49
michael@0 50 static inline UBool
michael@0 51 isASCIIString(const UnicodeString &dest) {
michael@0 52 const UChar *s=dest.getBuffer();
michael@0 53 const UChar *limit=s+dest.length();
michael@0 54 while(s<limit) {
michael@0 55 if(*s++>0x7f) {
michael@0 56 return FALSE;
michael@0 57 }
michael@0 58 }
michael@0 59 return TRUE;
michael@0 60 }
michael@0 61
michael@0 62 static UBool
michael@0 63 isASCIIOkBiDi(const UChar *s, int32_t length);
michael@0 64
michael@0 65 static UBool
michael@0 66 isASCIIOkBiDi(const char *s, int32_t length);
michael@0 67
michael@0 68 // IDNA class default implementations -------------------------------------- ***
michael@0 69
michael@0 70 IDNA::~IDNA() {}
michael@0 71
michael@0 72 void
michael@0 73 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
michael@0 74 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 75 if(U_SUCCESS(errorCode)) {
michael@0 76 UnicodeString destString;
michael@0 77 labelToASCII(UnicodeString::fromUTF8(label), destString,
michael@0 78 info, errorCode).toUTF8(dest);
michael@0 79 }
michael@0 80 }
michael@0 81
michael@0 82 void
michael@0 83 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
michael@0 84 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 85 if(U_SUCCESS(errorCode)) {
michael@0 86 UnicodeString destString;
michael@0 87 labelToUnicode(UnicodeString::fromUTF8(label), destString,
michael@0 88 info, errorCode).toUTF8(dest);
michael@0 89 }
michael@0 90 }
michael@0 91
michael@0 92 void
michael@0 93 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
michael@0 94 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 95 if(U_SUCCESS(errorCode)) {
michael@0 96 UnicodeString destString;
michael@0 97 nameToASCII(UnicodeString::fromUTF8(name), destString,
michael@0 98 info, errorCode).toUTF8(dest);
michael@0 99 }
michael@0 100 }
michael@0 101
michael@0 102 void
michael@0 103 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
michael@0 104 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 105 if(U_SUCCESS(errorCode)) {
michael@0 106 UnicodeString destString;
michael@0 107 nameToUnicode(UnicodeString::fromUTF8(name), destString,
michael@0 108 info, errorCode).toUTF8(dest);
michael@0 109 }
michael@0 110 }
michael@0 111
michael@0 112 // UTS46 class declaration ------------------------------------------------- ***
michael@0 113
// File-local IDNA subclass; IDNA::createUTS46Instance() creates instances of it.
// It overrides both the UTF-16 and the UTF-8 label/name conversion functions
// and stores a Normalizer2 reference plus the option bits given to the constructor.
michael@0 114 class UTS46 : public IDNA {
michael@0 115 public:
// options: UIDNA_* option bits (the implementation tests UIDNA_USE_STD3_RULES and
// the UIDNA_NONTRANSITIONAL_* bits). errorCode receives normalizer lookup failures.
michael@0 116 UTS46(uint32_t options, UErrorCode &errorCode);
michael@0 117 virtual ~UTS46();
michael@0 118
// UTF-16 entry points. Each forwards to process() with fixed isLabel/toASCII flags.
michael@0 119 virtual UnicodeString &
michael@0 120 labelToASCII(const UnicodeString &label, UnicodeString &dest,
michael@0 121 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 122
michael@0 123 virtual UnicodeString &
michael@0 124 labelToUnicode(const UnicodeString &label, UnicodeString &dest,
michael@0 125 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 126
michael@0 127 virtual UnicodeString &
michael@0 128 nameToASCII(const UnicodeString &name, UnicodeString &dest,
michael@0 129 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 130
michael@0 131 virtual UnicodeString &
michael@0 132 nameToUnicode(const UnicodeString &name, UnicodeString &dest,
michael@0 133 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 134
// UTF-8 entry points. Each forwards to processUTF8() with fixed isLabel/toASCII flags.
michael@0 135 virtual void
michael@0 136 labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
michael@0 137 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 138
michael@0 139 virtual void
michael@0 140 labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
michael@0 141 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 142
michael@0 143 virtual void
michael@0 144 nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
michael@0 145 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 146
michael@0 147 virtual void
michael@0 148 nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
michael@0 149 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 150
michael@0 151 private:
// Shared UTF-16 implementation: ASCII fastpath, then processUnicode() for the rest.
michael@0 152 UnicodeString &
michael@0 153 process(const UnicodeString &src,
michael@0 154 UBool isLabel, UBool toASCII,
michael@0 155 UnicodeString &dest,
michael@0 156 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 157
// Shared UTF-8 implementation: ASCII fastpath for short input, then processUnicode().
michael@0 158 void
michael@0 159 processUTF8(const StringPiece &src,
michael@0 160 UBool isLabel, UBool toASCII,
michael@0 161 ByteSink &dest,
michael@0 162 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 163
// Normalizes src from mappingStart on into dest, then processes each label
// starting at labelStart.
michael@0 164 UnicodeString &
michael@0 165 processUnicode(const UnicodeString &src,
michael@0 166 int32_t labelStart, int32_t mappingStart,
michael@0 167 UBool isLabel, UBool toASCII,
michael@0 168 UnicodeString &dest,
michael@0 169 IDNAInfo &info, UErrorCode &errorCode) const;
michael@0 170
michael@0 171 // returns the new dest.length()
michael@0 172 int32_t
michael@0 173 mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
michael@0 174 UErrorCode &errorCode) const;
michael@0 175
michael@0 176 // returns the new label length
michael@0 177 int32_t
michael@0 178 processLabel(UnicodeString &dest,
michael@0 179 int32_t labelStart, int32_t labelLength,
michael@0 180 UBool toASCII,
michael@0 181 IDNAInfo &info, UErrorCode &errorCode) const;
// Called by processLabel() when a Punycode ("xn--") label fails to decode or validate.
michael@0 182 int32_t
michael@0 183 markBadACELabel(UnicodeString &dest,
michael@0 184 int32_t labelStart, int32_t labelLength,
michael@0 185 UBool toASCII, IDNAInfo &info) const;
michael@0 186
michael@0 187 void
michael@0 188 checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
michael@0 189
michael@0 190 UBool
michael@0 191 isLabelOkContextJ(const UChar *label, int32_t labelLength) const;
michael@0 192
michael@0 193 void
michael@0 194 checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const;
michael@0 195
// Bound in the constructor to the "uts46" Normalizer2 instance (UNORM2_COMPOSE mode).
michael@0 196 const Normalizer2 &uts46Norm2; // uts46.nrm
// Option bits passed to the constructor.
michael@0 197 uint32_t options;
michael@0 198 };
michael@0 199
michael@0 200 IDNA *
michael@0 201 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {
michael@0 202 if(U_SUCCESS(errorCode)) {
michael@0 203 IDNA *idna=new UTS46(options, errorCode);
michael@0 204 if(idna==NULL) {
michael@0 205 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 206 } else if(U_FAILURE(errorCode)) {
michael@0 207 delete idna;
michael@0 208 idna=NULL;
michael@0 209 }
michael@0 210 return idna;
michael@0 211 } else {
michael@0 212 return NULL;
michael@0 213 }
michael@0 214 }
michael@0 215
michael@0 216 // UTS46 implementation ---------------------------------------------------- ***
michael@0 217
michael@0 218 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)
michael@0 219 : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)),
michael@0 220 options(opt) {}
michael@0 221
michael@0 222 UTS46::~UTS46() {}
michael@0 223
michael@0 224 UnicodeString &
michael@0 225 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,
michael@0 226 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 227 return process(label, TRUE, TRUE, dest, info, errorCode);
michael@0 228 }
michael@0 229
michael@0 230 UnicodeString &
michael@0 231 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,
michael@0 232 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 233 return process(label, TRUE, FALSE, dest, info, errorCode);
michael@0 234 }
michael@0 235
michael@0 236 UnicodeString &
michael@0 237 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,
michael@0 238 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 239 process(name, FALSE, TRUE, dest, info, errorCode);
michael@0 240 if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 &&
michael@0 241 isASCIIString(dest) &&
michael@0 242 (dest.length()>254 || dest[253]!=0x2e)
michael@0 243 ) {
michael@0 244 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
michael@0 245 }
michael@0 246 return dest;
michael@0 247 }
michael@0 248
michael@0 249 UnicodeString &
michael@0 250 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,
michael@0 251 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 252 return process(name, FALSE, FALSE, dest, info, errorCode);
michael@0 253 }
michael@0 254
michael@0 255 void
michael@0 256 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
michael@0 257 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 258 processUTF8(label, TRUE, TRUE, dest, info, errorCode);
michael@0 259 }
michael@0 260
michael@0 261 void
michael@0 262 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
michael@0 263 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 264 processUTF8(label, TRUE, FALSE, dest, info, errorCode);
michael@0 265 }
michael@0 266
michael@0 267 void
michael@0 268 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
michael@0 269 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 270 processUTF8(name, FALSE, TRUE, dest, info, errorCode);
michael@0 271 }
michael@0 272
michael@0 273 void
michael@0 274 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
michael@0 275 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 276 processUTF8(name, FALSE, FALSE, dest, info, errorCode);
michael@0 277 }
michael@0 278
michael@0 279 // UTS #46 data for ASCII characters.
michael@0 280 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
michael@0 281 // and passes through all other ASCII characters.
michael@0 282 // If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
michael@0 283 // using this data.
michael@0 284 // The ASCII fastpath also uses this data.
michael@0 285 // Values: -1=disallowed 0==valid 1==mapped (lowercase)
// Indexed by ASCII code point (0..0x7f); laid out as 8 rows of 16 entries.
// The fastpaths in process() and processUTF8() read it as:
//   >0  uppercase letter, written to the output as c+0x20
//   <0  non-LDH character, leaves the fastpath only when UIDNA_USE_STD3_RULES is set
//   ==0 copied unchanged ('-' and '.' then get extra label checks)
michael@0 286 static const int8_t asciiData[128]={
michael@0 287 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
michael@0 288 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
michael@0 289 // 002D..002E; valid # HYPHEN-MINUS..FULL STOP
michael@0 290 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1,
michael@0 291 // 0030..0039; valid # DIGIT ZERO..DIGIT NINE
michael@0 292 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
michael@0 293 // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
michael@0 294 -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
michael@0 295 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
michael@0 296 // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
michael@0 297 -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
michael@0 298 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1
michael@0 299 };
michael@0 300
// Shared UTF-16 implementation behind labelToASCII(), labelToUnicode(),
// nameToASCII() and nameToUnicode().
//
// src      input label or domain name; must not be the same object as dest
// isLabel  TRUE when src is a single label: a '.' then ends the fastpath
//          instead of separating labels
// toASCII  TRUE enables the empty-label and length checks of the fastpath
// dest     output string; cleared first, set to bogus on argument errors
// info     reset first, then receives error bits
// errorCode  standard ICU error code; U_ILLEGAL_ARGUMENT_ERROR for bad
//          arguments, U_MEMORY_ALLOCATION_ERROR if dest cannot be sized
// Returns dest.
//
// The function copies src into dest for as long as it sees ASCII that it can
// handle itself (lowercasing A-Z), and hands everything from the first other
// character on to processUnicode().
michael@0 301 UnicodeString &
michael@0 302 UTS46::process(const UnicodeString &src,
michael@0 303 UBool isLabel, UBool toASCII,
michael@0 304 UnicodeString &dest,
michael@0 305 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 306 // uts46Norm2.normalize() would do all of this error checking and setup,
michael@0 307 // but with the ASCII fastpath we do not always call it, and do not
michael@0 308 // call it first.
michael@0 309 if(U_FAILURE(errorCode)) {
michael@0 310 dest.setToBogus();
michael@0 311 return dest;
michael@0 312 }
michael@0 313 const UChar *srcArray=src.getBuffer();
michael@0 314 if(&dest==&src || srcArray==NULL) {
michael@0 315 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 316 dest.setToBogus();
michael@0 317 return dest;
michael@0 318 }
michael@0 319 // Arguments are fine, reset output values.
michael@0 320 dest.remove();
michael@0 321 info.reset();
michael@0 322 int32_t srcLength=src.length();
michael@0 323 if(srcLength==0) {
michael@0 324 if(toASCII) {
michael@0 325 info.errors|=UIDNA_ERROR_EMPTY_LABEL;
michael@0 326 }
michael@0 327 return dest;
michael@0 328 }
// Get a writable buffer with room for at least srcLength code units;
// the fastpath writes at most one unit per input unit.
michael@0 329 UChar *destArray=dest.getBuffer(srcLength);
michael@0 330 if(destArray==NULL) {
michael@0 331 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 332 return dest;
michael@0 333 }
michael@0 334 // ASCII fastpath
michael@0 335 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
// labelStart: index of the first code unit of the label currently being scanned.
michael@0 336 int32_t labelStart=0;
michael@0 337 int32_t i;
michael@0 338 for(i=0;; ++i) {
// End of input reached inside the fastpath: finish the last label and return
// without calling processUnicode().
michael@0 339 if(i==srcLength) {
michael@0 340 if(toASCII) {
michael@0 341 if((i-labelStart)>63) {
michael@0 342 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 343 }
michael@0 344 // There is a trailing dot if labelStart==i.
michael@0 345 if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
michael@0 346 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
michael@0 347 }
michael@0 348 }
michael@0 349 info.errors|=info.labelErrors;
michael@0 350 dest.releaseBuffer(i);
michael@0 351 return dest;
michael@0 352 }
michael@0 353 UChar c=srcArray[i];
// Non-ASCII: leave the fastpath; c has not been copied to dest.
michael@0 354 if(c>0x7f) {
michael@0 355 break;
michael@0 356 }
michael@0 357 int cData=asciiData[c];
michael@0 358 if(cData>0) {
michael@0 359 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter.
michael@0 360 } else if(cData<0 && disallowNonLDHDot) {
michael@0 361 break; // Replacing with U+FFFD can be complicated for toASCII.
michael@0 362 } else {
michael@0 363 destArray[i]=c;
michael@0 364 if(c==0x2d) { // hyphen
michael@0 365 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
michael@0 366 // "??--..." is Punycode or forbidden.
michael@0 367 ++i; // '-' was copied to dest already
michael@0 368 break;
michael@0 369 }
michael@0 370 if(i==labelStart) {
michael@0 371 // label starts with "-"
michael@0 372 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
michael@0 373 }
michael@0 374 if((i+1)==srcLength || srcArray[i+1]==0x2e) {
michael@0 375 // label ends with "-"
michael@0 376 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
michael@0 377 }
michael@0 378 } else if(c==0x2e) { // dot
michael@0 379 if(isLabel) {
michael@0 380 // Replacing with U+FFFD can be complicated for toASCII.
michael@0 381 ++i; // '.' was copied to dest already
michael@0 382 break;
michael@0 383 }
michael@0 384 if(toASCII) {
michael@0 385 // Permit an empty label at the end but not elsewhere.
michael@0 386 if(i==labelStart && i<(srcLength-1)) {
michael@0 387 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
michael@0 388 } else if((i-labelStart)>63) {
michael@0 389 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 390 }
michael@0 391 }
// Label finished: fold its errors into the overall set and start the next label
// after the dot.
michael@0 392 info.errors|=info.labelErrors;
michael@0 393 info.labelErrors=0;
michael@0 394 labelStart=i+1;
michael@0 395 }
michael@0 396 }
michael@0 397 }
// The fastpath stopped early. dest keeps the first i code units written so far;
// processUnicode() continues with src from index i, for the label at labelStart.
michael@0 398 info.errors|=info.labelErrors;
michael@0 399 dest.releaseBuffer(i);
michael@0 400 processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
// Final BiDi verdict, skipped when a severe error was recorded.
// labelStart>0 means that labels before labelStart were completed by the
// fastpath above; that ASCII prefix of dest is checked here with isASCIIOkBiDi().
michael@0 401 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
michael@0 402 (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
michael@0 403 ) {
michael@0 404 info.errors|=UIDNA_ERROR_BIDI;
michael@0 405 }
michael@0 406 return dest;
michael@0 407 }
michael@0 408
// Shared UTF-8 implementation behind labelToASCII_UTF8(), labelToUnicodeUTF8(),
// nameToASCII_UTF8() and nameToUnicodeUTF8().
//
// src      UTF-8 input; a NULL data pointer is only accepted with length 0
// isLabel  TRUE when src is a single label
// toASCII  TRUE enables the empty-label and length checks
// dest     sink that receives the UTF-8 result
// info     reset first, then receives error bits
// errorCode  standard ICU error code; U_ILLEGAL_ARGUMENT_ERROR for bad arguments
//
// Input of up to 256 bytes goes through an ASCII fastpath that mirrors the one
// in process(); the remainder (or all of a longer input) is converted to
// UTF-16 and handled by processUnicode().
michael@0 409 void
michael@0 410 UTS46::processUTF8(const StringPiece &src,
michael@0 411 UBool isLabel, UBool toASCII,
michael@0 412 ByteSink &dest,
michael@0 413 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 414 if(U_FAILURE(errorCode)) {
michael@0 415 return;
michael@0 416 }
michael@0 417 const char *srcArray=src.data();
michael@0 418 int32_t srcLength=src.length();
michael@0 419 if(srcArray==NULL && srcLength!=0) {
michael@0 420 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 421 return;
michael@0 422 }
michael@0 423 // Arguments are fine, reset output values.
michael@0 424 info.reset();
michael@0 425 if(srcLength==0) {
michael@0 426 if(toASCII) {
michael@0 427 info.errors|=UIDNA_ERROR_EMPTY_LABEL;
michael@0 428 }
michael@0 429 dest.Flush();
michael@0 430 return;
michael@0 431 }
// destString collects the UTF-16 result for the part handled by processUnicode().
michael@0 432 UnicodeString destString;
// labelStart stays 0 on the long-input path below; on the fastpath it is the
// byte index of the label in which the fastpath stopped.
michael@0 433 int32_t labelStart=0;
michael@0 434 if(srcLength<=256) { // length of stackArray[]
michael@0 435 // ASCII fastpath
michael@0 436 char stackArray[256];
michael@0 437 int32_t destCapacity;
// Ask the sink for a buffer of at least srcLength bytes, with stackArray
// passed as the scratch buffer.
michael@0 438 char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
michael@0 439 stackArray, LENGTHOF(stackArray), &destCapacity);
michael@0 440 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
michael@0 441 int32_t i;
michael@0 442 for(i=0;; ++i) {
// End of input reached inside the fastpath: finish the last label, emit the
// bytes and return without calling processUnicode().
michael@0 443 if(i==srcLength) {
michael@0 444 if(toASCII) {
michael@0 445 if((i-labelStart)>63) {
michael@0 446 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 447 }
michael@0 448 // There is a trailing dot if labelStart==i.
michael@0 449 if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
michael@0 450 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
michael@0 451 }
michael@0 452 }
michael@0 453 info.errors|=info.labelErrors;
michael@0 454 dest.Append(destArray, i);
michael@0 455 dest.Flush();
michael@0 456 return;
michael@0 457 }
michael@0 458 char c=srcArray[i];
michael@0 459 if((int8_t)c<0) { // (uint8_t)c>0x7f
michael@0 460 break;
michael@0 461 }
michael@0 462 int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with a char.
michael@0 463 if(cData>0) {
michael@0 464 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter.
michael@0 465 } else if(cData<0 && disallowNonLDHDot) {
michael@0 466 break; // Replacing with U+FFFD can be complicated for toASCII.
michael@0 467 } else {
michael@0 468 destArray[i]=c;
michael@0 469 if(c==0x2d) { // hyphen
// Unlike process(), i is not incremented before this break (nor before the
// isLabel '.' break below), so the byte at i is not part of the prefix that is
// kept; it is taken from src again by processUnicode().
michael@0 470 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
michael@0 471 // "??--..." is Punycode or forbidden.
michael@0 472 break;
michael@0 473 }
michael@0 474 if(i==labelStart) {
michael@0 475 // label starts with "-"
michael@0 476 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
michael@0 477 }
michael@0 478 if((i+1)==srcLength || srcArray[i+1]==0x2e) {
michael@0 479 // label ends with "-"
michael@0 480 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
michael@0 481 }
michael@0 482 } else if(c==0x2e) { // dot
michael@0 483 if(isLabel) {
michael@0 484 break; // Replacing with U+FFFD can be complicated for toASCII.
michael@0 485 }
michael@0 486 if(toASCII) {
michael@0 487 // Permit an empty label at the end but not elsewhere.
michael@0 488 if(i==labelStart && i<(srcLength-1)) {
michael@0 489 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
michael@0 490 } else if((i-labelStart)>63) {
michael@0 491 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 492 }
michael@0 493 }
// Label finished: fold its errors into the overall set and start the next label
// after the dot.
michael@0 494 info.errors|=info.labelErrors;
michael@0 495 info.labelErrors=0;
michael@0 496 labelStart=i+1;
michael@0 497 }
michael@0 498 }
michael@0 499 }
michael@0 500 info.errors|=info.labelErrors;
michael@0 501 // Convert the processed ASCII prefix of the current label to UTF-16.
// mappingStart is relative to labelStart because processUnicode() is given
// only the part of src that begins at labelStart.
michael@0 502 int32_t mappingStart=i-labelStart;
michael@0 503 destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
michael@0 504 // Output the previous ASCII labels and process the rest of src in UTF-16.
michael@0 505 dest.Append(destArray, labelStart);
michael@0 506 processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
michael@0 507 isLabel, toASCII,
michael@0 508 destString, info, errorCode);
michael@0 509 } else {
michael@0 510 // src is too long for the ASCII fastpath implementation.
michael@0 511 processUnicode(UnicodeString::fromUTF8(src), 0, 0,
michael@0 512 isLabel, toASCII,
michael@0 513 destString, info, errorCode);
michael@0 514 }
michael@0 515 destString.toUTF8(dest); // calls dest.Flush()
// Name length check for nameToASCII_UTF8(): the total length is the labelStart
// bytes already appended plus destString.
michael@0 516 if(toASCII && !isLabel) {
michael@0 517 // length==labelStart==254 means that there is a trailing dot (ok) and
michael@0 518 // destString is empty (do not index at 253-labelStart).
michael@0 519 int32_t length=labelStart+destString.length();
michael@0 520 if( length>=254 && isASCIIString(destString) &&
michael@0 521 (length>254 ||
michael@0 522 (labelStart<254 && destString[253-labelStart]!=0x2e))
michael@0 523 ) {
michael@0 524 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
michael@0 525 }
michael@0 526 }
// Final BiDi verdict, skipped when a severe error was recorded. The ASCII
// labels before labelStart are checked on the original src bytes.
michael@0 527 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
michael@0 528 (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
michael@0 529 ) {
michael@0 530 info.errors|=UIDNA_ERROR_BIDI;
michael@0 531 }
michael@0 532 }
michael@0 533
// Normalizes src (from mappingStart on) into dest with the UTS #46 normalizer,
// then walks dest from labelStart and runs processLabel() on each label.
//
// src           full input as UTF-16
// labelStart    index in dest of the first label that still needs processing
// mappingStart  number of leading code units already present in dest; 0 means
//               that dest is replaced by the normalization of all of src
// isLabel       TRUE: a '.' is not treated as a label separator here
// toASCII       selects which NONTRANSITIONAL option bit controls the mapping
//               of deviation characters, and is passed on to processLabel()
// Returns dest. On failure errorCode is set and dest is returned as is.
michael@0 534 UnicodeString &
michael@0 535 UTS46::processUnicode(const UnicodeString &src,
michael@0 536 int32_t labelStart, int32_t mappingStart,
michael@0 537 UBool isLabel, UBool toASCII,
michael@0 538 UnicodeString &dest,
michael@0 539 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 540 if(mappingStart==0) {
michael@0 541 uts46Norm2.normalize(src, dest, errorCode);
michael@0 542 } else {
// dest already holds the first mappingStart code units (written by the caller's
// fastpath); normalize the rest of src and append it.
michael@0 543 uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
michael@0 544 }
michael@0 545 if(U_FAILURE(errorCode)) {
michael@0 546 return dest;
michael@0 547 }
// Deviation characters are mapped unless the NONTRANSITIONAL option bit for
// this direction is set.
michael@0 548 UBool doMapDevChars=
michael@0 549 toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
michael@0 550 (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
michael@0 551 const UChar *destArray=dest.getBuffer();
michael@0 552 int32_t destLength=dest.length();
// labelLimit scans forward from labelStart to find the end of the current label.
michael@0 553 int32_t labelLimit=labelStart;
michael@0 554 while(labelLimit<destLength) {
michael@0 555 UChar c=destArray[labelLimit];
michael@0 556 if(c==0x2e && !isLabel) {
// A dot ends the current label. processLabel() may change the label length,
// so the buffer pointer and the lengths are refreshed afterwards.
michael@0 557 int32_t labelLength=labelLimit-labelStart;
michael@0 558 int32_t newLength=processLabel(dest, labelStart, labelLength,
michael@0 559 toASCII, info, errorCode);
michael@0 560 info.errors|=info.labelErrors;
michael@0 561 info.labelErrors=0;
michael@0 562 if(U_FAILURE(errorCode)) {
michael@0 563 return dest;
michael@0 564 }
michael@0 565 destArray=dest.getBuffer();
michael@0 566 destLength+=newLength-labelLength;
// Continue after the processed label and its dot.
michael@0 567 labelLimit=labelStart+=newLength+1;
// The next condition matches exactly U+00DF, U+03C2, U+200C and U+200D,
// the characters that mapDevChars() handles.
michael@0 568 } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
michael@0 569 info.isTransDiff=TRUE;
michael@0 570 if(doMapDevChars) {
michael@0 571 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
michael@0 572 if(U_FAILURE(errorCode)) {
michael@0 573 return dest;
michael@0 574 }
michael@0 575 destArray=dest.getBuffer();
michael@0 576 // Do not increment labelLimit in case c was removed.
michael@0 577 // All deviation characters have been mapped, no need to check for them again.
michael@0 578 doMapDevChars=FALSE;
michael@0 579 } else {
michael@0 580 ++labelLimit;
michael@0 581 }
michael@0 582 } else {
michael@0 583 ++labelLimit;
michael@0 584 }
michael@0 585 }
michael@0 586 // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
michael@0 587 // but not an empty label elsewhere nor a completely empty domain name.
michael@0 588 // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
michael@0 589 if(0==labelStart || labelStart<labelLimit) {
michael@0 590 processLabel(dest, labelStart, labelLimit-labelStart,
michael@0 591 toASCII, info, errorCode);
michael@0 592 info.errors|=info.labelErrors;
michael@0 593 }
michael@0 594 return dest;
michael@0 595 }
michael@0 596
// Maps the deviation characters in dest, in place, from index mappingStart to
// the end of the string:
//   U+00DF (sharp s)      -> "ss"
//   U+03C2 (final sigma)  -> U+03C3
//   U+200C / U+200D       -> removed
// If anything was mapped, the text from labelStart on is normalized again with
// the UTS #46 normalizer and replaces that part of dest.
//
// Returns the new dest.length(). On allocation failure sets errorCode to
// U_MEMORY_ALLOCATION_ERROR and returns the length tracked so far.
michael@0 597 int32_t
michael@0 598 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
michael@0 599 UErrorCode &errorCode) const {
michael@0 600 int32_t length=dest.length();
// Ask for one extra unit up front when the first character to map is a sharp s,
// which grows the string by one.
michael@0 601 UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
michael@0 602 if(s==NULL) {
michael@0 603 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 604 return length;
michael@0 605 }
michael@0 606 int32_t capacity=dest.getCapacity();
michael@0 607 UBool didMapDevChars=FALSE;
// readIndex runs ahead of writeIndex once a character has been removed;
// length is kept equal to the final string length as characters are added/removed.
michael@0 608 int32_t readIndex=mappingStart, writeIndex=mappingStart;
michael@0 609 do {
michael@0 610 UChar c=s[readIndex++];
michael@0 611 switch(c) {
michael@0 612 case 0xdf:
michael@0 613 // Map sharp s to ss.
michael@0 614 didMapDevChars=TRUE;
michael@0 615 s[writeIndex++]=0x73; // Replace sharp s with first s.
michael@0 616 // Insert second s and account for possible buffer reallocation.
// writeIndex==readIndex: no earlier removal left a gap, so the unread tail has
// to move right by one unit to make room (growing the buffer first if full).
// Otherwise the second s fits into the gap left by a removed character.
michael@0 617 if(writeIndex==readIndex) {
michael@0 618 if(length==capacity) {
michael@0 619 dest.releaseBuffer(length);
michael@0 620 s=dest.getBuffer(length+1);
michael@0 621 if(s==NULL) {
michael@0 622 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 623 return length;
michael@0 624 }
michael@0 625 capacity=dest.getCapacity();
michael@0 626 }
michael@0 627 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
michael@0 628 ++readIndex;
michael@0 629 }
michael@0 630 s[writeIndex++]=0x73;
michael@0 631 ++length;
michael@0 632 break;
michael@0 633 case 0x3c2: // Map final sigma to nonfinal sigma.
michael@0 634 didMapDevChars=TRUE;
michael@0 635 s[writeIndex++]=0x3c3;
michael@0 636 break;
michael@0 637 case 0x200c: // Ignore/remove ZWNJ.
michael@0 638 case 0x200d: // Ignore/remove ZWJ.
// Nothing is written: writeIndex falls behind readIndex and the string shrinks.
michael@0 639 didMapDevChars=TRUE;
michael@0 640 --length;
michael@0 641 break;
michael@0 642 default:
michael@0 643 // Only really necessary if writeIndex was different from readIndex.
michael@0 644 s[writeIndex++]=c;
michael@0 645 break;
michael@0 646 }
michael@0 647 } while(writeIndex<length);
michael@0 648 dest.releaseBuffer(length);
michael@0 649 if(didMapDevChars) {
michael@0 650 // Mapping deviation characters might have resulted in an un-NFC string.
michael@0 651 // We could use either the NFC or the UTS #46 normalizer.
michael@0 652 // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
michael@0 653 UnicodeString normalized;
michael@0 654 uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
michael@0 655 if(U_SUCCESS(errorCode)) {
// 0x7fffffff as the length means "everything from labelStart to the end".
michael@0 656 dest.replace(labelStart, 0x7fffffff, normalized);
michael@0 657 return dest.length();
michael@0 658 }
michael@0 659 }
michael@0 660 return length;
michael@0 661 }
michael@0 662
michael@0 663 // Some non-ASCII characters are equivalent to sequences with
michael@0 664 // non-LDH ASCII characters. To find them:
michael@0 665 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
michael@0 666 static inline UBool
michael@0 667 isNonASCIIDisallowedSTD3Valid(UChar32 c) {
michael@0 668 return c==0x2260 || c==0x226E || c==0x226F;
michael@0 669 }
michael@0 670
michael@0 671 // Replace the label in dest with the label string, if the label was modified.
michael@0 672 // If &label==&dest then the label was modified in-place and labelLength
michael@0 673 // is the new label length, different from label.length().
michael@0 674 // If &label!=&dest then labelLength==label.length().
michael@0 675 // Returns labelLength (= the new label length).
michael@0 676 static int32_t
michael@0 677 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
michael@0 678 const UnicodeString &label, int32_t labelLength) {
michael@0 679 if(&label!=&dest) {
michael@0 680 dest.replace(destLabelStart, destLabelLength, label);
michael@0 681 }
michael@0 682 return labelLength;
michael@0 683 }
michael@0 684
michael@0 685 int32_t
michael@0 686 UTS46::processLabel(UnicodeString &dest,
michael@0 687 int32_t labelStart, int32_t labelLength,
michael@0 688 UBool toASCII,
michael@0 689 IDNAInfo &info, UErrorCode &errorCode) const {
michael@0 690 UnicodeString fromPunycode;
michael@0 691 UnicodeString *labelString;
michael@0 692 const UChar *label=dest.getBuffer()+labelStart;
michael@0 693 int32_t destLabelStart=labelStart;
michael@0 694 int32_t destLabelLength=labelLength;
michael@0 695 UBool wasPunycode;
michael@0 696 if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
michael@0 697 // Label starts with "xn--", try to un-Punycode it.
michael@0 698 wasPunycode=TRUE;
michael@0 699 UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit
michael@0 700 if(unicodeBuffer==NULL) {
michael@0 701 // Should never occur if we used capacity==-1 which uses the internal buffer.
michael@0 702 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 703 return labelLength;
michael@0 704 }
michael@0 705 UErrorCode punycodeErrorCode=U_ZERO_ERROR;
michael@0 706 int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
michael@0 707 unicodeBuffer, fromPunycode.getCapacity(),
michael@0 708 NULL, &punycodeErrorCode);
michael@0 709 if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
michael@0 710 fromPunycode.releaseBuffer(0);
michael@0 711 unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
michael@0 712 if(unicodeBuffer==NULL) {
michael@0 713 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 714 return labelLength;
michael@0 715 }
michael@0 716 punycodeErrorCode=U_ZERO_ERROR;
michael@0 717 unicodeLength=u_strFromPunycode(label+4, labelLength-4,
michael@0 718 unicodeBuffer, fromPunycode.getCapacity(),
michael@0 719 NULL, &punycodeErrorCode);
michael@0 720 }
michael@0 721 fromPunycode.releaseBuffer(unicodeLength);
michael@0 722 if(U_FAILURE(punycodeErrorCode)) {
michael@0 723 info.labelErrors|=UIDNA_ERROR_PUNYCODE;
michael@0 724 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
michael@0 725 }
michael@0 726 // Check for NFC, and for characters that are not
michael@0 727 // valid or deviation characters according to the normalizer.
michael@0 728 // If there is something wrong, then the string will change.
michael@0 729 // Note that the normalizer passes through non-LDH ASCII and deviation characters.
michael@0 730 // Deviation characters are ok in Punycode even in transitional processing.
michael@0 731 // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
michael@0 732 // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
michael@0 733 UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
michael@0 734 if(U_FAILURE(errorCode)) {
michael@0 735 return labelLength;
michael@0 736 }
michael@0 737 if(!isValid) {
michael@0 738 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
michael@0 739 return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
michael@0 740 }
michael@0 741 labelString=&fromPunycode;
michael@0 742 label=fromPunycode.getBuffer();
michael@0 743 labelStart=0;
michael@0 744 labelLength=fromPunycode.length();
michael@0 745 } else {
michael@0 746 wasPunycode=FALSE;
michael@0 747 labelString=&dest;
michael@0 748 }
michael@0 749 // Validity check
michael@0 750 if(labelLength==0) {
michael@0 751 if(toASCII) {
michael@0 752 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
michael@0 753 }
michael@0 754 return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
michael@0 755 }
michael@0 756 // labelLength>0
michael@0 757 if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
michael@0 758 // label starts with "??--"
michael@0 759 info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
michael@0 760 }
michael@0 761 if(label[0]==0x2d) {
michael@0 762 // label starts with "-"
michael@0 763 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
michael@0 764 }
michael@0 765 if(label[labelLength-1]==0x2d) {
michael@0 766 // label ends with "-"
michael@0 767 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
michael@0 768 }
michael@0 769 // If the label was not a Punycode label, then it was the result of
michael@0 770 // mapping, normalization and label segmentation.
michael@0 771 // If the label was in Punycode, then we mapped it again above
michael@0 772 // and checked its validity.
michael@0 773 // Now we handle the STD3 restriction to LDH characters (if set)
michael@0 774 // and we look for U+FFFD which indicates disallowed characters
michael@0 775 // in a non-Punycode label or U+FFFD itself in a Punycode label.
michael@0 776 // We also check for dots which can come from the input to a single-label function.
michael@0 777 // Ok to cast away const because we own the UnicodeString.
michael@0 778 UChar *s=(UChar *)label;
michael@0 779 const UChar *limit=label+labelLength;
michael@0 780 UChar oredChars=0;
michael@0 781 // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
michael@0 782 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
michael@0 783 do {
michael@0 784 UChar c=*s;
michael@0 785 if(c<=0x7f) {
michael@0 786 if(c==0x2e) {
michael@0 787 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
michael@0 788 *s=0xfffd;
michael@0 789 } else if(disallowNonLDHDot && asciiData[c]<0) {
michael@0 790 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
michael@0 791 *s=0xfffd;
michael@0 792 }
michael@0 793 } else {
michael@0 794 oredChars|=c;
michael@0 795 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
michael@0 796 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
michael@0 797 *s=0xfffd;
michael@0 798 } else if(c==0xfffd) {
michael@0 799 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
michael@0 800 }
michael@0 801 }
michael@0 802 ++s;
michael@0 803 } while(s<limit);
michael@0 804 // Check for a leading combining mark after other validity checks
michael@0 805 // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
michael@0 806 UChar32 c;
michael@0 807 int32_t cpLength=0;
michael@0 808 // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
michael@0 809 U16_NEXT_UNSAFE(label, cpLength, c);
michael@0 810 if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
michael@0 811 info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
michael@0 812 labelString->replace(labelStart, cpLength, (UChar)0xfffd);
michael@0 813 label=labelString->getBuffer()+labelStart;
michael@0 814 labelLength+=1-cpLength;
michael@0 815 if(labelString==&dest) {
michael@0 816 destLabelLength=labelLength;
michael@0 817 }
michael@0 818 }
michael@0 819 if((info.labelErrors&severeErrors)==0) {
michael@0 820 // Do contextual checks only if we do not have U+FFFD from a severe error
michael@0 821 // because U+FFFD can make these checks fail.
michael@0 822 if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
michael@0 823 checkLabelBiDi(label, labelLength, info);
michael@0 824 }
michael@0 825 if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
michael@0 826 !isLabelOkContextJ(label, labelLength)
michael@0 827 ) {
michael@0 828 info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
michael@0 829 }
michael@0 830 if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
michael@0 831 checkLabelContextO(label, labelLength, info);
michael@0 832 }
michael@0 833 if(toASCII) {
michael@0 834 if(wasPunycode) {
michael@0 835 // Leave a Punycode label unchanged if it has no severe errors.
michael@0 836 if(destLabelLength>63) {
michael@0 837 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 838 }
michael@0 839 return destLabelLength;
michael@0 840 } else if(oredChars>=0x80) {
michael@0 841 // Contains non-ASCII characters.
michael@0 842 UnicodeString punycode;
michael@0 843 UChar *buffer=punycode.getBuffer(63); // 63==maximum DNS label length
michael@0 844 if(buffer==NULL) {
michael@0 845 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 846 return destLabelLength;
michael@0 847 }
michael@0 848 buffer[0]=0x78; // Write "xn--".
michael@0 849 buffer[1]=0x6e;
michael@0 850 buffer[2]=0x2d;
michael@0 851 buffer[3]=0x2d;
michael@0 852 int32_t punycodeLength=u_strToPunycode(label, labelLength,
michael@0 853 buffer+4, punycode.getCapacity()-4,
michael@0 854 NULL, &errorCode);
michael@0 855 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
michael@0 856 errorCode=U_ZERO_ERROR;
michael@0 857 punycode.releaseBuffer(4);
michael@0 858 buffer=punycode.getBuffer(4+punycodeLength);
michael@0 859 if(buffer==NULL) {
michael@0 860 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 861 return destLabelLength;
michael@0 862 }
michael@0 863 punycodeLength=u_strToPunycode(label, labelLength,
michael@0 864 buffer+4, punycode.getCapacity()-4,
michael@0 865 NULL, &errorCode);
michael@0 866 }
michael@0 867 punycodeLength+=4;
michael@0 868 punycode.releaseBuffer(punycodeLength);
michael@0 869 if(U_FAILURE(errorCode)) {
michael@0 870 return destLabelLength;
michael@0 871 }
michael@0 872 if(punycodeLength>63) {
michael@0 873 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 874 }
michael@0 875 return replaceLabel(dest, destLabelStart, destLabelLength,
michael@0 876 punycode, punycodeLength);
michael@0 877 } else {
michael@0 878 // all-ASCII label
michael@0 879 if(labelLength>63) {
michael@0 880 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 881 }
michael@0 882 }
michael@0 883 }
michael@0 884 } else {
michael@0 885 // If a Punycode label has severe errors,
michael@0 886 // then leave it but make sure it does not look valid.
michael@0 887 if(wasPunycode) {
michael@0 888 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
michael@0 889 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
michael@0 890 }
michael@0 891 }
michael@0 892 return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
michael@0 893 }
michael@0 894
michael@0 895 // Make sure an ACE label does not look valid.
michael@0 896 // Append U+FFFD if the label has only LDH characters.
michael@0 897 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
michael@0 898 int32_t
michael@0 899 UTS46::markBadACELabel(UnicodeString &dest,
michael@0 900 int32_t labelStart, int32_t labelLength,
michael@0 901 UBool toASCII, IDNAInfo &info) const {
michael@0 902 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
michael@0 903 UBool isASCII=TRUE;
michael@0 904 UBool onlyLDH=TRUE;
michael@0 905 const UChar *label=dest.getBuffer()+labelStart;
michael@0 906 // Ok to cast away const because we own the UnicodeString.
michael@0 907 UChar *s=(UChar *)label+4; // After the initial "xn--".
michael@0 908 const UChar *limit=label+labelLength;
michael@0 909 do {
michael@0 910 UChar c=*s;
michael@0 911 if(c<=0x7f) {
michael@0 912 if(c==0x2e) {
michael@0 913 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
michael@0 914 *s=0xfffd;
michael@0 915 isASCII=onlyLDH=FALSE;
michael@0 916 } else if(asciiData[c]<0) {
michael@0 917 onlyLDH=FALSE;
michael@0 918 if(disallowNonLDHDot) {
michael@0 919 *s=0xfffd;
michael@0 920 isASCII=FALSE;
michael@0 921 }
michael@0 922 }
michael@0 923 } else {
michael@0 924 isASCII=onlyLDH=FALSE;
michael@0 925 }
michael@0 926 } while(++s<limit);
michael@0 927 if(onlyLDH) {
michael@0 928 dest.insert(labelStart+labelLength, (UChar)0xfffd);
michael@0 929 ++labelLength;
michael@0 930 } else {
michael@0 931 if(toASCII && isASCII && labelLength>63) {
michael@0 932 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
michael@0 933 }
michael@0 934 }
michael@0 935 return labelLength;
michael@0 936 }
michael@0 937
michael@0 938 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
michael@0 939 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
michael@0 940 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;
michael@0 941
michael@0 942 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);
michael@0 943
michael@0 944 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);
michael@0 945 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
michael@0 946 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);
michael@0 947
michael@0 948 const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
michael@0 949 U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|
michael@0 950 U_MASK(U_COMMON_NUMBER_SEPARATOR)|
michael@0 951 U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|
michael@0 952 U_MASK(U_OTHER_NEUTRAL)|
michael@0 953 U_MASK(U_BOUNDARY_NEUTRAL)|
michael@0 954 U_MASK(U_DIR_NON_SPACING_MARK);
michael@0 955 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
michael@0 956 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
michael@0 957
michael@0 958 // We scan the whole label and check both for whether it contains RTL characters
michael@0 959 // and whether it passes the BiDi Rule.
michael@0 960 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
michael@0 961 // that a domain name is a BiDi domain name (has an RTL label) only after
michael@0 962 // processing several earlier labels.
michael@0 963 void
michael@0 964 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
michael@0 965 // IDNA2008 BiDi rule
michael@0 966 // Get the directionality of the first character.
michael@0 967 UChar32 c;
michael@0 968 int32_t i=0;
michael@0 969 U16_NEXT_UNSAFE(label, i, c);
michael@0 970 uint32_t firstMask=U_MASK(u_charDirection(c));
michael@0 971 // 1. The first character must be a character with BIDI property L, R
michael@0 972 // or AL. If it has the R or AL property, it is an RTL label; if it
michael@0 973 // has the L property, it is an LTR label.
michael@0 974 if((firstMask&~L_R_AL_MASK)!=0) {
michael@0 975 info.isOkBiDi=FALSE;
michael@0 976 }
michael@0 977 // Get the directionality of the last non-NSM character.
michael@0 978 uint32_t lastMask;
michael@0 979 for(;;) {
michael@0 980 if(i>=labelLength) {
michael@0 981 lastMask=firstMask;
michael@0 982 break;
michael@0 983 }
michael@0 984 U16_PREV_UNSAFE(label, labelLength, c);
michael@0 985 UCharDirection dir=u_charDirection(c);
michael@0 986 if(dir!=U_DIR_NON_SPACING_MARK) {
michael@0 987 lastMask=U_MASK(dir);
michael@0 988 break;
michael@0 989 }
michael@0 990 }
michael@0 991 // 3. In an RTL label, the end of the label must be a character with
michael@0 992 // BIDI property R, AL, EN or AN, followed by zero or more
michael@0 993 // characters with BIDI property NSM.
michael@0 994 // 6. In an LTR label, the end of the label must be a character with
michael@0 995 // BIDI property L or EN, followed by zero or more characters with
michael@0 996 // BIDI property NSM.
michael@0 997 if( (firstMask&L_MASK)!=0 ?
michael@0 998 (lastMask&~L_EN_MASK)!=0 :
michael@0 999 (lastMask&~R_AL_EN_AN_MASK)!=0
michael@0 1000 ) {
michael@0 1001 info.isOkBiDi=FALSE;
michael@0 1002 }
michael@0 1003 // Get the directionalities of the intervening characters.
michael@0 1004 uint32_t mask=0;
michael@0 1005 while(i<labelLength) {
michael@0 1006 U16_NEXT_UNSAFE(label, i, c);
michael@0 1007 mask|=U_MASK(u_charDirection(c));
michael@0 1008 }
michael@0 1009 if(firstMask&L_MASK) {
michael@0 1010 // 5. In an LTR label, only characters with the BIDI properties L, EN,
michael@0 1011 // ES, CS, ET, ON, BN and NSM are allowed.
michael@0 1012 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
michael@0 1013 info.isOkBiDi=FALSE;
michael@0 1014 }
michael@0 1015 } else {
michael@0 1016 // 2. In an RTL label, only characters with the BIDI properties R, AL,
michael@0 1017 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
michael@0 1018 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
michael@0 1019 info.isOkBiDi=FALSE;
michael@0 1020 }
michael@0 1021 // 4. In an RTL label, if an EN is present, no AN may be present, and
michael@0 1022 // vice versa.
michael@0 1023 if((mask&EN_AN_MASK)==EN_AN_MASK) {
michael@0 1024 info.isOkBiDi=FALSE;
michael@0 1025 }
michael@0 1026 }
michael@0 1027 // An RTL label is a label that contains at least one character of type
michael@0 1028 // R, AL or AN. [...]
michael@0 1029 // A "BIDI domain name" is a domain name that contains at least one RTL
michael@0 1030 // label. [...]
michael@0 1031 // The following rule, consisting of six conditions, applies to labels
michael@0 1032 // in BIDI domain names.
michael@0 1033 if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
michael@0 1034 info.isBiDi=TRUE;
michael@0 1035 }
michael@0 1036 }
michael@0 1037
michael@0 1038 // Special code for the ASCII prefix of a BiDi domain name.
michael@0 1039 // The ASCII prefix is all-LTR.
michael@0 1040
michael@0 1041 // IDNA2008 BiDi rule, parts relevant to ASCII labels:
michael@0 1042 // 1. The first character must be a character with BIDI property L [...]
michael@0 1043 // 5. In an LTR label, only characters with the BIDI properties L, EN,
michael@0 1044 // ES, CS, ET, ON, BN and NSM are allowed.
michael@0 1045 // 6. In an LTR label, the end of the label must be a character with
michael@0 1046 // BIDI property L or EN [...]
michael@0 1047
michael@0 1048 // UTF-16 version, called for mapped ASCII prefix.
michael@0 1049 // Cannot contain uppercase A-Z.
michael@0 1050 // s[length-1] must be the trailing dot.
michael@0 1051 static UBool
michael@0 1052 isASCIIOkBiDi(const UChar *s, int32_t length) {
michael@0 1053 int32_t labelStart=0;
michael@0 1054 for(int32_t i=0; i<length; ++i) {
michael@0 1055 UChar c=s[i];
michael@0 1056 if(c==0x2e) { // dot
michael@0 1057 if(i>labelStart) {
michael@0 1058 c=s[i-1];
michael@0 1059 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {
michael@0 1060 // Last character in the label is not an L or EN.
michael@0 1061 return FALSE;
michael@0 1062 }
michael@0 1063 }
michael@0 1064 labelStart=i+1;
michael@0 1065 } else if(i==labelStart) {
michael@0 1066 if(!(0x61<=c && c<=0x7a)) {
michael@0 1067 // First character in the label is not an L.
michael@0 1068 return FALSE;
michael@0 1069 }
michael@0 1070 } else {
michael@0 1071 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
michael@0 1072 // Intermediate character in the label is a B, S or WS.
michael@0 1073 return FALSE;
michael@0 1074 }
michael@0 1075 }
michael@0 1076 }
michael@0 1077 return TRUE;
michael@0 1078 }
michael@0 1079
michael@0 1080 // UTF-8 version, called for source ASCII prefix.
michael@0 1081 // Can contain uppercase A-Z.
michael@0 1082 // s[length-1] must be the trailing dot.
michael@0 1083 static UBool
michael@0 1084 isASCIIOkBiDi(const char *s, int32_t length) {
michael@0 1085 int32_t labelStart=0;
michael@0 1086 for(int32_t i=0; i<length; ++i) {
michael@0 1087 char c=s[i];
michael@0 1088 if(c==0x2e) { // dot
michael@0 1089 if(i>labelStart) {
michael@0 1090 c=s[i-1];
michael@0 1091 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {
michael@0 1092 // Last character in the label is not an L or EN.
michael@0 1093 return FALSE;
michael@0 1094 }
michael@0 1095 }
michael@0 1096 labelStart=i+1;
michael@0 1097 } else if(i==labelStart) {
michael@0 1098 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {
michael@0 1099 // First character in the label is not an L.
michael@0 1100 return FALSE;
michael@0 1101 }
michael@0 1102 } else {
michael@0 1103 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
michael@0 1104 // Intermediate character in the label is a B, S or WS.
michael@0 1105 return FALSE;
michael@0 1106 }
michael@0 1107 }
michael@0 1108 }
michael@0 1109 return TRUE;
michael@0 1110 }
michael@0 1111
michael@0 1112 UBool
michael@0 1113 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {
michael@0 1114 const UBiDiProps *bdp=ubidi_getSingleton();
michael@0 1115 // [IDNA2008-Tables]
michael@0 1116 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
michael@0 1117 for(int32_t i=0; i<labelLength; ++i) {
michael@0 1118 if(label[i]==0x200c) {
michael@0 1119 // Appendix A.1. ZERO WIDTH NON-JOINER
michael@0 1120 // Rule Set:
michael@0 1121 // False;
michael@0 1122 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
michael@0 1123 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
michael@0 1124 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
michael@0 1125 if(i==0) {
michael@0 1126 return FALSE;
michael@0 1127 }
michael@0 1128 UChar32 c;
michael@0 1129 int32_t j=i;
michael@0 1130 U16_PREV_UNSAFE(label, j, c);
michael@0 1131 if(uts46Norm2.getCombiningClass(c)==9) {
michael@0 1132 continue;
michael@0 1133 }
michael@0 1134 // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
michael@0 1135 for(;;) {
michael@0 1136 UJoiningType type=ubidi_getJoiningType(bdp, c);
michael@0 1137 if(type==U_JT_TRANSPARENT) {
michael@0 1138 if(j==0) {
michael@0 1139 return FALSE;
michael@0 1140 }
michael@0 1141 U16_PREV_UNSAFE(label, j, c);
michael@0 1142 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) {
michael@0 1143 break; // precontext fulfilled
michael@0 1144 } else {
michael@0 1145 return FALSE;
michael@0 1146 }
michael@0 1147 }
michael@0 1148 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
michael@0 1149 for(j=i+1;;) {
michael@0 1150 if(j==labelLength) {
michael@0 1151 return FALSE;
michael@0 1152 }
michael@0 1153 U16_NEXT_UNSAFE(label, j, c);
michael@0 1154 UJoiningType type=ubidi_getJoiningType(bdp, c);
michael@0 1155 if(type==U_JT_TRANSPARENT) {
michael@0 1156 // just skip this character
michael@0 1157 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) {
michael@0 1158 break; // postcontext fulfilled
michael@0 1159 } else {
michael@0 1160 return FALSE;
michael@0 1161 }
michael@0 1162 }
michael@0 1163 } else if(label[i]==0x200d) {
michael@0 1164 // Appendix A.2. ZERO WIDTH JOINER (U+200D)
michael@0 1165 // Rule Set:
michael@0 1166 // False;
michael@0 1167 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
michael@0 1168 if(i==0) {
michael@0 1169 return FALSE;
michael@0 1170 }
michael@0 1171 UChar32 c;
michael@0 1172 int32_t j=i;
michael@0 1173 U16_PREV_UNSAFE(label, j, c);
michael@0 1174 if(uts46Norm2.getCombiningClass(c)!=9) {
michael@0 1175 return FALSE;
michael@0 1176 }
michael@0 1177 }
michael@0 1178 }
michael@0 1179 return TRUE;
michael@0 1180 }
michael@0 1181
michael@0 1182 void
michael@0 1183 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
michael@0 1184 int32_t labelEnd=labelLength-1; // inclusive
michael@0 1185 int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx
michael@0 1186 for(int32_t i=0; i<=labelEnd; ++i) {
michael@0 1187 UChar32 c=label[i];
michael@0 1188 if(c<0xb7) {
michael@0 1189 // ASCII fastpath
michael@0 1190 } else if(c<=0x6f9) {
michael@0 1191 if(c==0xb7) {
michael@0 1192 // Appendix A.3. MIDDLE DOT (U+00B7)
michael@0 1193 // Rule Set:
michael@0 1194 // False;
michael@0 1195 // If Before(cp) .eq. U+006C And
michael@0 1196 // After(cp) .eq. U+006C Then True;
michael@0 1197 if(!(0<i && label[i-1]==0x6c &&
michael@0 1198 i<labelEnd && label[i+1]==0x6c)) {
michael@0 1199 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
michael@0 1200 }
michael@0 1201 } else if(c==0x375) {
michael@0 1202 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
michael@0 1203 // Rule Set:
michael@0 1204 // False;
michael@0 1205 // If Script(After(cp)) .eq. Greek Then True;
michael@0 1206 UScriptCode script=USCRIPT_INVALID_CODE;
michael@0 1207 if(i<labelEnd) {
michael@0 1208 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 1209 int32_t j=i+1;
michael@0 1210 U16_NEXT(label, j, labelLength, c);
michael@0 1211 script=uscript_getScript(c, &errorCode);
michael@0 1212 }
michael@0 1213 if(script!=USCRIPT_GREEK) {
michael@0 1214 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
michael@0 1215 }
michael@0 1216 } else if(c==0x5f3 || c==0x5f4) {
michael@0 1217 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
michael@0 1218 // Rule Set:
michael@0 1219 // False;
michael@0 1220 // If Script(Before(cp)) .eq. Hebrew Then True;
michael@0 1221 //
michael@0 1222 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
michael@0 1223 // Rule Set:
michael@0 1224 // False;
michael@0 1225 // If Script(Before(cp)) .eq. Hebrew Then True;
michael@0 1226 UScriptCode script=USCRIPT_INVALID_CODE;
michael@0 1227 if(0<i) {
michael@0 1228 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 1229 int32_t j=i;
michael@0 1230 U16_PREV(label, 0, j, c);
michael@0 1231 script=uscript_getScript(c, &errorCode);
michael@0 1232 }
michael@0 1233 if(script!=USCRIPT_HEBREW) {
michael@0 1234 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
michael@0 1235 }
michael@0 1236 } else if(0x660<=c /* && c<=0x6f9 */) {
michael@0 1237 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
michael@0 1238 // Rule Set:
michael@0 1239 // True;
michael@0 1240 // For All Characters:
michael@0 1241 // If cp .in. 06F0..06F9 Then False;
michael@0 1242 // End For;
michael@0 1243 //
michael@0 1244 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
michael@0 1245 // Rule Set:
michael@0 1246 // True;
michael@0 1247 // For All Characters:
michael@0 1248 // If cp .in. 0660..0669 Then False;
michael@0 1249 // End For;
michael@0 1250 if(c<=0x669) {
michael@0 1251 if(arabicDigits>0) {
michael@0 1252 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
michael@0 1253 }
michael@0 1254 arabicDigits=-1;
michael@0 1255 } else if(0x6f0<=c) {
michael@0 1256 if(arabicDigits<0) {
michael@0 1257 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;
michael@0 1258 }
michael@0 1259 arabicDigits=1;
michael@0 1260 }
michael@0 1261 }
michael@0 1262 } else if(c==0x30fb) {
michael@0 1263 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
michael@0 1264 // Rule Set:
michael@0 1265 // False;
michael@0 1266 // For All Characters:
michael@0 1267 // If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
michael@0 1268 // End For;
michael@0 1269 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 1270 for(int j=0;;) {
michael@0 1271 if(j>labelEnd) {
michael@0 1272 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;
michael@0 1273 break;
michael@0 1274 }
michael@0 1275 U16_NEXT(label, j, labelLength, c);
michael@0 1276 UScriptCode script=uscript_getScript(c, &errorCode);
michael@0 1277 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {
michael@0 1278 break;
michael@0 1279 }
michael@0 1280 }
michael@0 1281 }
michael@0 1282 }
michael@0 1283 }
michael@0 1284
michael@0 1285 U_NAMESPACE_END
michael@0 1286
michael@0 1287 // C API ------------------------------------------------------------------- ***
michael@0 1288
michael@0 1289 U_NAMESPACE_USE
michael@0 1290
michael@0 1291 U_CAPI UIDNA * U_EXPORT2
michael@0 1292 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {
michael@0 1293 return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode));
michael@0 1294 }
michael@0 1295
michael@0 1296 U_CAPI void U_EXPORT2
michael@0 1297 uidna_close(UIDNA *idna) {
michael@0 1298 delete reinterpret_cast<IDNA *>(idna);
michael@0 1299 }
michael@0 1300
michael@0 1301 static UBool
michael@0 1302 checkArgs(const void *label, int32_t length,
michael@0 1303 void *dest, int32_t capacity,
michael@0 1304 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1305 if(U_FAILURE(*pErrorCode)) {
michael@0 1306 return FALSE;
michael@0 1307 }
michael@0 1308 // sizeof(UIDNAInfo)=16 in the first API version.
michael@0 1309 if(pInfo==NULL || pInfo->size<16) {
michael@0 1310 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1311 return FALSE;
michael@0 1312 }
michael@0 1313 if( (label==NULL ? length!=0 : length<-1) ||
michael@0 1314 (dest==NULL ? capacity!=0 : capacity<0) ||
michael@0 1315 (dest==label && label!=NULL)
michael@0 1316 ) {
michael@0 1317 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1318 return FALSE;
michael@0 1319 }
michael@0 1320 // Set all *pInfo bytes to 0 except for the size field itself.
michael@0 1321 uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));
michael@0 1322 return TRUE;
michael@0 1323 }
michael@0 1324
michael@0 1325 static void
michael@0 1326 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {
michael@0 1327 pInfo->isTransitionalDifferent=info.isTransitionalDifferent();
michael@0 1328 pInfo->errors=info.getErrors();
michael@0 1329 }
michael@0 1330
michael@0 1331 U_CAPI int32_t U_EXPORT2
michael@0 1332 uidna_labelToASCII(const UIDNA *idna,
michael@0 1333 const UChar *label, int32_t length,
michael@0 1334 UChar *dest, int32_t capacity,
michael@0 1335 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1336 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1337 return 0;
michael@0 1338 }
michael@0 1339 UnicodeString src((UBool)(length<0), label, length);
michael@0 1340 UnicodeString destString(dest, 0, capacity);
michael@0 1341 IDNAInfo info;
michael@0 1342 reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode);
michael@0 1343 idnaInfoToStruct(info, pInfo);
michael@0 1344 return destString.extract(dest, capacity, *pErrorCode);
michael@0 1345 }
michael@0 1346
michael@0 1347 U_CAPI int32_t U_EXPORT2
michael@0 1348 uidna_labelToUnicode(const UIDNA *idna,
michael@0 1349 const UChar *label, int32_t length,
michael@0 1350 UChar *dest, int32_t capacity,
michael@0 1351 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1352 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1353 return 0;
michael@0 1354 }
michael@0 1355 UnicodeString src((UBool)(length<0), label, length);
michael@0 1356 UnicodeString destString(dest, 0, capacity);
michael@0 1357 IDNAInfo info;
michael@0 1358 reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode);
michael@0 1359 idnaInfoToStruct(info, pInfo);
michael@0 1360 return destString.extract(dest, capacity, *pErrorCode);
michael@0 1361 }
michael@0 1362
michael@0 1363 U_CAPI int32_t U_EXPORT2
michael@0 1364 uidna_nameToASCII(const UIDNA *idna,
michael@0 1365 const UChar *name, int32_t length,
michael@0 1366 UChar *dest, int32_t capacity,
michael@0 1367 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1368 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1369 return 0;
michael@0 1370 }
michael@0 1371 UnicodeString src((UBool)(length<0), name, length);
michael@0 1372 UnicodeString destString(dest, 0, capacity);
michael@0 1373 IDNAInfo info;
michael@0 1374 reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode);
michael@0 1375 idnaInfoToStruct(info, pInfo);
michael@0 1376 return destString.extract(dest, capacity, *pErrorCode);
michael@0 1377 }
michael@0 1378
michael@0 1379 U_CAPI int32_t U_EXPORT2
michael@0 1380 uidna_nameToUnicode(const UIDNA *idna,
michael@0 1381 const UChar *name, int32_t length,
michael@0 1382 UChar *dest, int32_t capacity,
michael@0 1383 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1384 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1385 return 0;
michael@0 1386 }
michael@0 1387 UnicodeString src((UBool)(length<0), name, length);
michael@0 1388 UnicodeString destString(dest, 0, capacity);
michael@0 1389 IDNAInfo info;
michael@0 1390 reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode);
michael@0 1391 idnaInfoToStruct(info, pInfo);
michael@0 1392 return destString.extract(dest, capacity, *pErrorCode);
michael@0 1393 }
michael@0 1394
michael@0 1395 U_CAPI int32_t U_EXPORT2
michael@0 1396 uidna_labelToASCII_UTF8(const UIDNA *idna,
michael@0 1397 const char *label, int32_t length,
michael@0 1398 char *dest, int32_t capacity,
michael@0 1399 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1400 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1401 return 0;
michael@0 1402 }
michael@0 1403 StringPiece src(label, length<0 ? uprv_strlen(label) : length);
michael@0 1404 CheckedArrayByteSink sink(dest, capacity);
michael@0 1405 IDNAInfo info;
michael@0 1406 reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode);
michael@0 1407 idnaInfoToStruct(info, pInfo);
michael@0 1408 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
michael@0 1409 }
michael@0 1410
michael@0 1411 U_CAPI int32_t U_EXPORT2
michael@0 1412 uidna_labelToUnicodeUTF8(const UIDNA *idna,
michael@0 1413 const char *label, int32_t length,
michael@0 1414 char *dest, int32_t capacity,
michael@0 1415 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1416 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1417 return 0;
michael@0 1418 }
michael@0 1419 StringPiece src(label, length<0 ? uprv_strlen(label) : length);
michael@0 1420 CheckedArrayByteSink sink(dest, capacity);
michael@0 1421 IDNAInfo info;
michael@0 1422 reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode);
michael@0 1423 idnaInfoToStruct(info, pInfo);
michael@0 1424 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
michael@0 1425 }
michael@0 1426
michael@0 1427 U_CAPI int32_t U_EXPORT2
michael@0 1428 uidna_nameToASCII_UTF8(const UIDNA *idna,
michael@0 1429 const char *name, int32_t length,
michael@0 1430 char *dest, int32_t capacity,
michael@0 1431 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1432 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1433 return 0;
michael@0 1434 }
michael@0 1435 StringPiece src(name, length<0 ? uprv_strlen(name) : length);
michael@0 1436 CheckedArrayByteSink sink(dest, capacity);
michael@0 1437 IDNAInfo info;
michael@0 1438 reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode);
michael@0 1439 idnaInfoToStruct(info, pInfo);
michael@0 1440 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
michael@0 1441 }
michael@0 1442
michael@0 1443 U_CAPI int32_t U_EXPORT2
michael@0 1444 uidna_nameToUnicodeUTF8(const UIDNA *idna,
michael@0 1445 const char *name, int32_t length,
michael@0 1446 char *dest, int32_t capacity,
michael@0 1447 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {
michael@0 1448 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {
michael@0 1449 return 0;
michael@0 1450 }
michael@0 1451 StringPiece src(name, length<0 ? uprv_strlen(name) : length);
michael@0 1452 CheckedArrayByteSink sink(dest, capacity);
michael@0 1453 IDNAInfo info;
michael@0 1454 reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode);
michael@0 1455 idnaInfoToStruct(info, pInfo);
michael@0 1456 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);
michael@0 1457 }
michael@0 1458
michael@0 1459 #endif // UCONFIG_NO_IDNA

mercurial