Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * Copyright (C) 2010-2012, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ******************************************************************************* |
michael@0 | 6 | * file name: uts46.cpp |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2010mar09 |
michael@0 | 12 | * created by: Markus W. Scherer |
michael@0 | 13 | */ |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/utypes.h" |
michael@0 | 16 | |
michael@0 | 17 | #if !UCONFIG_NO_IDNA |
michael@0 | 18 | |
michael@0 | 19 | #include "unicode/idna.h" |
michael@0 | 20 | #include "unicode/normalizer2.h" |
michael@0 | 21 | #include "unicode/uscript.h" |
michael@0 | 22 | #include "unicode/ustring.h" |
michael@0 | 23 | #include "unicode/utf16.h" |
michael@0 | 24 | #include "cmemory.h" |
michael@0 | 25 | #include "cstring.h" |
michael@0 | 26 | #include "punycode.h" |
michael@0 | 27 | #include "ubidi_props.h" |
michael@0 | 28 | #include "ustr_imp.h" |
michael@0 | 29 | |
// Element count of a true array, cast to int32_t.
// The argument must be an array, not a pointer: sizeof(pointer) would yield a wrong count.
michael@0 | 30 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
michael@0 | 31 | |
michael@0 | 32 | // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG: |
michael@0 | 33 | // |
michael@0 | 34 | // The domain name length limit is 255 octets in an internal DNS representation |
michael@0 | 35 | // where the last ("root") label is the empty label |
michael@0 | 36 | // represented by length byte 0 alone. |
michael@0 | 37 | // In a conventional string, this translates to 253 characters, or 254 |
michael@0 | 38 | // if there is a trailing dot for the root label. |
michael@0 | 39 | |
michael@0 | 40 | U_NAMESPACE_BEGIN |
michael@0 | 41 | |
michael@0 | 42 | // Severe errors which usually result in a U+FFFD replacement character in the result string. |
michael@0 | 43 | const uint32_t severeErrors= |
michael@0 | 44 | UIDNA_ERROR_LEADING_COMBINING_MARK| |
michael@0 | 45 | UIDNA_ERROR_DISALLOWED| |
michael@0 | 46 | UIDNA_ERROR_PUNYCODE| |
michael@0 | 47 | UIDNA_ERROR_LABEL_HAS_DOT| |
michael@0 | 48 | UIDNA_ERROR_INVALID_ACE_LABEL; |
michael@0 | 49 | |
michael@0 | 50 | static inline UBool |
michael@0 | 51 | isASCIIString(const UnicodeString &dest) { |
michael@0 | 52 | const UChar *s=dest.getBuffer(); |
michael@0 | 53 | const UChar *limit=s+dest.length(); |
michael@0 | 54 | while(s<limit) { |
michael@0 | 55 | if(*s++>0x7f) { |
michael@0 | 56 | return FALSE; |
michael@0 | 57 | } |
michael@0 | 58 | } |
michael@0 | 59 | return TRUE; |
michael@0 | 60 | } |
michael@0 | 61 | |
// Forward declarations of the two isASCIIOkBiDi() overloads (UTF-16 and char input).
// process() and processUTF8() call them on the leading labelStart code units,
// i.e. the labels that were handled by the ASCII fastpath.
// The definitions are not part of this excerpt.
michael@0 | 62 | static UBool |
michael@0 | 63 | isASCIIOkBiDi(const UChar *s, int32_t length); |
michael@0 | 64 | |
michael@0 | 65 | static UBool |
michael@0 | 66 | isASCIIOkBiDi(const char *s, int32_t length); |
michael@0 | 67 | |
michael@0 | 68 | // IDNA class default implementations -------------------------------------- *** |
michael@0 | 69 | |
michael@0 | 70 | IDNA::~IDNA() {} |
michael@0 | 71 | |
michael@0 | 72 | void |
michael@0 | 73 | IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, |
michael@0 | 74 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 75 | if(U_SUCCESS(errorCode)) { |
michael@0 | 76 | UnicodeString destString; |
michael@0 | 77 | labelToASCII(UnicodeString::fromUTF8(label), destString, |
michael@0 | 78 | info, errorCode).toUTF8(dest); |
michael@0 | 79 | } |
michael@0 | 80 | } |
michael@0 | 81 | |
michael@0 | 82 | void |
michael@0 | 83 | IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, |
michael@0 | 84 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 85 | if(U_SUCCESS(errorCode)) { |
michael@0 | 86 | UnicodeString destString; |
michael@0 | 87 | labelToUnicode(UnicodeString::fromUTF8(label), destString, |
michael@0 | 88 | info, errorCode).toUTF8(dest); |
michael@0 | 89 | } |
michael@0 | 90 | } |
michael@0 | 91 | |
michael@0 | 92 | void |
michael@0 | 93 | IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, |
michael@0 | 94 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 95 | if(U_SUCCESS(errorCode)) { |
michael@0 | 96 | UnicodeString destString; |
michael@0 | 97 | nameToASCII(UnicodeString::fromUTF8(name), destString, |
michael@0 | 98 | info, errorCode).toUTF8(dest); |
michael@0 | 99 | } |
michael@0 | 100 | } |
michael@0 | 101 | |
michael@0 | 102 | void |
michael@0 | 103 | IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, |
michael@0 | 104 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 105 | if(U_SUCCESS(errorCode)) { |
michael@0 | 106 | UnicodeString destString; |
michael@0 | 107 | nameToUnicode(UnicodeString::fromUTF8(name), destString, |
michael@0 | 108 | info, errorCode).toUTF8(dest); |
michael@0 | 109 | } |
michael@0 | 110 | } |
michael@0 | 111 | |
michael@0 | 112 | // UTS46 class declaration ------------------------------------------------- *** |
michael@0 | 113 | |
// File-local IDNA implementation; instances are created by IDNA::createUTS46Instance().
// The four UTF-16 entry points forward to process(), the four UTF-8 entry points
// forward to processUTF8(); both hand non-fastpath input to processUnicode().
michael@0 | 114 | class UTS46 : public IDNA { |
michael@0 | 115 | public: |
// options is stored as-is and consulted for the UIDNA_* option bits.
// errorCode receives any failure from obtaining the "uts46" Normalizer2 instance.
michael@0 | 116 | UTS46(uint32_t options, UErrorCode &errorCode); |
michael@0 | 117 | virtual ~UTS46(); |
michael@0 | 118 | |
// UTF-16 API: each returns dest.
michael@0 | 119 | virtual UnicodeString & |
michael@0 | 120 | labelToASCII(const UnicodeString &label, UnicodeString &dest, |
michael@0 | 121 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 122 | |
michael@0 | 123 | virtual UnicodeString & |
michael@0 | 124 | labelToUnicode(const UnicodeString &label, UnicodeString &dest, |
michael@0 | 125 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 126 | |
michael@0 | 127 | virtual UnicodeString & |
michael@0 | 128 | nameToASCII(const UnicodeString &name, UnicodeString &dest, |
michael@0 | 129 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 130 | |
michael@0 | 131 | virtual UnicodeString & |
michael@0 | 132 | nameToUnicode(const UnicodeString &name, UnicodeString &dest, |
michael@0 | 133 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 134 | |
// UTF-8 API: results are written to the ByteSink.
michael@0 | 135 | virtual void |
michael@0 | 136 | labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, |
michael@0 | 137 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 138 | |
michael@0 | 139 | virtual void |
michael@0 | 140 | labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, |
michael@0 | 141 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 142 | |
michael@0 | 143 | virtual void |
michael@0 | 144 | nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, |
michael@0 | 145 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 146 | |
michael@0 | 147 | virtual void |
michael@0 | 148 | nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, |
michael@0 | 149 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 150 | |
michael@0 | 151 | private: |
// Shared UTF-16 worker: ASCII fastpath, then processUnicode() for the rest.
michael@0 | 152 | UnicodeString & |
michael@0 | 153 | process(const UnicodeString &src, |
michael@0 | 154 | UBool isLabel, UBool toASCII, |
michael@0 | 155 | UnicodeString &dest, |
michael@0 | 156 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 157 | |
// Shared UTF-8 worker: ASCII fastpath for short input, otherwise converts to UTF-16
// and calls processUnicode().
michael@0 | 158 | void |
michael@0 | 159 | processUTF8(const StringPiece &src, |
michael@0 | 160 | UBool isLabel, UBool toASCII, |
michael@0 | 161 | ByteSink &dest, |
michael@0 | 162 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 163 | |
// Normalizes src from mappingStart into dest, then processes each label of dest
// beginning at labelStart.
michael@0 | 164 | UnicodeString & |
michael@0 | 165 | processUnicode(const UnicodeString &src, |
michael@0 | 166 | int32_t labelStart, int32_t mappingStart, |
michael@0 | 167 | UBool isLabel, UBool toASCII, |
michael@0 | 168 | UnicodeString &dest, |
michael@0 | 169 | IDNAInfo &info, UErrorCode &errorCode) const; |
michael@0 | 170 | |
michael@0 | 171 | // returns the new dest.length() |
michael@0 | 172 | int32_t |
michael@0 | 173 | mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, |
michael@0 | 174 | UErrorCode &errorCode) const; |
michael@0 | 175 | |
michael@0 | 176 | // returns the new label length |
michael@0 | 177 | int32_t |
michael@0 | 178 | processLabel(UnicodeString &dest, |
michael@0 | 179 | int32_t labelStart, int32_t labelLength, |
michael@0 | 180 | UBool toASCII, |
michael@0 | 181 | IDNAInfo &info, UErrorCode &errorCode) const; |
// NOTE(review): the definitions of the following helpers are outside this excerpt;
// their names suggest per-label validation steps -- confirm against the definitions.
michael@0 | 182 | int32_t |
michael@0 | 183 | markBadACELabel(UnicodeString &dest, |
michael@0 | 184 | int32_t labelStart, int32_t labelLength, |
michael@0 | 185 | UBool toASCII, IDNAInfo &info) const; |
michael@0 | 186 | |
michael@0 | 187 | void |
michael@0 | 188 | checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const; |
michael@0 | 189 | |
michael@0 | 190 | UBool |
michael@0 | 191 | isLabelOkContextJ(const UChar *label, int32_t labelLength) const; |
michael@0 | 192 | |
michael@0 | 193 | void |
michael@0 | 194 | checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const; |
michael@0 | 195 | |
// Normalizer used for the UTS #46 mapping; bound in the constructor and never reseated.
michael@0 | 196 | const Normalizer2 &uts46Norm2; // uts46.nrm |
// Option bit set passed to the constructor (UIDNA_USE_STD3_RULES,
// UIDNA_NONTRANSITIONAL_TO_ASCII, UIDNA_NONTRANSITIONAL_TO_UNICODE are tested in this file).
michael@0 | 197 | uint32_t options; |
michael@0 | 198 | }; |
michael@0 | 199 | |
michael@0 | 200 | IDNA * |
michael@0 | 201 | IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) { |
michael@0 | 202 | if(U_SUCCESS(errorCode)) { |
michael@0 | 203 | IDNA *idna=new UTS46(options, errorCode); |
michael@0 | 204 | if(idna==NULL) { |
michael@0 | 205 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 206 | } else if(U_FAILURE(errorCode)) { |
michael@0 | 207 | delete idna; |
michael@0 | 208 | idna=NULL; |
michael@0 | 209 | } |
michael@0 | 210 | return idna; |
michael@0 | 211 | } else { |
michael@0 | 212 | return NULL; |
michael@0 | 213 | } |
michael@0 | 214 | } |
michael@0 | 215 | |
michael@0 | 216 | // UTS46 implementation ---------------------------------------------------- *** |
michael@0 | 217 | |
michael@0 | 218 | UTS46::UTS46(uint32_t opt, UErrorCode &errorCode) |
michael@0 | 219 | : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)), |
michael@0 | 220 | options(opt) {} |
michael@0 | 221 | |
michael@0 | 222 | UTS46::~UTS46() {} |
michael@0 | 223 | |
michael@0 | 224 | UnicodeString & |
michael@0 | 225 | UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest, |
michael@0 | 226 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 227 | return process(label, TRUE, TRUE, dest, info, errorCode); |
michael@0 | 228 | } |
michael@0 | 229 | |
michael@0 | 230 | UnicodeString & |
michael@0 | 231 | UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest, |
michael@0 | 232 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 233 | return process(label, TRUE, FALSE, dest, info, errorCode); |
michael@0 | 234 | } |
michael@0 | 235 | |
michael@0 | 236 | UnicodeString & |
michael@0 | 237 | UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest, |
michael@0 | 238 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 239 | process(name, FALSE, TRUE, dest, info, errorCode); |
michael@0 | 240 | if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 && |
michael@0 | 241 | isASCIIString(dest) && |
michael@0 | 242 | (dest.length()>254 || dest[253]!=0x2e) |
michael@0 | 243 | ) { |
michael@0 | 244 | info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
michael@0 | 245 | } |
michael@0 | 246 | return dest; |
michael@0 | 247 | } |
michael@0 | 248 | |
michael@0 | 249 | UnicodeString & |
michael@0 | 250 | UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest, |
michael@0 | 251 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 252 | return process(name, FALSE, FALSE, dest, info, errorCode); |
michael@0 | 253 | } |
michael@0 | 254 | |
michael@0 | 255 | void |
michael@0 | 256 | UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, |
michael@0 | 257 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 258 | processUTF8(label, TRUE, TRUE, dest, info, errorCode); |
michael@0 | 259 | } |
michael@0 | 260 | |
michael@0 | 261 | void |
michael@0 | 262 | UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, |
michael@0 | 263 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 264 | processUTF8(label, TRUE, FALSE, dest, info, errorCode); |
michael@0 | 265 | } |
michael@0 | 266 | |
michael@0 | 267 | void |
michael@0 | 268 | UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, |
michael@0 | 269 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 270 | processUTF8(name, FALSE, TRUE, dest, info, errorCode); |
michael@0 | 271 | } |
michael@0 | 272 | |
michael@0 | 273 | void |
michael@0 | 274 | UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, |
michael@0 | 275 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 276 | processUTF8(name, FALSE, FALSE, dest, info, errorCode); |
michael@0 | 277 | } |
michael@0 | 278 | |
// UTS #46 status of each ASCII code point, indexed by the code point (0..0x7f).
//   -1 = disallowed
//    0 = valid
//    1 = mapped (uppercase letter, to be lowercased)
// The normalizer (using uts46.nrm) itself lowercases uppercase ASCII letters
// and passes all other ASCII characters through. This table is what disallows
// non-LDH characters when UIDNA_USE_STD3_RULES is set, and it also drives
// the ASCII fastpath.
static const int8_t asciiData[128]={
    // 0000..001F: disallowed
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    // 0020..002F: only 002D HYPHEN-MINUS and 002E FULL STOP are valid
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..003F: 0030..0039 DIGIT ZERO..DIGIT NINE are valid
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0, -1, -1, -1, -1, -1, -1,
    // 0040..005F: 0041..005A LATIN CAPITAL LETTER A..Z are mapped
    -1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1, -1, -1, -1, -1, -1,
    // 0060..007F: 0061..007A LATIN SMALL LETTER A..Z are valid
    -1,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0, -1, -1, -1, -1, -1
};
michael@0 | 300 | |
// Shared UTF-16 implementation behind labelToASCII/labelToUnicode/nameToASCII/nameToUnicode.
//
// src      input label or domain name; must be a different object than dest and must
//          have a readable buffer, otherwise errorCode becomes U_ILLEGAL_ARGUMENT_ERROR.
// isLabel  TRUE if src is a single label; a dot then ends the fastpath instead of
//          separating labels.
// toASCII  TRUE enables the empty-label and length checks performed in this function.
// dest     receives the result; set to bogus on entry failure or illegal arguments.
// info     reset here, then accumulates error bits (info.errors / info.labelErrors).
// Returns dest.
//
// Structure: an ASCII fastpath copies src into dest one code unit at a time
// (lowercasing A-Z via asciiData). If the whole input is consumed that way the
// function returns directly; otherwise the loop breaks at index i and the rest
// is handled by processUnicode().
michael@0 | 301 | UnicodeString & |
michael@0 | 302 | UTS46::process(const UnicodeString &src, |
michael@0 | 303 | UBool isLabel, UBool toASCII, |
michael@0 | 304 | UnicodeString &dest, |
michael@0 | 305 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 306 | // uts46Norm2.normalize() would do all of this error checking and setup, |
michael@0 | 307 | // but with the ASCII fastpath we do not always call it, and do not |
michael@0 | 308 | // call it first. |
michael@0 | 309 | if(U_FAILURE(errorCode)) { |
michael@0 | 310 | dest.setToBogus(); |
michael@0 | 311 | return dest; |
michael@0 | 312 | } |
michael@0 | 313 | const UChar *srcArray=src.getBuffer(); |
michael@0 | 314 | if(&dest==&src || srcArray==NULL) { |
michael@0 | 315 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 316 | dest.setToBogus(); |
michael@0 | 317 | return dest; |
michael@0 | 318 | } |
michael@0 | 319 | // Arguments are fine, reset output values. |
michael@0 | 320 | dest.remove(); |
michael@0 | 321 | info.reset(); |
michael@0 | 322 | int32_t srcLength=src.length(); |
michael@0 | 323 | if(srcLength==0) { |
michael@0 | 324 | if(toASCII) { |
michael@0 | 325 | info.errors|=UIDNA_ERROR_EMPTY_LABEL; |
michael@0 | 326 | } |
michael@0 | 327 | return dest; |
michael@0 | 328 | } |
// The fastpath writes at most one code unit per source code unit, so srcLength
// is enough capacity for it.
michael@0 | 329 | UChar *destArray=dest.getBuffer(srcLength); |
michael@0 | 330 | if(destArray==NULL) { |
michael@0 | 331 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 332 | return dest; |
michael@0 | 333 | } |
michael@0 | 334 | // ASCII fastpath |
michael@0 | 335 | UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
// labelStart: index of the first code unit of the label currently being scanned.
michael@0 | 336 | int32_t labelStart=0; |
michael@0 | 337 | int32_t i; |
// The loop has no condition: it ends either with the return at i==srcLength
// (everything was handled here) or with a break (hand-off at index i).
michael@0 | 338 | for(i=0;; ++i) { |
michael@0 | 339 | if(i==srcLength) { |
michael@0 | 340 | if(toASCII) { |
michael@0 | 341 | if((i-labelStart)>63) { |
michael@0 | 342 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 343 | } |
michael@0 | 344 | // There is a trailing dot if labelStart==i. |
michael@0 | 345 | if(!isLabel && i>=254 && (i>254 || labelStart<i)) { |
michael@0 | 346 | info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
michael@0 | 347 | } |
michael@0 | 348 | } |
michael@0 | 349 | info.errors|=info.labelErrors; |
michael@0 | 350 | dest.releaseBuffer(i); |
michael@0 | 351 | return dest; |
michael@0 | 352 | } |
michael@0 | 353 | UChar c=srcArray[i]; |
michael@0 | 354 | if(c>0x7f) { |
michael@0 | 355 | break; |
michael@0 | 356 | } |
michael@0 | 357 | int cData=asciiData[c]; |
michael@0 | 358 | if(cData>0) { |
michael@0 | 359 | destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter. |
michael@0 | 360 | } else if(cData<0 && disallowNonLDHDot) { |
michael@0 | 361 | break; // Replacing with U+FFFD can be complicated for toASCII. |
michael@0 | 362 | } else { |
michael@0 | 363 | destArray[i]=c; |
michael@0 | 364 | if(c==0x2d) { // hyphen |
michael@0 | 365 | if(i==(labelStart+3) && srcArray[i-1]==0x2d) { |
michael@0 | 366 | // "??--..." is Punycode or forbidden. |
michael@0 | 367 | ++i; // '-' was copied to dest already |
michael@0 | 368 | break; |
michael@0 | 369 | } |
michael@0 | 370 | if(i==labelStart) { |
michael@0 | 371 | // label starts with "-" |
michael@0 | 372 | info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; |
michael@0 | 373 | } |
michael@0 | 374 | if((i+1)==srcLength || srcArray[i+1]==0x2e) { |
michael@0 | 375 | // label ends with "-" |
michael@0 | 376 | info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; |
michael@0 | 377 | } |
michael@0 | 378 | } else if(c==0x2e) { // dot |
michael@0 | 379 | if(isLabel) { |
michael@0 | 380 | // Replacing with U+FFFD can be complicated for toASCII. |
michael@0 | 381 | ++i; // '.' was copied to dest already |
michael@0 | 382 | break; |
michael@0 | 383 | } |
michael@0 | 384 | if(toASCII) { |
michael@0 | 385 | // Permit an empty label at the end but not elsewhere. |
michael@0 | 386 | if(i==labelStart && i<(srcLength-1)) { |
michael@0 | 387 | info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; |
michael@0 | 388 | } else if((i-labelStart)>63) { |
michael@0 | 389 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 390 | } |
michael@0 | 391 | } |
// The label is complete: fold its errors into the overall set and start the next label.
michael@0 | 392 | info.errors|=info.labelErrors; |
michael@0 | 393 | info.labelErrors=0; |
michael@0 | 394 | labelStart=i+1; |
michael@0 | 395 | } |
michael@0 | 396 | } |
michael@0 | 397 | } |
// Hand-off: dest holds the first i code units; processUnicode() normalizes src
// from index i onward and processes labels starting at labelStart.
michael@0 | 398 | info.errors|=info.labelErrors; |
michael@0 | 399 | dest.releaseBuffer(i); |
michael@0 | 400 | processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode); |
// Final BiDi verdict, skipped when a severe error is already recorded.
// The labels before labelStart were only seen by the fastpath, so they are
// checked here with isASCIIOkBiDi().
michael@0 | 401 | if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 && |
michael@0 | 402 | (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart))) |
michael@0 | 403 | ) { |
michael@0 | 404 | info.errors|=UIDNA_ERROR_BIDI; |
michael@0 | 405 | } |
michael@0 | 406 | return dest; |
michael@0 | 407 | } |
michael@0 | 408 | |
// Shared UTF-8 implementation behind the four *_UTF8 / *UTF8 entry points.
//
// src      UTF-8 input; a NULL data pointer is accepted only together with length 0,
//          otherwise errorCode becomes U_ILLEGAL_ARGUMENT_ERROR.
// isLabel  TRUE if src is a single label.
// toASCII  TRUE enables the empty-label and length checks performed in this function.
// dest     sink that receives the UTF-8 result.
// info     reset here, then accumulates error bits.
//
// Inputs of at most 256 bytes go through an ASCII fastpath that mirrors process().
// Anything the fastpath cannot finish, and all longer inputs, are converted to
// UTF-16 and handled by processUnicode().
michael@0 | 409 | void |
michael@0 | 410 | UTS46::processUTF8(const StringPiece &src, |
michael@0 | 411 | UBool isLabel, UBool toASCII, |
michael@0 | 412 | ByteSink &dest, |
michael@0 | 413 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 414 | if(U_FAILURE(errorCode)) { |
michael@0 | 415 | return; |
michael@0 | 416 | } |
michael@0 | 417 | const char *srcArray=src.data(); |
michael@0 | 418 | int32_t srcLength=src.length(); |
michael@0 | 419 | if(srcArray==NULL && srcLength!=0) { |
michael@0 | 420 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 421 | return; |
michael@0 | 422 | } |
michael@0 | 423 | // Arguments are fine, reset output values. |
michael@0 | 424 | info.reset(); |
michael@0 | 425 | if(srcLength==0) { |
michael@0 | 426 | if(toASCII) { |
michael@0 | 427 | info.errors|=UIDNA_ERROR_EMPTY_LABEL; |
michael@0 | 428 | } |
michael@0 | 429 | dest.Flush(); |
michael@0 | 430 | return; |
michael@0 | 431 | } |
// destString: UTF-16 result of the part that is not emitted directly by the fastpath.
// labelStart: byte index in src of the label where the fastpath stopped
// (stays 0 when the fastpath is not used).
michael@0 | 432 | UnicodeString destString; |
michael@0 | 433 | int32_t labelStart=0; |
michael@0 | 434 | if(srcLength<=256) { // length of stackArray[] |
michael@0 | 435 | // ASCII fastpath |
michael@0 | 436 | char stackArray[256]; |
// destCapacity is only filled in by GetAppendBuffer(); it is not read afterwards
// in this function.
michael@0 | 437 | int32_t destCapacity; |
michael@0 | 438 | char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20, |
michael@0 | 439 | stackArray, LENGTHOF(stackArray), &destCapacity); |
michael@0 | 440 | UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
michael@0 | 441 | int32_t i; |
// As in process(): the loop ends with the return at i==srcLength or with a break.
michael@0 | 442 | for(i=0;; ++i) { |
michael@0 | 443 | if(i==srcLength) { |
michael@0 | 444 | if(toASCII) { |
michael@0 | 445 | if((i-labelStart)>63) { |
michael@0 | 446 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 447 | } |
michael@0 | 448 | // There is a trailing dot if labelStart==i. |
michael@0 | 449 | if(!isLabel && i>=254 && (i>254 || labelStart<i)) { |
michael@0 | 450 | info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
michael@0 | 451 | } |
michael@0 | 452 | } |
michael@0 | 453 | info.errors|=info.labelErrors; |
michael@0 | 454 | dest.Append(destArray, i); |
michael@0 | 455 | dest.Flush(); |
michael@0 | 456 | return; |
michael@0 | 457 | } |
michael@0 | 458 | char c=srcArray[i]; |
michael@0 | 459 | if((int8_t)c<0) { // (uint8_t)c>0x7f |
michael@0 | 460 | break; |
michael@0 | 461 | } |
michael@0 | 462 | int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with a char. |
michael@0 | 463 | if(cData>0) { |
michael@0 | 464 | destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter. |
michael@0 | 465 | } else if(cData<0 && disallowNonLDHDot) { |
michael@0 | 466 | break; // Replacing with U+FFFD can be complicated for toASCII. |
michael@0 | 467 | } else { |
michael@0 | 468 | destArray[i]=c; |
michael@0 | 469 | if(c==0x2d) { // hyphen |
michael@0 | 470 | if(i==(labelStart+3) && srcArray[i-1]==0x2d) { |
michael@0 | 471 | // "??--..." is Punycode or forbidden. |
// Unlike process(), i is not advanced here: the character at i is re-read
// from src by processUnicode() (mappingStart below is i-labelStart).
michael@0 | 472 | break; |
michael@0 | 473 | } |
michael@0 | 474 | if(i==labelStart) { |
michael@0 | 475 | // label starts with "-" |
michael@0 | 476 | info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; |
michael@0 | 477 | } |
michael@0 | 478 | if((i+1)==srcLength || srcArray[i+1]==0x2e) { |
michael@0 | 479 | // label ends with "-" |
michael@0 | 480 | info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; |
michael@0 | 481 | } |
michael@0 | 482 | } else if(c==0x2e) { // dot |
michael@0 | 483 | if(isLabel) { |
michael@0 | 484 | break; // Replacing with U+FFFD can be complicated for toASCII. |
michael@0 | 485 | } |
michael@0 | 486 | if(toASCII) { |
michael@0 | 487 | // Permit an empty label at the end but not elsewhere. |
michael@0 | 488 | if(i==labelStart && i<(srcLength-1)) { |
michael@0 | 489 | info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; |
michael@0 | 490 | } else if((i-labelStart)>63) { |
michael@0 | 491 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 492 | } |
michael@0 | 493 | } |
// The label is complete: fold its errors into the overall set and start the next label.
michael@0 | 494 | info.errors|=info.labelErrors; |
michael@0 | 495 | info.labelErrors=0; |
michael@0 | 496 | labelStart=i+1; |
michael@0 | 497 | } |
michael@0 | 498 | } |
michael@0 | 499 | } |
michael@0 | 500 | info.errors|=info.labelErrors; |
michael@0 | 501 | // Convert the processed ASCII prefix of the current label to UTF-16. |
// mappingStart is relative to the start of the current label, because both
// destString and the src substring passed below begin at labelStart.
michael@0 | 502 | int32_t mappingStart=i-labelStart; |
michael@0 | 503 | destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart)); |
michael@0 | 504 | // Output the previous ASCII labels and process the rest of src in UTF-16. |
michael@0 | 505 | dest.Append(destArray, labelStart); |
michael@0 | 506 | processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart, |
michael@0 | 507 | isLabel, toASCII, |
michael@0 | 508 | destString, info, errorCode); |
michael@0 | 509 | } else { |
michael@0 | 510 | // src is too long for the ASCII fastpath implementation. |
michael@0 | 511 | processUnicode(UnicodeString::fromUTF8(src), 0, 0, |
michael@0 | 512 | isLabel, toASCII, |
michael@0 | 513 | destString, info, errorCode); |
michael@0 | 514 | } |
michael@0 | 515 | destString.toUTF8(dest); // calls dest.Flush() |
michael@0 | 516 | if(toASCII && !isLabel) { |
michael@0 | 517 | // length==labelStart==254 means that there is a trailing dot (ok) and |
michael@0 | 518 | // destString is empty (do not index at 253-labelStart). |
// The total output length is the labelStart bytes already appended plus destString.
michael@0 | 519 | int32_t length=labelStart+destString.length(); |
michael@0 | 520 | if( length>=254 && isASCIIString(destString) && |
michael@0 | 521 | (length>254 || |
michael@0 | 522 | (labelStart<254 && destString[253-labelStart]!=0x2e)) |
michael@0 | 523 | ) { |
michael@0 | 524 | info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
michael@0 | 525 | } |
michael@0 | 526 | } |
// Final BiDi verdict, skipped when a severe error is already recorded.
// The first labelStart bytes of src were handled only by the fastpath,
// so they are checked here with isASCIIOkBiDi().
michael@0 | 527 | if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 && |
michael@0 | 528 | (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart))) |
michael@0 | 529 | ) { |
michael@0 | 530 | info.errors|=UIDNA_ERROR_BIDI; |
michael@0 | 531 | } |
michael@0 | 532 | } |
michael@0 | 533 | |
// General (non-fastpath) processing: normalize with uts46Norm2, then walk dest
// label by label.
//
// src           source string.
// labelStart    index in dest of the first label that still needs processLabel().
// mappingStart  index in src from which normalization starts. If 0, dest is
//               replaced by the normalized src; otherwise the normalized rest of
//               src is appended to what the caller already put into dest.
// isLabel       TRUE if the input is a single label; dots then do not split labels.
// toASCII       selects which NONTRANSITIONAL option bit controls deviation mapping
//               and is passed on to processLabel().
// Returns dest; returns early (with errorCode set) when a callee fails.
michael@0 | 534 | UnicodeString & |
michael@0 | 535 | UTS46::processUnicode(const UnicodeString &src, |
michael@0 | 536 | int32_t labelStart, int32_t mappingStart, |
michael@0 | 537 | UBool isLabel, UBool toASCII, |
michael@0 | 538 | UnicodeString &dest, |
michael@0 | 539 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 540 | if(mappingStart==0) { |
michael@0 | 541 | uts46Norm2.normalize(src, dest, errorCode); |
michael@0 | 542 | } else { |
michael@0 | 543 | uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode); |
michael@0 | 544 | } |
michael@0 | 545 | if(U_FAILURE(errorCode)) { |
michael@0 | 546 | return dest; |
michael@0 | 547 | } |
// Deviation characters are mapped unless the NONTRANSITIONAL option for the
// requested direction is set.
michael@0 | 548 | UBool doMapDevChars= |
michael@0 | 549 | toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 : |
michael@0 | 550 | (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0; |
michael@0 | 551 | const UChar *destArray=dest.getBuffer(); |
michael@0 | 552 | int32_t destLength=dest.length(); |
// labelLimit scans forward from labelStart to the end of the current label.
michael@0 | 553 | int32_t labelLimit=labelStart; |
michael@0 | 554 | while(labelLimit<destLength) { |
michael@0 | 555 | UChar c=destArray[labelLimit]; |
michael@0 | 556 | if(c==0x2e && !isLabel) { |
michael@0 | 557 | int32_t labelLength=labelLimit-labelStart; |
michael@0 | 558 | int32_t newLength=processLabel(dest, labelStart, labelLength, |
michael@0 | 559 | toASCII, info, errorCode); |
michael@0 | 560 | info.errors|=info.labelErrors; |
michael@0 | 561 | info.labelErrors=0; |
michael@0 | 562 | if(U_FAILURE(errorCode)) { |
michael@0 | 563 | return dest; |
michael@0 | 564 | } |
// processLabel() may have changed dest: re-fetch the buffer pointer, adjust the
// total length by the label's length change, and continue after the dot.
michael@0 | 565 | destArray=dest.getBuffer(); |
michael@0 | 566 | destLength+=newLength-labelLength; |
michael@0 | 567 | labelLimit=labelStart+=newLength+1; |
// Matches exactly U+00DF, U+03C2, U+200C and U+200D, the four characters that
// mapDevChars() handles.
michael@0 | 568 | } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) { |
michael@0 | 569 | info.isTransDiff=TRUE; |
michael@0 | 570 | if(doMapDevChars) { |
michael@0 | 571 | destLength=mapDevChars(dest, labelStart, labelLimit, errorCode); |
michael@0 | 572 | if(U_FAILURE(errorCode)) { |
michael@0 | 573 | return dest; |
michael@0 | 574 | } |
michael@0 | 575 | destArray=dest.getBuffer(); |
michael@0 | 576 | // Do not increment labelLimit in case c was removed. |
michael@0 | 577 | // All deviation characters have been mapped, no need to check for them again. |
michael@0 | 578 | doMapDevChars=FALSE; |
michael@0 | 579 | } else { |
michael@0 | 580 | ++labelLimit; |
michael@0 | 581 | } |
michael@0 | 582 | } else { |
michael@0 | 583 | ++labelLimit; |
michael@0 | 584 | } |
michael@0 | 585 | } |
michael@0 | 586 | // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok) |
michael@0 | 587 | // but not an empty label elsewhere nor a completely empty domain name. |
michael@0 | 588 | // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0. |
michael@0 | 589 | if(0==labelStart || labelStart<labelLimit) { |
michael@0 | 590 | processLabel(dest, labelStart, labelLimit-labelStart, |
michael@0 | 591 | toASCII, info, errorCode); |
michael@0 | 592 | info.errors|=info.labelErrors; |
michael@0 | 593 | } |
michael@0 | 594 | return dest; |
michael@0 | 595 | } |
michael@0 | 596 | |
// Maps the deviation characters in dest from index mappingStart to the end, in place:
//   U+00DF (sharp s)      -> "ss"
//   U+03C2 (final sigma)  -> U+03C3
//   U+200C / U+200D       -> removed
// If anything was mapped, the part of dest from labelStart on is normalized again
// with uts46Norm2 and replaced.
//
// dest          string to modify.
// labelStart    start of the region that is re-normalized afterwards.
// mappingStart  index of the first character to examine; the caller passes the index
//               of a deviation character.
// errorCode     set to U_MEMORY_ALLOCATION_ERROR if a writable buffer cannot be
//               obtained, or to the normalizer's failure code.
// Returns the new dest.length(); on failure returns the length computed so far.
michael@0 | 597 | int32_t |
michael@0 | 598 | UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, |
michael@0 | 599 | UErrorCode &errorCode) const { |
michael@0 | 600 | int32_t length=dest.length(); |
// Ask for one extra code unit up front when the first character is a sharp s,
// since it expands to two code units.
michael@0 | 601 | UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length); |
michael@0 | 602 | if(s==NULL) { |
michael@0 | 603 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 604 | return length; |
michael@0 | 605 | } |
michael@0 | 606 | int32_t capacity=dest.getCapacity(); |
michael@0 | 607 | UBool didMapDevChars=FALSE; |
// readIndex runs ahead of writeIndex once characters have been removed;
// length tracks the logical length of the result as it grows and shrinks.
michael@0 | 608 | int32_t readIndex=mappingStart, writeIndex=mappingStart; |
michael@0 | 609 | do { |
michael@0 | 610 | UChar c=s[readIndex++]; |
michael@0 | 611 | switch(c) { |
michael@0 | 612 | case 0xdf: |
michael@0 | 613 | // Map sharp s to ss. |
michael@0 | 614 | didMapDevChars=TRUE; |
michael@0 | 615 | s[writeIndex++]=0x73; // Replace sharp s with first s. |
michael@0 | 616 | // Insert second s and account for possible buffer reallocation. |
// writeIndex==readIndex means no earlier removal left a gap, so the unread tail
// must be shifted right by one to make room for the second s.
michael@0 | 617 | if(writeIndex==readIndex) { |
michael@0 | 618 | if(length==capacity) { |
michael@0 | 619 | dest.releaseBuffer(length); |
michael@0 | 620 | s=dest.getBuffer(length+1); |
michael@0 | 621 | if(s==NULL) { |
michael@0 | 622 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 623 | return length; |
michael@0 | 624 | } |
michael@0 | 625 | capacity=dest.getCapacity(); |
michael@0 | 626 | } |
michael@0 | 627 | u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex); |
michael@0 | 628 | ++readIndex; |
michael@0 | 629 | } |
michael@0 | 630 | s[writeIndex++]=0x73; |
michael@0 | 631 | ++length; |
michael@0 | 632 | break; |
michael@0 | 633 | case 0x3c2: // Map final sigma to nonfinal sigma. |
michael@0 | 634 | didMapDevChars=TRUE; |
michael@0 | 635 | s[writeIndex++]=0x3c3; |
michael@0 | 636 | break; |
michael@0 | 637 | case 0x200c: // Ignore/remove ZWNJ. |
michael@0 | 638 | case 0x200d: // Ignore/remove ZWJ. |
michael@0 | 639 | didMapDevChars=TRUE; |
michael@0 | 640 | --length; |
michael@0 | 641 | break; |
michael@0 | 642 | default: |
michael@0 | 643 | // Only really necessary if writeIndex was different from readIndex. |
michael@0 | 644 | s[writeIndex++]=c; |
michael@0 | 645 | break; |
michael@0 | 646 | } |
michael@0 | 647 | } while(writeIndex<length); |
michael@0 | 648 | dest.releaseBuffer(length); |
michael@0 | 649 | if(didMapDevChars) { |
michael@0 | 650 | // Mapping deviation characters might have resulted in an un-NFC string. |
michael@0 | 651 | // We could use either the NFC or the UTS #46 normalizer. |
michael@0 | 652 | // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file. |
michael@0 | 653 | UnicodeString normalized; |
michael@0 | 654 | uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode); |
michael@0 | 655 | if(U_SUCCESS(errorCode)) { |
// 0x7fffffff as the length means "everything from labelStart to the end of dest".
michael@0 | 656 | dest.replace(labelStart, 0x7fffffff, normalized); |
michael@0 | 657 | return dest.length(); |
michael@0 | 658 | } |
michael@0 | 659 | } |
michael@0 | 660 | return length; |
michael@0 | 661 | } |
michael@0 | 662 | |
michael@0 | 663 | // Some non-ASCII characters are equivalent to sequences with |
michael@0 | 664 | // non-LDH ASCII characters. To find them: |
michael@0 | 665 | // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt) |
michael@0 | 666 | static inline UBool |
michael@0 | 667 | isNonASCIIDisallowedSTD3Valid(UChar32 c) { |
michael@0 | 668 | return c==0x2260 || c==0x226E || c==0x226F; |
michael@0 | 669 | } |
michael@0 | 670 | |
michael@0 | 671 | // Replace the label in dest with the label string, if the label was modified. |
michael@0 | 672 | // If &label==&dest then the label was modified in-place and labelLength |
michael@0 | 673 | // is the new label length, different from label.length(). |
michael@0 | 674 | // If &label!=&dest then labelLength==label.length(). |
michael@0 | 675 | // Returns labelLength (= the new label length). |
michael@0 | 676 | static int32_t |
michael@0 | 677 | replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength, |
michael@0 | 678 | const UnicodeString &label, int32_t labelLength) { |
michael@0 | 679 | if(&label!=&dest) { |
michael@0 | 680 | dest.replace(destLabelStart, destLabelLength, label); |
michael@0 | 681 | } |
michael@0 | 682 | return labelLength; |
michael@0 | 683 | } |
michael@0 | 684 | |
michael@0 | 685 | int32_t |
michael@0 | 686 | UTS46::processLabel(UnicodeString &dest, |
michael@0 | 687 | int32_t labelStart, int32_t labelLength, |
michael@0 | 688 | UBool toASCII, |
michael@0 | 689 | IDNAInfo &info, UErrorCode &errorCode) const { |
michael@0 | 690 | UnicodeString fromPunycode; |
michael@0 | 691 | UnicodeString *labelString; |
michael@0 | 692 | const UChar *label=dest.getBuffer()+labelStart; |
michael@0 | 693 | int32_t destLabelStart=labelStart; |
michael@0 | 694 | int32_t destLabelLength=labelLength; |
michael@0 | 695 | UBool wasPunycode; |
michael@0 | 696 | if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) { |
michael@0 | 697 | // Label starts with "xn--", try to un-Punycode it. |
michael@0 | 698 | wasPunycode=TRUE; |
michael@0 | 699 | UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit |
michael@0 | 700 | if(unicodeBuffer==NULL) { |
michael@0 | 701 | // Should never occur if we used capacity==-1 which uses the internal buffer. |
michael@0 | 702 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 703 | return labelLength; |
michael@0 | 704 | } |
michael@0 | 705 | UErrorCode punycodeErrorCode=U_ZERO_ERROR; |
michael@0 | 706 | int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4, |
michael@0 | 707 | unicodeBuffer, fromPunycode.getCapacity(), |
michael@0 | 708 | NULL, &punycodeErrorCode); |
michael@0 | 709 | if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 710 | fromPunycode.releaseBuffer(0); |
michael@0 | 711 | unicodeBuffer=fromPunycode.getBuffer(unicodeLength); |
michael@0 | 712 | if(unicodeBuffer==NULL) { |
michael@0 | 713 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 714 | return labelLength; |
michael@0 | 715 | } |
michael@0 | 716 | punycodeErrorCode=U_ZERO_ERROR; |
michael@0 | 717 | unicodeLength=u_strFromPunycode(label+4, labelLength-4, |
michael@0 | 718 | unicodeBuffer, fromPunycode.getCapacity(), |
michael@0 | 719 | NULL, &punycodeErrorCode); |
michael@0 | 720 | } |
michael@0 | 721 | fromPunycode.releaseBuffer(unicodeLength); |
michael@0 | 722 | if(U_FAILURE(punycodeErrorCode)) { |
michael@0 | 723 | info.labelErrors|=UIDNA_ERROR_PUNYCODE; |
michael@0 | 724 | return markBadACELabel(dest, labelStart, labelLength, toASCII, info); |
michael@0 | 725 | } |
michael@0 | 726 | // Check for NFC, and for characters that are not |
michael@0 | 727 | // valid or deviation characters according to the normalizer. |
michael@0 | 728 | // If there is something wrong, then the string will change. |
michael@0 | 729 | // Note that the normalizer passes through non-LDH ASCII and deviation characters. |
michael@0 | 730 | // Deviation characters are ok in Punycode even in transitional processing. |
michael@0 | 731 | // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES |
michael@0 | 732 | // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. |
michael@0 | 733 | UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode); |
michael@0 | 734 | if(U_FAILURE(errorCode)) { |
michael@0 | 735 | return labelLength; |
michael@0 | 736 | } |
michael@0 | 737 | if(!isValid) { |
michael@0 | 738 | info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; |
michael@0 | 739 | return markBadACELabel(dest, labelStart, labelLength, toASCII, info); |
michael@0 | 740 | } |
michael@0 | 741 | labelString=&fromPunycode; |
michael@0 | 742 | label=fromPunycode.getBuffer(); |
michael@0 | 743 | labelStart=0; |
michael@0 | 744 | labelLength=fromPunycode.length(); |
michael@0 | 745 | } else { |
michael@0 | 746 | wasPunycode=FALSE; |
michael@0 | 747 | labelString=&dest; |
michael@0 | 748 | } |
michael@0 | 749 | // Validity check |
michael@0 | 750 | if(labelLength==0) { |
michael@0 | 751 | if(toASCII) { |
michael@0 | 752 | info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; |
michael@0 | 753 | } |
michael@0 | 754 | return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength); |
michael@0 | 755 | } |
michael@0 | 756 | // labelLength>0 |
michael@0 | 757 | if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) { |
michael@0 | 758 | // label starts with "??--" |
michael@0 | 759 | info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4; |
michael@0 | 760 | } |
michael@0 | 761 | if(label[0]==0x2d) { |
michael@0 | 762 | // label starts with "-" |
michael@0 | 763 | info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; |
michael@0 | 764 | } |
michael@0 | 765 | if(label[labelLength-1]==0x2d) { |
michael@0 | 766 | // label ends with "-" |
michael@0 | 767 | info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; |
michael@0 | 768 | } |
michael@0 | 769 | // If the label was not a Punycode label, then it was the result of |
michael@0 | 770 | // mapping, normalization and label segmentation. |
michael@0 | 771 | // If the label was in Punycode, then we mapped it again above |
michael@0 | 772 | // and checked its validity. |
michael@0 | 773 | // Now we handle the STD3 restriction to LDH characters (if set) |
michael@0 | 774 | // and we look for U+FFFD which indicates disallowed characters |
michael@0 | 775 | // in a non-Punycode label or U+FFFD itself in a Punycode label. |
michael@0 | 776 | // We also check for dots which can come from the input to a single-label function. |
michael@0 | 777 | // Ok to cast away const because we own the UnicodeString. |
michael@0 | 778 | UChar *s=(UChar *)label; |
michael@0 | 779 | const UChar *limit=label+labelLength; |
michael@0 | 780 | UChar oredChars=0; |
michael@0 | 781 | // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. |
michael@0 | 782 | UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
michael@0 | 783 | do { |
michael@0 | 784 | UChar c=*s; |
michael@0 | 785 | if(c<=0x7f) { |
michael@0 | 786 | if(c==0x2e) { |
michael@0 | 787 | info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; |
michael@0 | 788 | *s=0xfffd; |
michael@0 | 789 | } else if(disallowNonLDHDot && asciiData[c]<0) { |
michael@0 | 790 | info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
michael@0 | 791 | *s=0xfffd; |
michael@0 | 792 | } |
michael@0 | 793 | } else { |
michael@0 | 794 | oredChars|=c; |
michael@0 | 795 | if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { |
michael@0 | 796 | info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
michael@0 | 797 | *s=0xfffd; |
michael@0 | 798 | } else if(c==0xfffd) { |
michael@0 | 799 | info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
michael@0 | 800 | } |
michael@0 | 801 | } |
michael@0 | 802 | ++s; |
michael@0 | 803 | } while(s<limit); |
michael@0 | 804 | // Check for a leading combining mark after other validity checks |
michael@0 | 805 | // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here. |
michael@0 | 806 | UChar32 c; |
michael@0 | 807 | int32_t cpLength=0; |
michael@0 | 808 | // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD. |
michael@0 | 809 | U16_NEXT_UNSAFE(label, cpLength, c); |
michael@0 | 810 | if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) { |
michael@0 | 811 | info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK; |
michael@0 | 812 | labelString->replace(labelStart, cpLength, (UChar)0xfffd); |
michael@0 | 813 | label=labelString->getBuffer()+labelStart; |
michael@0 | 814 | labelLength+=1-cpLength; |
michael@0 | 815 | if(labelString==&dest) { |
michael@0 | 816 | destLabelLength=labelLength; |
michael@0 | 817 | } |
michael@0 | 818 | } |
michael@0 | 819 | if((info.labelErrors&severeErrors)==0) { |
michael@0 | 820 | // Do contextual checks only if we do not have U+FFFD from a severe error |
michael@0 | 821 | // because U+FFFD can make these checks fail. |
michael@0 | 822 | if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) { |
michael@0 | 823 | checkLabelBiDi(label, labelLength, info); |
michael@0 | 824 | } |
michael@0 | 825 | if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && |
michael@0 | 826 | !isLabelOkContextJ(label, labelLength) |
michael@0 | 827 | ) { |
michael@0 | 828 | info.labelErrors|=UIDNA_ERROR_CONTEXTJ; |
michael@0 | 829 | } |
michael@0 | 830 | if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { |
michael@0 | 831 | checkLabelContextO(label, labelLength, info); |
michael@0 | 832 | } |
michael@0 | 833 | if(toASCII) { |
michael@0 | 834 | if(wasPunycode) { |
michael@0 | 835 | // Leave a Punycode label unchanged if it has no severe errors. |
michael@0 | 836 | if(destLabelLength>63) { |
michael@0 | 837 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 838 | } |
michael@0 | 839 | return destLabelLength; |
michael@0 | 840 | } else if(oredChars>=0x80) { |
michael@0 | 841 | // Contains non-ASCII characters. |
michael@0 | 842 | UnicodeString punycode; |
michael@0 | 843 | UChar *buffer=punycode.getBuffer(63); // 63==maximum DNS label length |
michael@0 | 844 | if(buffer==NULL) { |
michael@0 | 845 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 846 | return destLabelLength; |
michael@0 | 847 | } |
michael@0 | 848 | buffer[0]=0x78; // Write "xn--". |
michael@0 | 849 | buffer[1]=0x6e; |
michael@0 | 850 | buffer[2]=0x2d; |
michael@0 | 851 | buffer[3]=0x2d; |
michael@0 | 852 | int32_t punycodeLength=u_strToPunycode(label, labelLength, |
michael@0 | 853 | buffer+4, punycode.getCapacity()-4, |
michael@0 | 854 | NULL, &errorCode); |
michael@0 | 855 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
michael@0 | 856 | errorCode=U_ZERO_ERROR; |
michael@0 | 857 | punycode.releaseBuffer(4); |
michael@0 | 858 | buffer=punycode.getBuffer(4+punycodeLength); |
michael@0 | 859 | if(buffer==NULL) { |
michael@0 | 860 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 861 | return destLabelLength; |
michael@0 | 862 | } |
michael@0 | 863 | punycodeLength=u_strToPunycode(label, labelLength, |
michael@0 | 864 | buffer+4, punycode.getCapacity()-4, |
michael@0 | 865 | NULL, &errorCode); |
michael@0 | 866 | } |
michael@0 | 867 | punycodeLength+=4; |
michael@0 | 868 | punycode.releaseBuffer(punycodeLength); |
michael@0 | 869 | if(U_FAILURE(errorCode)) { |
michael@0 | 870 | return destLabelLength; |
michael@0 | 871 | } |
michael@0 | 872 | if(punycodeLength>63) { |
michael@0 | 873 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 874 | } |
michael@0 | 875 | return replaceLabel(dest, destLabelStart, destLabelLength, |
michael@0 | 876 | punycode, punycodeLength); |
michael@0 | 877 | } else { |
michael@0 | 878 | // all-ASCII label |
michael@0 | 879 | if(labelLength>63) { |
michael@0 | 880 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 881 | } |
michael@0 | 882 | } |
michael@0 | 883 | } |
michael@0 | 884 | } else { |
michael@0 | 885 | // If a Punycode label has severe errors, |
michael@0 | 886 | // then leave it but make sure it does not look valid. |
michael@0 | 887 | if(wasPunycode) { |
michael@0 | 888 | info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; |
michael@0 | 889 | return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info); |
michael@0 | 890 | } |
michael@0 | 891 | } |
michael@0 | 892 | return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength); |
michael@0 | 893 | } |
michael@0 | 894 | |
michael@0 | 895 | // Make sure an ACE label does not look valid. |
michael@0 | 896 | // Append U+FFFD if the label has only LDH characters. |
michael@0 | 897 | // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD. |
michael@0 | 898 | int32_t |
michael@0 | 899 | UTS46::markBadACELabel(UnicodeString &dest, |
michael@0 | 900 | int32_t labelStart, int32_t labelLength, |
michael@0 | 901 | UBool toASCII, IDNAInfo &info) const { |
michael@0 | 902 | UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
michael@0 | 903 | UBool isASCII=TRUE; |
michael@0 | 904 | UBool onlyLDH=TRUE; |
michael@0 | 905 | const UChar *label=dest.getBuffer()+labelStart; |
michael@0 | 906 | // Ok to cast away const because we own the UnicodeString. |
michael@0 | 907 | UChar *s=(UChar *)label+4; // After the initial "xn--". |
michael@0 | 908 | const UChar *limit=label+labelLength; |
michael@0 | 909 | do { |
michael@0 | 910 | UChar c=*s; |
michael@0 | 911 | if(c<=0x7f) { |
michael@0 | 912 | if(c==0x2e) { |
michael@0 | 913 | info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; |
michael@0 | 914 | *s=0xfffd; |
michael@0 | 915 | isASCII=onlyLDH=FALSE; |
michael@0 | 916 | } else if(asciiData[c]<0) { |
michael@0 | 917 | onlyLDH=FALSE; |
michael@0 | 918 | if(disallowNonLDHDot) { |
michael@0 | 919 | *s=0xfffd; |
michael@0 | 920 | isASCII=FALSE; |
michael@0 | 921 | } |
michael@0 | 922 | } |
michael@0 | 923 | } else { |
michael@0 | 924 | isASCII=onlyLDH=FALSE; |
michael@0 | 925 | } |
michael@0 | 926 | } while(++s<limit); |
michael@0 | 927 | if(onlyLDH) { |
michael@0 | 928 | dest.insert(labelStart+labelLength, (UChar)0xfffd); |
michael@0 | 929 | ++labelLength; |
michael@0 | 930 | } else { |
michael@0 | 931 | if(toASCII && isASCII && labelLength>63) { |
michael@0 | 932 | info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
michael@0 | 933 | } |
michael@0 | 934 | } |
michael@0 | 935 | return labelLength; |
michael@0 | 936 | } |
michael@0 | 937 | |
michael@0 | 938 | const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT); |
michael@0 | 939 | const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC); |
michael@0 | 940 | const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK; |
michael@0 | 941 | |
michael@0 | 942 | const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER); |
michael@0 | 943 | |
michael@0 | 944 | const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER); |
michael@0 | 945 | const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; |
michael@0 | 946 | const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER); |
michael@0 | 947 | |
michael@0 | 948 | const uint32_t ES_CS_ET_ON_BN_NSM_MASK= |
michael@0 | 949 | U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)| |
michael@0 | 950 | U_MASK(U_COMMON_NUMBER_SEPARATOR)| |
michael@0 | 951 | U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)| |
michael@0 | 952 | U_MASK(U_OTHER_NEUTRAL)| |
michael@0 | 953 | U_MASK(U_BOUNDARY_NEUTRAL)| |
michael@0 | 954 | U_MASK(U_DIR_NON_SPACING_MARK); |
michael@0 | 955 | const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; |
michael@0 | 956 | const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; |
michael@0 | 957 | |
michael@0 | 958 | // We scan the whole label and check both for whether it contains RTL characters |
michael@0 | 959 | // and whether it passes the BiDi Rule. |
michael@0 | 960 | // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find |
michael@0 | 961 | // that a domain name is a BiDi domain name (has an RTL label) only after |
michael@0 | 962 | // processing several earlier labels. |
michael@0 | 963 | void |
michael@0 | 964 | UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const { |
michael@0 | 965 | // IDNA2008 BiDi rule |
michael@0 | 966 | // Get the directionality of the first character. |
michael@0 | 967 | UChar32 c; |
michael@0 | 968 | int32_t i=0; |
michael@0 | 969 | U16_NEXT_UNSAFE(label, i, c); |
michael@0 | 970 | uint32_t firstMask=U_MASK(u_charDirection(c)); |
michael@0 | 971 | // 1. The first character must be a character with BIDI property L, R |
michael@0 | 972 | // or AL. If it has the R or AL property, it is an RTL label; if it |
michael@0 | 973 | // has the L property, it is an LTR label. |
michael@0 | 974 | if((firstMask&~L_R_AL_MASK)!=0) { |
michael@0 | 975 | info.isOkBiDi=FALSE; |
michael@0 | 976 | } |
michael@0 | 977 | // Get the directionality of the last non-NSM character. |
michael@0 | 978 | uint32_t lastMask; |
michael@0 | 979 | for(;;) { |
michael@0 | 980 | if(i>=labelLength) { |
michael@0 | 981 | lastMask=firstMask; |
michael@0 | 982 | break; |
michael@0 | 983 | } |
michael@0 | 984 | U16_PREV_UNSAFE(label, labelLength, c); |
michael@0 | 985 | UCharDirection dir=u_charDirection(c); |
michael@0 | 986 | if(dir!=U_DIR_NON_SPACING_MARK) { |
michael@0 | 987 | lastMask=U_MASK(dir); |
michael@0 | 988 | break; |
michael@0 | 989 | } |
michael@0 | 990 | } |
michael@0 | 991 | // 3. In an RTL label, the end of the label must be a character with |
michael@0 | 992 | // BIDI property R, AL, EN or AN, followed by zero or more |
michael@0 | 993 | // characters with BIDI property NSM. |
michael@0 | 994 | // 6. In an LTR label, the end of the label must be a character with |
michael@0 | 995 | // BIDI property L or EN, followed by zero or more characters with |
michael@0 | 996 | // BIDI property NSM. |
michael@0 | 997 | if( (firstMask&L_MASK)!=0 ? |
michael@0 | 998 | (lastMask&~L_EN_MASK)!=0 : |
michael@0 | 999 | (lastMask&~R_AL_EN_AN_MASK)!=0 |
michael@0 | 1000 | ) { |
michael@0 | 1001 | info.isOkBiDi=FALSE; |
michael@0 | 1002 | } |
michael@0 | 1003 | // Get the directionalities of the intervening characters. |
michael@0 | 1004 | uint32_t mask=0; |
michael@0 | 1005 | while(i<labelLength) { |
michael@0 | 1006 | U16_NEXT_UNSAFE(label, i, c); |
michael@0 | 1007 | mask|=U_MASK(u_charDirection(c)); |
michael@0 | 1008 | } |
michael@0 | 1009 | if(firstMask&L_MASK) { |
michael@0 | 1010 | // 5. In an LTR label, only characters with the BIDI properties L, EN, |
michael@0 | 1011 | // ES, CS, ET, ON, BN and NSM are allowed. |
michael@0 | 1012 | if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { |
michael@0 | 1013 | info.isOkBiDi=FALSE; |
michael@0 | 1014 | } |
michael@0 | 1015 | } else { |
michael@0 | 1016 | // 2. In an RTL label, only characters with the BIDI properties R, AL, |
michael@0 | 1017 | // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. |
michael@0 | 1018 | if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { |
michael@0 | 1019 | info.isOkBiDi=FALSE; |
michael@0 | 1020 | } |
michael@0 | 1021 | // 4. In an RTL label, if an EN is present, no AN may be present, and |
michael@0 | 1022 | // vice versa. |
michael@0 | 1023 | if((mask&EN_AN_MASK)==EN_AN_MASK) { |
michael@0 | 1024 | info.isOkBiDi=FALSE; |
michael@0 | 1025 | } |
michael@0 | 1026 | } |
michael@0 | 1027 | // An RTL label is a label that contains at least one character of type |
michael@0 | 1028 | // R, AL or AN. [...] |
michael@0 | 1029 | // A "BIDI domain name" is a domain name that contains at least one RTL |
michael@0 | 1030 | // label. [...] |
michael@0 | 1031 | // The following rule, consisting of six conditions, applies to labels |
michael@0 | 1032 | // in BIDI domain names. |
michael@0 | 1033 | if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) { |
michael@0 | 1034 | info.isBiDi=TRUE; |
michael@0 | 1035 | } |
michael@0 | 1036 | } |
michael@0 | 1037 | |
michael@0 | 1038 | // Special code for the ASCII prefix of a BiDi domain name. |
michael@0 | 1039 | // The ASCII prefix is all-LTR. |
michael@0 | 1040 | |
michael@0 | 1041 | // IDNA2008 BiDi rule, parts relevant to ASCII labels: |
michael@0 | 1042 | // 1. The first character must be a character with BIDI property L [...] |
michael@0 | 1043 | // 5. In an LTR label, only characters with the BIDI properties L, EN, |
michael@0 | 1044 | // ES, CS, ET, ON, BN and NSM are allowed. |
michael@0 | 1045 | // 6. In an LTR label, the end of the label must be a character with |
michael@0 | 1046 | // BIDI property L or EN [...] |
michael@0 | 1047 | |
michael@0 | 1048 | // UTF-16 version, called for mapped ASCII prefix. |
michael@0 | 1049 | // Cannot contain uppercase A-Z. |
michael@0 | 1050 | // s[length-1] must be the trailing dot. |
michael@0 | 1051 | static UBool |
michael@0 | 1052 | isASCIIOkBiDi(const UChar *s, int32_t length) { |
michael@0 | 1053 | int32_t labelStart=0; |
michael@0 | 1054 | for(int32_t i=0; i<length; ++i) { |
michael@0 | 1055 | UChar c=s[i]; |
michael@0 | 1056 | if(c==0x2e) { // dot |
michael@0 | 1057 | if(i>labelStart) { |
michael@0 | 1058 | c=s[i-1]; |
michael@0 | 1059 | if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { |
michael@0 | 1060 | // Last character in the label is not an L or EN. |
michael@0 | 1061 | return FALSE; |
michael@0 | 1062 | } |
michael@0 | 1063 | } |
michael@0 | 1064 | labelStart=i+1; |
michael@0 | 1065 | } else if(i==labelStart) { |
michael@0 | 1066 | if(!(0x61<=c && c<=0x7a)) { |
michael@0 | 1067 | // First character in the label is not an L. |
michael@0 | 1068 | return FALSE; |
michael@0 | 1069 | } |
michael@0 | 1070 | } else { |
michael@0 | 1071 | if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { |
michael@0 | 1072 | // Intermediate character in the label is a B, S or WS. |
michael@0 | 1073 | return FALSE; |
michael@0 | 1074 | } |
michael@0 | 1075 | } |
michael@0 | 1076 | } |
michael@0 | 1077 | return TRUE; |
michael@0 | 1078 | } |
michael@0 | 1079 | |
michael@0 | 1080 | // UTF-8 version, called for source ASCII prefix. |
michael@0 | 1081 | // Can contain uppercase A-Z. |
michael@0 | 1082 | // s[length-1] must be the trailing dot. |
michael@0 | 1083 | static UBool |
michael@0 | 1084 | isASCIIOkBiDi(const char *s, int32_t length) { |
michael@0 | 1085 | int32_t labelStart=0; |
michael@0 | 1086 | for(int32_t i=0; i<length; ++i) { |
michael@0 | 1087 | char c=s[i]; |
michael@0 | 1088 | if(c==0x2e) { // dot |
michael@0 | 1089 | if(i>labelStart) { |
michael@0 | 1090 | c=s[i-1]; |
michael@0 | 1091 | if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) { |
michael@0 | 1092 | // Last character in the label is not an L or EN. |
michael@0 | 1093 | return FALSE; |
michael@0 | 1094 | } |
michael@0 | 1095 | } |
michael@0 | 1096 | labelStart=i+1; |
michael@0 | 1097 | } else if(i==labelStart) { |
michael@0 | 1098 | if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { |
michael@0 | 1099 | // First character in the label is not an L. |
michael@0 | 1100 | return FALSE; |
michael@0 | 1101 | } |
michael@0 | 1102 | } else { |
michael@0 | 1103 | if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { |
michael@0 | 1104 | // Intermediate character in the label is a B, S or WS. |
michael@0 | 1105 | return FALSE; |
michael@0 | 1106 | } |
michael@0 | 1107 | } |
michael@0 | 1108 | } |
michael@0 | 1109 | return TRUE; |
michael@0 | 1110 | } |
michael@0 | 1111 | |
michael@0 | 1112 | UBool |
michael@0 | 1113 | UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { |
michael@0 | 1114 | const UBiDiProps *bdp=ubidi_getSingleton(); |
michael@0 | 1115 | // [IDNA2008-Tables] |
michael@0 | 1116 | // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER |
michael@0 | 1117 | for(int32_t i=0; i<labelLength; ++i) { |
michael@0 | 1118 | if(label[i]==0x200c) { |
michael@0 | 1119 | // Appendix A.1. ZERO WIDTH NON-JOINER |
michael@0 | 1120 | // Rule Set: |
michael@0 | 1121 | // False; |
michael@0 | 1122 | // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
michael@0 | 1123 | // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C |
michael@0 | 1124 | // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; |
michael@0 | 1125 | if(i==0) { |
michael@0 | 1126 | return FALSE; |
michael@0 | 1127 | } |
michael@0 | 1128 | UChar32 c; |
michael@0 | 1129 | int32_t j=i; |
michael@0 | 1130 | U16_PREV_UNSAFE(label, j, c); |
michael@0 | 1131 | if(uts46Norm2.getCombiningClass(c)==9) { |
michael@0 | 1132 | continue; |
michael@0 | 1133 | } |
michael@0 | 1134 | // check precontext (Joining_Type:{L,D})(Joining_Type:T)* |
michael@0 | 1135 | for(;;) { |
michael@0 | 1136 | UJoiningType type=ubidi_getJoiningType(bdp, c); |
michael@0 | 1137 | if(type==U_JT_TRANSPARENT) { |
michael@0 | 1138 | if(j==0) { |
michael@0 | 1139 | return FALSE; |
michael@0 | 1140 | } |
michael@0 | 1141 | U16_PREV_UNSAFE(label, j, c); |
michael@0 | 1142 | } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) { |
michael@0 | 1143 | break; // precontext fulfilled |
michael@0 | 1144 | } else { |
michael@0 | 1145 | return FALSE; |
michael@0 | 1146 | } |
michael@0 | 1147 | } |
michael@0 | 1148 | // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) |
michael@0 | 1149 | for(j=i+1;;) { |
michael@0 | 1150 | if(j==labelLength) { |
michael@0 | 1151 | return FALSE; |
michael@0 | 1152 | } |
michael@0 | 1153 | U16_NEXT_UNSAFE(label, j, c); |
michael@0 | 1154 | UJoiningType type=ubidi_getJoiningType(bdp, c); |
michael@0 | 1155 | if(type==U_JT_TRANSPARENT) { |
michael@0 | 1156 | // just skip this character |
michael@0 | 1157 | } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) { |
michael@0 | 1158 | break; // postcontext fulfilled |
michael@0 | 1159 | } else { |
michael@0 | 1160 | return FALSE; |
michael@0 | 1161 | } |
michael@0 | 1162 | } |
michael@0 | 1163 | } else if(label[i]==0x200d) { |
michael@0 | 1164 | // Appendix A.2. ZERO WIDTH JOINER (U+200D) |
michael@0 | 1165 | // Rule Set: |
michael@0 | 1166 | // False; |
michael@0 | 1167 | // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
michael@0 | 1168 | if(i==0) { |
michael@0 | 1169 | return FALSE; |
michael@0 | 1170 | } |
michael@0 | 1171 | UChar32 c; |
michael@0 | 1172 | int32_t j=i; |
michael@0 | 1173 | U16_PREV_UNSAFE(label, j, c); |
michael@0 | 1174 | if(uts46Norm2.getCombiningClass(c)!=9) { |
michael@0 | 1175 | return FALSE; |
michael@0 | 1176 | } |
michael@0 | 1177 | } |
michael@0 | 1178 | } |
michael@0 | 1179 | return TRUE; |
michael@0 | 1180 | } |
michael@0 | 1181 | |
michael@0 | 1182 | void |
michael@0 | 1183 | UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const { |
michael@0 | 1184 | int32_t labelEnd=labelLength-1; // inclusive |
michael@0 | 1185 | int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx |
michael@0 | 1186 | for(int32_t i=0; i<=labelEnd; ++i) { |
michael@0 | 1187 | UChar32 c=label[i]; |
michael@0 | 1188 | if(c<0xb7) { |
michael@0 | 1189 | // ASCII fastpath |
michael@0 | 1190 | } else if(c<=0x6f9) { |
michael@0 | 1191 | if(c==0xb7) { |
michael@0 | 1192 | // Appendix A.3. MIDDLE DOT (U+00B7) |
michael@0 | 1193 | // Rule Set: |
michael@0 | 1194 | // False; |
michael@0 | 1195 | // If Before(cp) .eq. U+006C And |
michael@0 | 1196 | // After(cp) .eq. U+006C Then True; |
michael@0 | 1197 | if(!(0<i && label[i-1]==0x6c && |
michael@0 | 1198 | i<labelEnd && label[i+1]==0x6c)) { |
michael@0 | 1199 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
michael@0 | 1200 | } |
michael@0 | 1201 | } else if(c==0x375) { |
michael@0 | 1202 | // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) |
michael@0 | 1203 | // Rule Set: |
michael@0 | 1204 | // False; |
michael@0 | 1205 | // If Script(After(cp)) .eq. Greek Then True; |
michael@0 | 1206 | UScriptCode script=USCRIPT_INVALID_CODE; |
michael@0 | 1207 | if(i<labelEnd) { |
michael@0 | 1208 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 1209 | int32_t j=i+1; |
michael@0 | 1210 | U16_NEXT(label, j, labelLength, c); |
michael@0 | 1211 | script=uscript_getScript(c, &errorCode); |
michael@0 | 1212 | } |
michael@0 | 1213 | if(script!=USCRIPT_GREEK) { |
michael@0 | 1214 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
michael@0 | 1215 | } |
michael@0 | 1216 | } else if(c==0x5f3 || c==0x5f4) { |
michael@0 | 1217 | // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) |
michael@0 | 1218 | // Rule Set: |
michael@0 | 1219 | // False; |
michael@0 | 1220 | // If Script(Before(cp)) .eq. Hebrew Then True; |
michael@0 | 1221 | // |
michael@0 | 1222 | // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) |
michael@0 | 1223 | // Rule Set: |
michael@0 | 1224 | // False; |
michael@0 | 1225 | // If Script(Before(cp)) .eq. Hebrew Then True; |
michael@0 | 1226 | UScriptCode script=USCRIPT_INVALID_CODE; |
michael@0 | 1227 | if(0<i) { |
michael@0 | 1228 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 1229 | int32_t j=i; |
michael@0 | 1230 | U16_PREV(label, 0, j, c); |
michael@0 | 1231 | script=uscript_getScript(c, &errorCode); |
michael@0 | 1232 | } |
michael@0 | 1233 | if(script!=USCRIPT_HEBREW) { |
michael@0 | 1234 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
michael@0 | 1235 | } |
michael@0 | 1236 | } else if(0x660<=c /* && c<=0x6f9 */) { |
michael@0 | 1237 | // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) |
michael@0 | 1238 | // Rule Set: |
michael@0 | 1239 | // True; |
michael@0 | 1240 | // For All Characters: |
michael@0 | 1241 | // If cp .in. 06F0..06F9 Then False; |
michael@0 | 1242 | // End For; |
michael@0 | 1243 | // |
michael@0 | 1244 | // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) |
michael@0 | 1245 | // Rule Set: |
michael@0 | 1246 | // True; |
michael@0 | 1247 | // For All Characters: |
michael@0 | 1248 | // If cp .in. 0660..0669 Then False; |
michael@0 | 1249 | // End For; |
michael@0 | 1250 | if(c<=0x669) { |
michael@0 | 1251 | if(arabicDigits>0) { |
michael@0 | 1252 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; |
michael@0 | 1253 | } |
michael@0 | 1254 | arabicDigits=-1; |
michael@0 | 1255 | } else if(0x6f0<=c) { |
michael@0 | 1256 | if(arabicDigits<0) { |
michael@0 | 1257 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; |
michael@0 | 1258 | } |
michael@0 | 1259 | arabicDigits=1; |
michael@0 | 1260 | } |
michael@0 | 1261 | } |
michael@0 | 1262 | } else if(c==0x30fb) { |
michael@0 | 1263 | // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) |
michael@0 | 1264 | // Rule Set: |
michael@0 | 1265 | // False; |
michael@0 | 1266 | // For All Characters: |
michael@0 | 1267 | // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; |
michael@0 | 1268 | // End For; |
michael@0 | 1269 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 1270 | for(int j=0;;) { |
michael@0 | 1271 | if(j>labelEnd) { |
michael@0 | 1272 | info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; |
michael@0 | 1273 | break; |
michael@0 | 1274 | } |
michael@0 | 1275 | U16_NEXT(label, j, labelLength, c); |
michael@0 | 1276 | UScriptCode script=uscript_getScript(c, &errorCode); |
michael@0 | 1277 | if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) { |
michael@0 | 1278 | break; |
michael@0 | 1279 | } |
michael@0 | 1280 | } |
michael@0 | 1281 | } |
michael@0 | 1282 | } |
michael@0 | 1283 | } |
michael@0 | 1284 | |
michael@0 | 1285 | U_NAMESPACE_END |
michael@0 | 1286 | |
michael@0 | 1287 | // C API ------------------------------------------------------------------- *** |
michael@0 | 1288 | |
michael@0 | 1289 | U_NAMESPACE_USE |
michael@0 | 1290 | |
michael@0 | 1291 | U_CAPI UIDNA * U_EXPORT2 |
michael@0 | 1292 | uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) { |
michael@0 | 1293 | return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode)); |
michael@0 | 1294 | } |
michael@0 | 1295 | |
michael@0 | 1296 | U_CAPI void U_EXPORT2 |
michael@0 | 1297 | uidna_close(UIDNA *idna) { |
michael@0 | 1298 | delete reinterpret_cast<IDNA *>(idna); |
michael@0 | 1299 | } |
michael@0 | 1300 | |
michael@0 | 1301 | static UBool |
michael@0 | 1302 | checkArgs(const void *label, int32_t length, |
michael@0 | 1303 | void *dest, int32_t capacity, |
michael@0 | 1304 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1305 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 1306 | return FALSE; |
michael@0 | 1307 | } |
michael@0 | 1308 | // sizeof(UIDNAInfo)=16 in the first API version. |
michael@0 | 1309 | if(pInfo==NULL || pInfo->size<16) { |
michael@0 | 1310 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1311 | return FALSE; |
michael@0 | 1312 | } |
michael@0 | 1313 | if( (label==NULL ? length!=0 : length<-1) || |
michael@0 | 1314 | (dest==NULL ? capacity!=0 : capacity<0) || |
michael@0 | 1315 | (dest==label && label!=NULL) |
michael@0 | 1316 | ) { |
michael@0 | 1317 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1318 | return FALSE; |
michael@0 | 1319 | } |
michael@0 | 1320 | // Set all *pInfo bytes to 0 except for the size field itself. |
michael@0 | 1321 | uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size)); |
michael@0 | 1322 | return TRUE; |
michael@0 | 1323 | } |
michael@0 | 1324 | |
michael@0 | 1325 | static void |
michael@0 | 1326 | idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) { |
michael@0 | 1327 | pInfo->isTransitionalDifferent=info.isTransitionalDifferent(); |
michael@0 | 1328 | pInfo->errors=info.getErrors(); |
michael@0 | 1329 | } |
michael@0 | 1330 | |
michael@0 | 1331 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1332 | uidna_labelToASCII(const UIDNA *idna, |
michael@0 | 1333 | const UChar *label, int32_t length, |
michael@0 | 1334 | UChar *dest, int32_t capacity, |
michael@0 | 1335 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1336 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1337 | return 0; |
michael@0 | 1338 | } |
michael@0 | 1339 | UnicodeString src((UBool)(length<0), label, length); |
michael@0 | 1340 | UnicodeString destString(dest, 0, capacity); |
michael@0 | 1341 | IDNAInfo info; |
michael@0 | 1342 | reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode); |
michael@0 | 1343 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1344 | return destString.extract(dest, capacity, *pErrorCode); |
michael@0 | 1345 | } |
michael@0 | 1346 | |
michael@0 | 1347 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1348 | uidna_labelToUnicode(const UIDNA *idna, |
michael@0 | 1349 | const UChar *label, int32_t length, |
michael@0 | 1350 | UChar *dest, int32_t capacity, |
michael@0 | 1351 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1352 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1353 | return 0; |
michael@0 | 1354 | } |
michael@0 | 1355 | UnicodeString src((UBool)(length<0), label, length); |
michael@0 | 1356 | UnicodeString destString(dest, 0, capacity); |
michael@0 | 1357 | IDNAInfo info; |
michael@0 | 1358 | reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode); |
michael@0 | 1359 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1360 | return destString.extract(dest, capacity, *pErrorCode); |
michael@0 | 1361 | } |
michael@0 | 1362 | |
michael@0 | 1363 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1364 | uidna_nameToASCII(const UIDNA *idna, |
michael@0 | 1365 | const UChar *name, int32_t length, |
michael@0 | 1366 | UChar *dest, int32_t capacity, |
michael@0 | 1367 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1368 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1369 | return 0; |
michael@0 | 1370 | } |
michael@0 | 1371 | UnicodeString src((UBool)(length<0), name, length); |
michael@0 | 1372 | UnicodeString destString(dest, 0, capacity); |
michael@0 | 1373 | IDNAInfo info; |
michael@0 | 1374 | reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode); |
michael@0 | 1375 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1376 | return destString.extract(dest, capacity, *pErrorCode); |
michael@0 | 1377 | } |
michael@0 | 1378 | |
michael@0 | 1379 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1380 | uidna_nameToUnicode(const UIDNA *idna, |
michael@0 | 1381 | const UChar *name, int32_t length, |
michael@0 | 1382 | UChar *dest, int32_t capacity, |
michael@0 | 1383 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1384 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1385 | return 0; |
michael@0 | 1386 | } |
michael@0 | 1387 | UnicodeString src((UBool)(length<0), name, length); |
michael@0 | 1388 | UnicodeString destString(dest, 0, capacity); |
michael@0 | 1389 | IDNAInfo info; |
michael@0 | 1390 | reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode); |
michael@0 | 1391 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1392 | return destString.extract(dest, capacity, *pErrorCode); |
michael@0 | 1393 | } |
michael@0 | 1394 | |
michael@0 | 1395 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1396 | uidna_labelToASCII_UTF8(const UIDNA *idna, |
michael@0 | 1397 | const char *label, int32_t length, |
michael@0 | 1398 | char *dest, int32_t capacity, |
michael@0 | 1399 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1400 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1401 | return 0; |
michael@0 | 1402 | } |
michael@0 | 1403 | StringPiece src(label, length<0 ? uprv_strlen(label) : length); |
michael@0 | 1404 | CheckedArrayByteSink sink(dest, capacity); |
michael@0 | 1405 | IDNAInfo info; |
michael@0 | 1406 | reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode); |
michael@0 | 1407 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1408 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
michael@0 | 1409 | } |
michael@0 | 1410 | |
michael@0 | 1411 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1412 | uidna_labelToUnicodeUTF8(const UIDNA *idna, |
michael@0 | 1413 | const char *label, int32_t length, |
michael@0 | 1414 | char *dest, int32_t capacity, |
michael@0 | 1415 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1416 | if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1417 | return 0; |
michael@0 | 1418 | } |
michael@0 | 1419 | StringPiece src(label, length<0 ? uprv_strlen(label) : length); |
michael@0 | 1420 | CheckedArrayByteSink sink(dest, capacity); |
michael@0 | 1421 | IDNAInfo info; |
michael@0 | 1422 | reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode); |
michael@0 | 1423 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1424 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
michael@0 | 1425 | } |
michael@0 | 1426 | |
michael@0 | 1427 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1428 | uidna_nameToASCII_UTF8(const UIDNA *idna, |
michael@0 | 1429 | const char *name, int32_t length, |
michael@0 | 1430 | char *dest, int32_t capacity, |
michael@0 | 1431 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1432 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1433 | return 0; |
michael@0 | 1434 | } |
michael@0 | 1435 | StringPiece src(name, length<0 ? uprv_strlen(name) : length); |
michael@0 | 1436 | CheckedArrayByteSink sink(dest, capacity); |
michael@0 | 1437 | IDNAInfo info; |
michael@0 | 1438 | reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode); |
michael@0 | 1439 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1440 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
michael@0 | 1441 | } |
michael@0 | 1442 | |
michael@0 | 1443 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1444 | uidna_nameToUnicodeUTF8(const UIDNA *idna, |
michael@0 | 1445 | const char *name, int32_t length, |
michael@0 | 1446 | char *dest, int32_t capacity, |
michael@0 | 1447 | UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
michael@0 | 1448 | if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
michael@0 | 1449 | return 0; |
michael@0 | 1450 | } |
michael@0 | 1451 | StringPiece src(name, length<0 ? uprv_strlen(name) : length); |
michael@0 | 1452 | CheckedArrayByteSink sink(dest, capacity); |
michael@0 | 1453 | IDNAInfo info; |
michael@0 | 1454 | reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode); |
michael@0 | 1455 | idnaInfoToStruct(info, pInfo); |
michael@0 | 1456 | return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); |
michael@0 | 1457 | } |
michael@0 | 1458 | |
michael@0 | 1459 | #endif // UCONFIG_NO_IDNA |