1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/unicharutil/util/nsUnicharUtils.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,428 @@ 1.4 +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "nsUnicharUtils.h" 1.10 +#include "nsXPCOMStrings.h" 1.11 +#include "nsUTF8Utils.h" 1.12 +#include "nsUnicodeProperties.h" 1.13 +#include "mozilla/Likely.h" 1.14 +#include "mozilla/HashFunctions.h" 1.15 + 1.16 +// We map x -> x, except for upper-case letters, 1.17 +// which we map to their lower-case equivalents. 1.18 +static const uint8_t gASCIIToLower [128] = { 1.19 + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 1.20 + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 1.21 + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 1.22 + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 1.23 + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 1.24 + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 1.25 + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 1.26 + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 1.27 +}; 1.28 + 1.29 +#define IS_ASCII(u) ((u) < 0x80) 1.30 +#define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z')) 1.31 +#define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z')) 1.32 +#define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u)) 1.33 +#define IS_ASCII_SPACE(u) (' ' == (u)) 1.34 + 1.35 +// We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast 1.36 +// when they're called from within the case-insensitive comparators, so we 1.37 +// define inlined versions. 1.38 +static MOZ_ALWAYS_INLINE uint32_t 1.39 +ToLowerCase_inline(uint32_t aChar) 1.40 +{ 1.41 + if (IS_ASCII(aChar)) { 1.42 + return gASCIIToLower[aChar]; 1.43 + } 1.44 + 1.45 + return mozilla::unicode::GetLowercase(aChar); 1.46 +} 1.47 + 1.48 +static MOZ_ALWAYS_INLINE uint32_t 1.49 +ToLowerCaseASCII_inline(const uint32_t aChar) 1.50 +{ 1.51 + if (IS_ASCII(aChar)) { 1.52 + return gASCIIToLower[aChar]; 1.53 + } 1.54 + 1.55 + return aChar; 1.56 +} 1.57 + 1.58 +void 1.59 +ToLowerCase(nsAString& aString) 1.60 +{ 1.61 + char16_t *buf = aString.BeginWriting(); 1.62 + ToLowerCase(buf, buf, aString.Length()); 1.63 +} 1.64 + 1.65 +void 1.66 +ToLowerCase(const nsAString& aSource, 1.67 + nsAString& aDest) 1.68 +{ 1.69 + const char16_t *in; 1.70 + char16_t *out; 1.71 + uint32_t len = NS_StringGetData(aSource, &in); 1.72 + NS_StringGetMutableData(aDest, len, &out); 1.73 + NS_ASSERTION(out, "Uh..."); 1.74 + ToLowerCase(in, out, len); 1.75 +} 1.76 + 1.77 +uint32_t 1.78 +ToLowerCaseASCII(const uint32_t aChar) 1.79 +{ 1.80 + return ToLowerCaseASCII_inline(aChar); 1.81 +} 1.82 + 1.83 +void 1.84 +ToUpperCase(nsAString& aString) 1.85 +{ 1.86 + char16_t *buf = aString.BeginWriting(); 1.87 + ToUpperCase(buf, buf, aString.Length()); 1.88 +} 1.89 + 1.90 +void 1.91 +ToUpperCase(const nsAString& aSource, 1.92 + nsAString& aDest) 1.93 +{ 1.94 + const char16_t *in; 1.95 + char16_t *out; 1.96 + uint32_t len = NS_StringGetData(aSource, &in); 1.97 + NS_StringGetMutableData(aDest, len, &out); 1.98 + NS_ASSERTION(out, "Uh..."); 1.99 + ToUpperCase(in, out, len); 1.100 +} 1.101 + 1.102 +#ifdef MOZILLA_INTERNAL_API 1.103 + 1.104 +int32_t 1.105 +nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs, 1.106 + const char16_t* rhs, 1.107 + uint32_t lLength, 1.108 + uint32_t rLength) const 1.109 +{ 1.110 + return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) : 1.111 + (lLength > rLength) ? 1 : -1; 1.112 +} 1.113 + 1.114 +int32_t 1.115 +nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs, 1.116 + const char* rhs, 1.117 + uint32_t lLength, 1.118 + uint32_t rLength) const 1.119 +{ 1.120 + return CaseInsensitiveCompare(lhs, rhs, lLength, rLength); 1.121 +} 1.122 + 1.123 +int32_t 1.124 +nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs, 1.125 + const char16_t* rhs, 1.126 + uint32_t lLength, 1.127 + uint32_t rLength) const 1.128 +{ 1.129 + if (lLength != rLength) { 1.130 + if (lLength > rLength) 1.131 + return 1; 1.132 + return -1; 1.133 + } 1.134 + 1.135 + while (rLength) { 1.136 + // we don't care about surrogates here, because we're only 1.137 + // lowercasing the ASCII range 1.138 + char16_t l = *lhs++; 1.139 + char16_t r = *rhs++; 1.140 + if (l != r) { 1.141 + l = ToLowerCaseASCII_inline(l); 1.142 + r = ToLowerCaseASCII_inline(r); 1.143 + 1.144 + if (l > r) 1.145 + return 1; 1.146 + else if (r > l) 1.147 + return -1; 1.148 + } 1.149 + rLength--; 1.150 + } 1.151 + 1.152 + return 0; 1.153 +} 1.154 + 1.155 +#endif // MOZILLA_INTERNAL_API 1.156 + 1.157 +uint32_t 1.158 +ToLowerCase(uint32_t aChar) 1.159 +{ 1.160 + return ToLowerCase_inline(aChar); 1.161 +} 1.162 + 1.163 +void 1.164 +ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen) 1.165 +{ 1.166 + for (uint32_t i = 0; i < aLen; i++) { 1.167 + uint32_t ch = aIn[i]; 1.168 + if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 && 1.169 + NS_IS_LOW_SURROGATE(aIn[i + 1])) { 1.170 + ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); 1.171 + NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); 1.172 + aOut[i++] = H_SURROGATE(ch); 1.173 + aOut[i] = L_SURROGATE(ch); 1.174 + continue; 1.175 + } 1.176 + aOut[i] = ToLowerCase(ch); 1.177 + } 1.178 +} 1.179 + 1.180 +uint32_t 1.181 +ToUpperCase(uint32_t aChar) 1.182 +{ 1.183 + if (IS_ASCII(aChar)) { 1.184 + if (IS_ASCII_LOWER(aChar)) { 1.185 + return aChar - 0x20; 1.186 + } 1.187 + return aChar; 1.188 + } 1.189 + 1.190 + return mozilla::unicode::GetUppercase(aChar); 1.191 +} 1.192 + 1.193 +void 1.194 +ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen) 1.195 +{ 1.196 + for (uint32_t i = 0; i < aLen; i++) { 1.197 + uint32_t ch = aIn[i]; 1.198 + if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 && 1.199 + NS_IS_LOW_SURROGATE(aIn[i + 1])) { 1.200 + ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); 1.201 + NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); 1.202 + aOut[i++] = H_SURROGATE(ch); 1.203 + aOut[i] = L_SURROGATE(ch); 1.204 + continue; 1.205 + } 1.206 + aOut[i] = ToUpperCase(ch); 1.207 + } 1.208 +} 1.209 + 1.210 +uint32_t 1.211 +ToTitleCase(uint32_t aChar) 1.212 +{ 1.213 + if (IS_ASCII(aChar)) { 1.214 + return ToUpperCase(aChar); 1.215 + } 1.216 + 1.217 + return mozilla::unicode::GetTitlecaseForLower(aChar); 1.218 +} 1.219 + 1.220 +int32_t 1.221 +CaseInsensitiveCompare(const char16_t *a, 1.222 + const char16_t *b, 1.223 + uint32_t len) 1.224 +{ 1.225 + NS_ASSERTION(a && b, "Do not pass in invalid pointers!"); 1.226 + 1.227 + if (len) { 1.228 + do { 1.229 + uint32_t c1 = *a++; 1.230 + uint32_t c2 = *b++; 1.231 + 1.232 + // Unfortunately, we need to check for surrogates BEFORE we check 1.233 + // for equality, because we could have identical high surrogates 1.234 + // but non-identical characters, so we can't just skip them 1.235 + 1.236 + // If c1 isn't a surrogate, we don't bother to check c2; 1.237 + // in the case where it _is_ a surrogate, we're definitely going to get 1.238 + // a mismatch, and don't need to interpret and lowercase it 1.239 + 1.240 + if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) { 1.241 + c1 = SURROGATE_TO_UCS4(c1, *a++); 1.242 + if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) { 1.243 + c2 = SURROGATE_TO_UCS4(c2, *b++); 1.244 + } 1.245 + // If c2 wasn't a surrogate, decrementing len means we'd stop 1.246 + // short of the end of string b, but that doesn't actually matter 1.247 + // because we're going to find a mismatch and return early 1.248 + --len; 1.249 + } 1.250 + 1.251 + if (c1 != c2) { 1.252 + c1 = ToLowerCase_inline(c1); 1.253 + c2 = ToLowerCase_inline(c2); 1.254 + if (c1 != c2) { 1.255 + if (c1 < c2) { 1.256 + return -1; 1.257 + } 1.258 + return 1; 1.259 + } 1.260 + } 1.261 + } while (--len != 0); 1.262 + } 1.263 + return 0; 1.264 +} 1.265 + 1.266 +// Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext 1.267 +// to the byte following the end of the sequence. 1.268 +// 1.269 +// If the sequence is invalid, or if computing the codepoint would take us off 1.270 +// the end of the string (as marked by aEnd), returns -1 and does not set 1.271 +// aNext. Note that this function doesn't check that aStr < aEnd -- it assumes 1.272 +// you've done that already. 1.273 +static MOZ_ALWAYS_INLINE uint32_t 1.274 +GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext) 1.275 +{ 1.276 + // Convert to unsigned char so that stuffing chars into PRUint32s doesn't 1.277 + // sign extend. 1.278 + const unsigned char *str = (unsigned char*)aStr; 1.279 + 1.280 + if (UTF8traits::isASCII(str[0])) { 1.281 + // It's ASCII; just convert to lower-case and return it. 1.282 + *aNext = aStr + 1; 1.283 + return gASCIIToLower[*str]; 1.284 + } 1.285 + if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) { 1.286 + // It's a two-byte sequence, so it looks like 1.287 + // 110XXXXX 10XXXXXX. 1.288 + // This is definitely in the BMP, so we can store straightaway into a 1.289 + // uint16_t. 1.290 + 1.291 + uint16_t c; 1.292 + c = (str[0] & 0x1F) << 6; 1.293 + c += (str[1] & 0x3F); 1.294 + 1.295 + // we don't go through ToLowerCase here, because we know this isn't 1.296 + // an ASCII character so the ASCII fast-path there is useless 1.297 + c = mozilla::unicode::GetLowercase(c); 1.298 + 1.299 + *aNext = aStr + 2; 1.300 + return c; 1.301 + } 1.302 + if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) { 1.303 + // It's a three-byte sequence, so it looks like 1.304 + // 1110XXXX 10XXXXXX 10XXXXXX. 1.305 + // This will just barely fit into 16-bits, so store into a uint16_t. 1.306 + 1.307 + uint16_t c; 1.308 + c = (str[0] & 0x0F) << 12; 1.309 + c += (str[1] & 0x3F) << 6; 1.310 + c += (str[2] & 0x3F); 1.311 + 1.312 + c = mozilla::unicode::GetLowercase(c); 1.313 + 1.314 + *aNext = aStr + 3; 1.315 + return c; 1.316 + } 1.317 + if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) { 1.318 + // It's a four-byte sequence, so it looks like 1.319 + // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX. 1.320 + 1.321 + uint32_t c; 1.322 + c = (str[0] & 0x07) << 18; 1.323 + c += (str[1] & 0x3F) << 12; 1.324 + c += (str[2] & 0x3F) << 6; 1.325 + c += (str[3] & 0x3F); 1.326 + 1.327 + c = mozilla::unicode::GetLowercase(c); 1.328 + 1.329 + *aNext = aStr + 4; 1.330 + return c; 1.331 + } 1.332 + 1.333 + // Hm, we don't understand this sequence. 1.334 + return -1; 1.335 +} 1.336 + 1.337 +int32_t CaseInsensitiveCompare(const char *aLeft, 1.338 + const char *aRight, 1.339 + uint32_t aLeftBytes, 1.340 + uint32_t aRightBytes) 1.341 +{ 1.342 + const char *leftEnd = aLeft + aLeftBytes; 1.343 + const char *rightEnd = aRight + aRightBytes; 1.344 + 1.345 + while (aLeft < leftEnd && aRight < rightEnd) { 1.346 + uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft); 1.347 + if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) 1.348 + return -1; 1.349 + 1.350 + uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight); 1.351 + if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) 1.352 + return -1; 1.353 + 1.354 + // Now leftChar and rightChar are lower-case, so we can compare them. 1.355 + if (leftChar != rightChar) { 1.356 + if (leftChar > rightChar) 1.357 + return 1; 1.358 + return -1; 1.359 + } 1.360 + } 1.361 + 1.362 + // Make sure that if one string is longer than the other we return the 1.363 + // correct result. 1.364 + if (aLeft < leftEnd) 1.365 + return 1; 1.366 + if (aRight < rightEnd) 1.367 + return -1; 1.368 + 1.369 + return 0; 1.370 +} 1.371 + 1.372 +bool 1.373 +CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight, 1.374 + const char* aLeftEnd, const char* aRightEnd, 1.375 + const char** aLeftNext, const char** aRightNext, 1.376 + bool* aErr) 1.377 +{ 1.378 + NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null."); 1.379 + NS_ASSERTION(aRightNext, "Out pointer shouldn't be null."); 1.380 + NS_ASSERTION(aErr, "Out pointer shouldn't be null."); 1.381 + NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd."); 1.382 + NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd."); 1.383 + 1.384 + uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext); 1.385 + if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) { 1.386 + *aErr = true; 1.387 + return false; 1.388 + } 1.389 + 1.390 + uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext); 1.391 + if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) { 1.392 + *aErr = true; 1.393 + return false; 1.394 + } 1.395 + 1.396 + // Can't have an error past this point. 1.397 + *aErr = false; 1.398 + 1.399 + return leftChar == rightChar; 1.400 +} 1.401 + 1.402 +namespace mozilla { 1.403 + 1.404 +uint32_t 1.405 +HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr) 1.406 +{ 1.407 + uint32_t hash = 0; 1.408 + const char* s = aUTF8; 1.409 + const char* end = aUTF8 + aLength; 1.410 + 1.411 + *aErr = false; 1.412 + 1.413 + while (s < end) 1.414 + { 1.415 + uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr); 1.416 + if (*aErr) { 1.417 + return 0; 1.418 + } 1.419 + 1.420 + if (ucs4 < PLANE1_BASE) { 1.421 + hash = AddToHash(hash, ucs4); 1.422 + } 1.423 + else { 1.424 + hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4)); 1.425 + } 1.426 + } 1.427 + 1.428 + return hash; 1.429 +} 1.430 + 1.431 +} // namespace mozilla