intl/unicharutil/util/nsUnicharUtils.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "nsUnicharUtils.h"
michael@0 7 #include "nsXPCOMStrings.h"
michael@0 8 #include "nsUTF8Utils.h"
michael@0 9 #include "nsUnicodeProperties.h"
michael@0 10 #include "mozilla/Likely.h"
michael@0 11 #include "mozilla/HashFunctions.h"
michael@0 12
// Lookup table for the 7-bit ASCII range: entry i holds i itself, except that
// the entries for 'A'..'Z' (0x41..0x5a) hold 'a'..'z' (0x61..0x7a).
static const uint8_t gASCIIToLower [128] = {
    // 0x00 - 0x0f
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    // 0x10 - 0x1f
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 0x20 - 0x2f
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    // 0x30 - 0x3f
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    // 0x40 - 0x4f: '@' stays, 'A'..'O' become 'a'..'o'
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    // 0x50 - 0x5f: 'P'..'Z' become 'p'..'z', the punctuation after 'Z' stays
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    // 0x60 - 0x6f
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    // 0x70 - 0x7f
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
};
michael@0 25
// Character classification helpers. Each macro may evaluate its argument more
// than once, so do not pass expressions with side effects.

// True when u lies below 0x80, i.e. inside the 7-bit ASCII range.
#define IS_ASCII(u)       (0x80 > (u))
// True for 'A'..'Z'.
#define IS_ASCII_UPPER(u) (((u) >= 'A') && ('Z' >= (u)))
// True for 'a'..'z'.
#define IS_ASCII_LOWER(u) (((u) >= 'a') && ('z' >= (u)))
// True for any ASCII letter of either case.
#define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
// True only for the space character itself (not tabs or newlines).
#define IS_ASCII_SPACE(u) ((u) == ' ')
michael@0 31
michael@0 32 // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
michael@0 33 // when they're called from within the case-insensitive comparators, so we
michael@0 34 // define inlined versions.
michael@0 35 static MOZ_ALWAYS_INLINE uint32_t
michael@0 36 ToLowerCase_inline(uint32_t aChar)
michael@0 37 {
michael@0 38 if (IS_ASCII(aChar)) {
michael@0 39 return gASCIIToLower[aChar];
michael@0 40 }
michael@0 41
michael@0 42 return mozilla::unicode::GetLowercase(aChar);
michael@0 43 }
michael@0 44
michael@0 45 static MOZ_ALWAYS_INLINE uint32_t
michael@0 46 ToLowerCaseASCII_inline(const uint32_t aChar)
michael@0 47 {
michael@0 48 if (IS_ASCII(aChar)) {
michael@0 49 return gASCIIToLower[aChar];
michael@0 50 }
michael@0 51
michael@0 52 return aChar;
michael@0 53 }
michael@0 54
michael@0 55 void
michael@0 56 ToLowerCase(nsAString& aString)
michael@0 57 {
michael@0 58 char16_t *buf = aString.BeginWriting();
michael@0 59 ToLowerCase(buf, buf, aString.Length());
michael@0 60 }
michael@0 61
michael@0 62 void
michael@0 63 ToLowerCase(const nsAString& aSource,
michael@0 64 nsAString& aDest)
michael@0 65 {
michael@0 66 const char16_t *in;
michael@0 67 char16_t *out;
michael@0 68 uint32_t len = NS_StringGetData(aSource, &in);
michael@0 69 NS_StringGetMutableData(aDest, len, &out);
michael@0 70 NS_ASSERTION(out, "Uh...");
michael@0 71 ToLowerCase(in, out, len);
michael@0 72 }
michael@0 73
michael@0 74 uint32_t
michael@0 75 ToLowerCaseASCII(const uint32_t aChar)
michael@0 76 {
michael@0 77 return ToLowerCaseASCII_inline(aChar);
michael@0 78 }
michael@0 79
michael@0 80 void
michael@0 81 ToUpperCase(nsAString& aString)
michael@0 82 {
michael@0 83 char16_t *buf = aString.BeginWriting();
michael@0 84 ToUpperCase(buf, buf, aString.Length());
michael@0 85 }
michael@0 86
michael@0 87 void
michael@0 88 ToUpperCase(const nsAString& aSource,
michael@0 89 nsAString& aDest)
michael@0 90 {
michael@0 91 const char16_t *in;
michael@0 92 char16_t *out;
michael@0 93 uint32_t len = NS_StringGetData(aSource, &in);
michael@0 94 NS_StringGetMutableData(aDest, len, &out);
michael@0 95 NS_ASSERTION(out, "Uh...");
michael@0 96 ToUpperCase(in, out, len);
michael@0 97 }
michael@0 98
michael@0 99 #ifdef MOZILLA_INTERNAL_API
michael@0 100
michael@0 101 int32_t
michael@0 102 nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
michael@0 103 const char16_t* rhs,
michael@0 104 uint32_t lLength,
michael@0 105 uint32_t rLength) const
michael@0 106 {
michael@0 107 return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
michael@0 108 (lLength > rLength) ? 1 : -1;
michael@0 109 }
michael@0 110
michael@0 111 int32_t
michael@0 112 nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
michael@0 113 const char* rhs,
michael@0 114 uint32_t lLength,
michael@0 115 uint32_t rLength) const
michael@0 116 {
michael@0 117 return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
michael@0 118 }
michael@0 119
michael@0 120 int32_t
michael@0 121 nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
michael@0 122 const char16_t* rhs,
michael@0 123 uint32_t lLength,
michael@0 124 uint32_t rLength) const
michael@0 125 {
michael@0 126 if (lLength != rLength) {
michael@0 127 if (lLength > rLength)
michael@0 128 return 1;
michael@0 129 return -1;
michael@0 130 }
michael@0 131
michael@0 132 while (rLength) {
michael@0 133 // we don't care about surrogates here, because we're only
michael@0 134 // lowercasing the ASCII range
michael@0 135 char16_t l = *lhs++;
michael@0 136 char16_t r = *rhs++;
michael@0 137 if (l != r) {
michael@0 138 l = ToLowerCaseASCII_inline(l);
michael@0 139 r = ToLowerCaseASCII_inline(r);
michael@0 140
michael@0 141 if (l > r)
michael@0 142 return 1;
michael@0 143 else if (r > l)
michael@0 144 return -1;
michael@0 145 }
michael@0 146 rLength--;
michael@0 147 }
michael@0 148
michael@0 149 return 0;
michael@0 150 }
michael@0 151
michael@0 152 #endif // MOZILLA_INTERNAL_API
michael@0 153
michael@0 154 uint32_t
michael@0 155 ToLowerCase(uint32_t aChar)
michael@0 156 {
michael@0 157 return ToLowerCase_inline(aChar);
michael@0 158 }
michael@0 159
michael@0 160 void
michael@0 161 ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
michael@0 162 {
michael@0 163 for (uint32_t i = 0; i < aLen; i++) {
michael@0 164 uint32_t ch = aIn[i];
michael@0 165 if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
michael@0 166 NS_IS_LOW_SURROGATE(aIn[i + 1])) {
michael@0 167 ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
michael@0 168 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
michael@0 169 aOut[i++] = H_SURROGATE(ch);
michael@0 170 aOut[i] = L_SURROGATE(ch);
michael@0 171 continue;
michael@0 172 }
michael@0 173 aOut[i] = ToLowerCase(ch);
michael@0 174 }
michael@0 175 }
michael@0 176
michael@0 177 uint32_t
michael@0 178 ToUpperCase(uint32_t aChar)
michael@0 179 {
michael@0 180 if (IS_ASCII(aChar)) {
michael@0 181 if (IS_ASCII_LOWER(aChar)) {
michael@0 182 return aChar - 0x20;
michael@0 183 }
michael@0 184 return aChar;
michael@0 185 }
michael@0 186
michael@0 187 return mozilla::unicode::GetUppercase(aChar);
michael@0 188 }
michael@0 189
michael@0 190 void
michael@0 191 ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
michael@0 192 {
michael@0 193 for (uint32_t i = 0; i < aLen; i++) {
michael@0 194 uint32_t ch = aIn[i];
michael@0 195 if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
michael@0 196 NS_IS_LOW_SURROGATE(aIn[i + 1])) {
michael@0 197 ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
michael@0 198 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
michael@0 199 aOut[i++] = H_SURROGATE(ch);
michael@0 200 aOut[i] = L_SURROGATE(ch);
michael@0 201 continue;
michael@0 202 }
michael@0 203 aOut[i] = ToUpperCase(ch);
michael@0 204 }
michael@0 205 }
michael@0 206
michael@0 207 uint32_t
michael@0 208 ToTitleCase(uint32_t aChar)
michael@0 209 {
michael@0 210 if (IS_ASCII(aChar)) {
michael@0 211 return ToUpperCase(aChar);
michael@0 212 }
michael@0 213
michael@0 214 return mozilla::unicode::GetTitlecaseForLower(aChar);
michael@0 215 }
michael@0 216
michael@0 217 int32_t
michael@0 218 CaseInsensitiveCompare(const char16_t *a,
michael@0 219 const char16_t *b,
michael@0 220 uint32_t len)
michael@0 221 {
michael@0 222 NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
michael@0 223
michael@0 224 if (len) {
michael@0 225 do {
michael@0 226 uint32_t c1 = *a++;
michael@0 227 uint32_t c2 = *b++;
michael@0 228
michael@0 229 // Unfortunately, we need to check for surrogates BEFORE we check
michael@0 230 // for equality, because we could have identical high surrogates
michael@0 231 // but non-identical characters, so we can't just skip them
michael@0 232
michael@0 233 // If c1 isn't a surrogate, we don't bother to check c2;
michael@0 234 // in the case where it _is_ a surrogate, we're definitely going to get
michael@0 235 // a mismatch, and don't need to interpret and lowercase it
michael@0 236
michael@0 237 if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
michael@0 238 c1 = SURROGATE_TO_UCS4(c1, *a++);
michael@0 239 if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
michael@0 240 c2 = SURROGATE_TO_UCS4(c2, *b++);
michael@0 241 }
michael@0 242 // If c2 wasn't a surrogate, decrementing len means we'd stop
michael@0 243 // short of the end of string b, but that doesn't actually matter
michael@0 244 // because we're going to find a mismatch and return early
michael@0 245 --len;
michael@0 246 }
michael@0 247
michael@0 248 if (c1 != c2) {
michael@0 249 c1 = ToLowerCase_inline(c1);
michael@0 250 c2 = ToLowerCase_inline(c2);
michael@0 251 if (c1 != c2) {
michael@0 252 if (c1 < c2) {
michael@0 253 return -1;
michael@0 254 }
michael@0 255 return 1;
michael@0 256 }
michael@0 257 }
michael@0 258 } while (--len != 0);
michael@0 259 }
michael@0 260 return 0;
michael@0 261 }
michael@0 262
michael@0 263 // Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext
michael@0 264 // to the byte following the end of the sequence.
michael@0 265 //
michael@0 266 // If the sequence is invalid, or if computing the codepoint would take us off
michael@0 267 // the end of the string (as marked by aEnd), returns -1 and does not set
michael@0 268 // aNext. Note that this function doesn't check that aStr < aEnd -- it assumes
michael@0 269 // you've done that already.
michael@0 270 static MOZ_ALWAYS_INLINE uint32_t
michael@0 271 GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
michael@0 272 {
michael@0 273 // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
michael@0 274 // sign extend.
michael@0 275 const unsigned char *str = (unsigned char*)aStr;
michael@0 276
michael@0 277 if (UTF8traits::isASCII(str[0])) {
michael@0 278 // It's ASCII; just convert to lower-case and return it.
michael@0 279 *aNext = aStr + 1;
michael@0 280 return gASCIIToLower[*str];
michael@0 281 }
michael@0 282 if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
michael@0 283 // It's a two-byte sequence, so it looks like
michael@0 284 // 110XXXXX 10XXXXXX.
michael@0 285 // This is definitely in the BMP, so we can store straightaway into a
michael@0 286 // uint16_t.
michael@0 287
michael@0 288 uint16_t c;
michael@0 289 c = (str[0] & 0x1F) << 6;
michael@0 290 c += (str[1] & 0x3F);
michael@0 291
michael@0 292 // we don't go through ToLowerCase here, because we know this isn't
michael@0 293 // an ASCII character so the ASCII fast-path there is useless
michael@0 294 c = mozilla::unicode::GetLowercase(c);
michael@0 295
michael@0 296 *aNext = aStr + 2;
michael@0 297 return c;
michael@0 298 }
michael@0 299 if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
michael@0 300 // It's a three-byte sequence, so it looks like
michael@0 301 // 1110XXXX 10XXXXXX 10XXXXXX.
michael@0 302 // This will just barely fit into 16-bits, so store into a uint16_t.
michael@0 303
michael@0 304 uint16_t c;
michael@0 305 c = (str[0] & 0x0F) << 12;
michael@0 306 c += (str[1] & 0x3F) << 6;
michael@0 307 c += (str[2] & 0x3F);
michael@0 308
michael@0 309 c = mozilla::unicode::GetLowercase(c);
michael@0 310
michael@0 311 *aNext = aStr + 3;
michael@0 312 return c;
michael@0 313 }
michael@0 314 if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
michael@0 315 // It's a four-byte sequence, so it looks like
michael@0 316 // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
michael@0 317
michael@0 318 uint32_t c;
michael@0 319 c = (str[0] & 0x07) << 18;
michael@0 320 c += (str[1] & 0x3F) << 12;
michael@0 321 c += (str[2] & 0x3F) << 6;
michael@0 322 c += (str[3] & 0x3F);
michael@0 323
michael@0 324 c = mozilla::unicode::GetLowercase(c);
michael@0 325
michael@0 326 *aNext = aStr + 4;
michael@0 327 return c;
michael@0 328 }
michael@0 329
michael@0 330 // Hm, we don't understand this sequence.
michael@0 331 return -1;
michael@0 332 }
michael@0 333
michael@0 334 int32_t CaseInsensitiveCompare(const char *aLeft,
michael@0 335 const char *aRight,
michael@0 336 uint32_t aLeftBytes,
michael@0 337 uint32_t aRightBytes)
michael@0 338 {
michael@0 339 const char *leftEnd = aLeft + aLeftBytes;
michael@0 340 const char *rightEnd = aRight + aRightBytes;
michael@0 341
michael@0 342 while (aLeft < leftEnd && aRight < rightEnd) {
michael@0 343 uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
michael@0 344 if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
michael@0 345 return -1;
michael@0 346
michael@0 347 uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
michael@0 348 if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
michael@0 349 return -1;
michael@0 350
michael@0 351 // Now leftChar and rightChar are lower-case, so we can compare them.
michael@0 352 if (leftChar != rightChar) {
michael@0 353 if (leftChar > rightChar)
michael@0 354 return 1;
michael@0 355 return -1;
michael@0 356 }
michael@0 357 }
michael@0 358
michael@0 359 // Make sure that if one string is longer than the other we return the
michael@0 360 // correct result.
michael@0 361 if (aLeft < leftEnd)
michael@0 362 return 1;
michael@0 363 if (aRight < rightEnd)
michael@0 364 return -1;
michael@0 365
michael@0 366 return 0;
michael@0 367 }
michael@0 368
michael@0 369 bool
michael@0 370 CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
michael@0 371 const char* aLeftEnd, const char* aRightEnd,
michael@0 372 const char** aLeftNext, const char** aRightNext,
michael@0 373 bool* aErr)
michael@0 374 {
michael@0 375 NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
michael@0 376 NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
michael@0 377 NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
michael@0 378 NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
michael@0 379 NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
michael@0 380
michael@0 381 uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
michael@0 382 if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
michael@0 383 *aErr = true;
michael@0 384 return false;
michael@0 385 }
michael@0 386
michael@0 387 uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
michael@0 388 if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
michael@0 389 *aErr = true;
michael@0 390 return false;
michael@0 391 }
michael@0 392
michael@0 393 // Can't have an error past this point.
michael@0 394 *aErr = false;
michael@0 395
michael@0 396 return leftChar == rightChar;
michael@0 397 }
michael@0 398
michael@0 399 namespace mozilla {
michael@0 400
michael@0 401 uint32_t
michael@0 402 HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
michael@0 403 {
michael@0 404 uint32_t hash = 0;
michael@0 405 const char* s = aUTF8;
michael@0 406 const char* end = aUTF8 + aLength;
michael@0 407
michael@0 408 *aErr = false;
michael@0 409
michael@0 410 while (s < end)
michael@0 411 {
michael@0 412 uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
michael@0 413 if (*aErr) {
michael@0 414 return 0;
michael@0 415 }
michael@0 416
michael@0 417 if (ucs4 < PLANE1_BASE) {
michael@0 418 hash = AddToHash(hash, ucs4);
michael@0 419 }
michael@0 420 else {
michael@0 421 hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
michael@0 422 }
michael@0 423 }
michael@0 424
michael@0 425 return hash;
michael@0 426 }
michael@0 427
michael@0 428 } // namespace mozilla

mercurial