michael@0: /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nsUnicharUtils.h" michael@0: #include "nsXPCOMStrings.h" michael@0: #include "nsUTF8Utils.h" michael@0: #include "nsUnicodeProperties.h" michael@0: #include "mozilla/Likely.h" michael@0: #include "mozilla/HashFunctions.h" michael@0: michael@0: // We map x -> x, except for upper-case letters, michael@0: // which we map to their lower-case equivalents. michael@0: static const uint8_t gASCIIToLower [128] = { michael@0: 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, michael@0: 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, michael@0: 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, michael@0: 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, michael@0: 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, michael@0: 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, michael@0: 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, michael@0: 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, michael@0: }; michael@0: michael@0: #define IS_ASCII(u) ((u) < 0x80) michael@0: #define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z')) michael@0: #define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z')) michael@0: #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u)) michael@0: #define IS_ASCII_SPACE(u) (' ' == (u)) michael@0: michael@0: // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast michael@0: // when they're called from within the case-insensitive comparators, so we michael@0: // define inlined versions. michael@0: static MOZ_ALWAYS_INLINE uint32_t michael@0: ToLowerCase_inline(uint32_t aChar) michael@0: { michael@0: if (IS_ASCII(aChar)) { michael@0: return gASCIIToLower[aChar]; michael@0: } michael@0: michael@0: return mozilla::unicode::GetLowercase(aChar); michael@0: } michael@0: michael@0: static MOZ_ALWAYS_INLINE uint32_t michael@0: ToLowerCaseASCII_inline(const uint32_t aChar) michael@0: { michael@0: if (IS_ASCII(aChar)) { michael@0: return gASCIIToLower[aChar]; michael@0: } michael@0: michael@0: return aChar; michael@0: } michael@0: michael@0: void michael@0: ToLowerCase(nsAString& aString) michael@0: { michael@0: char16_t *buf = aString.BeginWriting(); michael@0: ToLowerCase(buf, buf, aString.Length()); michael@0: } michael@0: michael@0: void michael@0: ToLowerCase(const nsAString& aSource, michael@0: nsAString& aDest) michael@0: { michael@0: const char16_t *in; michael@0: char16_t *out; michael@0: uint32_t len = NS_StringGetData(aSource, &in); michael@0: NS_StringGetMutableData(aDest, len, &out); michael@0: NS_ASSERTION(out, "Uh..."); michael@0: ToLowerCase(in, out, len); michael@0: } michael@0: michael@0: uint32_t michael@0: ToLowerCaseASCII(const uint32_t aChar) michael@0: { michael@0: return ToLowerCaseASCII_inline(aChar); michael@0: } michael@0: michael@0: void michael@0: ToUpperCase(nsAString& aString) michael@0: { michael@0: char16_t *buf = aString.BeginWriting(); michael@0: ToUpperCase(buf, buf, aString.Length()); michael@0: } michael@0: michael@0: void michael@0: ToUpperCase(const nsAString& aSource, michael@0: nsAString& aDest) michael@0: { michael@0: const char16_t *in; michael@0: char16_t *out; michael@0: uint32_t len = NS_StringGetData(aSource, &in); michael@0: NS_StringGetMutableData(aDest, len, &out); michael@0: NS_ASSERTION(out, "Uh..."); michael@0: ToUpperCase(in, out, len); michael@0: } michael@0: michael@0: #ifdef MOZILLA_INTERNAL_API michael@0: michael@0: int32_t michael@0: nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs, michael@0: const char16_t* rhs, michael@0: uint32_t lLength, michael@0: uint32_t rLength) const michael@0: { michael@0: return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) : michael@0: (lLength > rLength) ? 1 : -1; michael@0: } michael@0: michael@0: int32_t michael@0: nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs, michael@0: const char* rhs, michael@0: uint32_t lLength, michael@0: uint32_t rLength) const michael@0: { michael@0: return CaseInsensitiveCompare(lhs, rhs, lLength, rLength); michael@0: } michael@0: michael@0: int32_t michael@0: nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs, michael@0: const char16_t* rhs, michael@0: uint32_t lLength, michael@0: uint32_t rLength) const michael@0: { michael@0: if (lLength != rLength) { michael@0: if (lLength > rLength) michael@0: return 1; michael@0: return -1; michael@0: } michael@0: michael@0: while (rLength) { michael@0: // we don't care about surrogates here, because we're only michael@0: // lowercasing the ASCII range michael@0: char16_t l = *lhs++; michael@0: char16_t r = *rhs++; michael@0: if (l != r) { michael@0: l = ToLowerCaseASCII_inline(l); michael@0: r = ToLowerCaseASCII_inline(r); michael@0: michael@0: if (l > r) michael@0: return 1; michael@0: else if (r > l) michael@0: return -1; michael@0: } michael@0: rLength--; michael@0: } michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: #endif // MOZILLA_INTERNAL_API michael@0: michael@0: uint32_t michael@0: ToLowerCase(uint32_t aChar) michael@0: { michael@0: return ToLowerCase_inline(aChar); michael@0: } michael@0: michael@0: void michael@0: ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen) michael@0: { michael@0: for (uint32_t i = 0; i < aLen; i++) { michael@0: uint32_t ch = aIn[i]; michael@0: if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 && michael@0: NS_IS_LOW_SURROGATE(aIn[i + 1])) { michael@0: ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); michael@0: NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); michael@0: aOut[i++] = H_SURROGATE(ch); michael@0: aOut[i] = L_SURROGATE(ch); michael@0: continue; michael@0: } michael@0: aOut[i] = ToLowerCase(ch); michael@0: } michael@0: } michael@0: michael@0: uint32_t michael@0: ToUpperCase(uint32_t aChar) michael@0: { michael@0: if (IS_ASCII(aChar)) { michael@0: if (IS_ASCII_LOWER(aChar)) { michael@0: return aChar - 0x20; michael@0: } michael@0: return aChar; michael@0: } michael@0: michael@0: return mozilla::unicode::GetUppercase(aChar); michael@0: } michael@0: michael@0: void michael@0: ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen) michael@0: { michael@0: for (uint32_t i = 0; i < aLen; i++) { michael@0: uint32_t ch = aIn[i]; michael@0: if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 && michael@0: NS_IS_LOW_SURROGATE(aIn[i + 1])) { michael@0: ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); michael@0: NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); michael@0: aOut[i++] = H_SURROGATE(ch); michael@0: aOut[i] = L_SURROGATE(ch); michael@0: continue; michael@0: } michael@0: aOut[i] = ToUpperCase(ch); michael@0: } michael@0: } michael@0: michael@0: uint32_t michael@0: ToTitleCase(uint32_t aChar) michael@0: { michael@0: if (IS_ASCII(aChar)) { michael@0: return ToUpperCase(aChar); michael@0: } michael@0: michael@0: return mozilla::unicode::GetTitlecaseForLower(aChar); michael@0: } michael@0: michael@0: int32_t michael@0: CaseInsensitiveCompare(const char16_t *a, michael@0: const char16_t *b, michael@0: uint32_t len) michael@0: { michael@0: NS_ASSERTION(a && b, "Do not pass in invalid pointers!"); michael@0: michael@0: if (len) { michael@0: do { michael@0: uint32_t c1 = *a++; michael@0: uint32_t c2 = *b++; michael@0: michael@0: // Unfortunately, we need to check for surrogates BEFORE we check michael@0: // for equality, because we could have identical high surrogates michael@0: // but non-identical characters, so we can't just skip them michael@0: michael@0: // If c1 isn't a surrogate, we don't bother to check c2; michael@0: // in the case where it _is_ a surrogate, we're definitely going to get michael@0: // a mismatch, and don't need to interpret and lowercase it michael@0: michael@0: if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) { michael@0: c1 = SURROGATE_TO_UCS4(c1, *a++); michael@0: if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) { michael@0: c2 = SURROGATE_TO_UCS4(c2, *b++); michael@0: } michael@0: // If c2 wasn't a surrogate, decrementing len means we'd stop michael@0: // short of the end of string b, but that doesn't actually matter michael@0: // because we're going to find a mismatch and return early michael@0: --len; michael@0: } michael@0: michael@0: if (c1 != c2) { michael@0: c1 = ToLowerCase_inline(c1); michael@0: c2 = ToLowerCase_inline(c2); michael@0: if (c1 != c2) { michael@0: if (c1 < c2) { michael@0: return -1; michael@0: } michael@0: return 1; michael@0: } michael@0: } michael@0: } while (--len != 0); michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: // Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext michael@0: // to the byte following the end of the sequence. michael@0: // michael@0: // If the sequence is invalid, or if computing the codepoint would take us off michael@0: // the end of the string (as marked by aEnd), returns -1 and does not set michael@0: // aNext. Note that this function doesn't check that aStr < aEnd -- it assumes michael@0: // you've done that already. michael@0: static MOZ_ALWAYS_INLINE uint32_t michael@0: GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext) michael@0: { michael@0: // Convert to unsigned char so that stuffing chars into PRUint32s doesn't michael@0: // sign extend. michael@0: const unsigned char *str = (unsigned char*)aStr; michael@0: michael@0: if (UTF8traits::isASCII(str[0])) { michael@0: // It's ASCII; just convert to lower-case and return it. michael@0: *aNext = aStr + 1; michael@0: return gASCIIToLower[*str]; michael@0: } michael@0: if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) { michael@0: // It's a two-byte sequence, so it looks like michael@0: // 110XXXXX 10XXXXXX. michael@0: // This is definitely in the BMP, so we can store straightaway into a michael@0: // uint16_t. michael@0: michael@0: uint16_t c; michael@0: c = (str[0] & 0x1F) << 6; michael@0: c += (str[1] & 0x3F); michael@0: michael@0: // we don't go through ToLowerCase here, because we know this isn't michael@0: // an ASCII character so the ASCII fast-path there is useless michael@0: c = mozilla::unicode::GetLowercase(c); michael@0: michael@0: *aNext = aStr + 2; michael@0: return c; michael@0: } michael@0: if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) { michael@0: // It's a three-byte sequence, so it looks like michael@0: // 1110XXXX 10XXXXXX 10XXXXXX. michael@0: // This will just barely fit into 16-bits, so store into a uint16_t. michael@0: michael@0: uint16_t c; michael@0: c = (str[0] & 0x0F) << 12; michael@0: c += (str[1] & 0x3F) << 6; michael@0: c += (str[2] & 0x3F); michael@0: michael@0: c = mozilla::unicode::GetLowercase(c); michael@0: michael@0: *aNext = aStr + 3; michael@0: return c; michael@0: } michael@0: if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) { michael@0: // It's a four-byte sequence, so it looks like michael@0: // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX. michael@0: michael@0: uint32_t c; michael@0: c = (str[0] & 0x07) << 18; michael@0: c += (str[1] & 0x3F) << 12; michael@0: c += (str[2] & 0x3F) << 6; michael@0: c += (str[3] & 0x3F); michael@0: michael@0: c = mozilla::unicode::GetLowercase(c); michael@0: michael@0: *aNext = aStr + 4; michael@0: return c; michael@0: } michael@0: michael@0: // Hm, we don't understand this sequence. michael@0: return -1; michael@0: } michael@0: michael@0: int32_t CaseInsensitiveCompare(const char *aLeft, michael@0: const char *aRight, michael@0: uint32_t aLeftBytes, michael@0: uint32_t aRightBytes) michael@0: { michael@0: const char *leftEnd = aLeft + aLeftBytes; michael@0: const char *rightEnd = aRight + aRightBytes; michael@0: michael@0: while (aLeft < leftEnd && aRight < rightEnd) { michael@0: uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft); michael@0: if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) michael@0: return -1; michael@0: michael@0: uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight); michael@0: if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) michael@0: return -1; michael@0: michael@0: // Now leftChar and rightChar are lower-case, so we can compare them. michael@0: if (leftChar != rightChar) { michael@0: if (leftChar > rightChar) michael@0: return 1; michael@0: return -1; michael@0: } michael@0: } michael@0: michael@0: // Make sure that if one string is longer than the other we return the michael@0: // correct result. michael@0: if (aLeft < leftEnd) michael@0: return 1; michael@0: if (aRight < rightEnd) michael@0: return -1; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: bool michael@0: CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight, michael@0: const char* aLeftEnd, const char* aRightEnd, michael@0: const char** aLeftNext, const char** aRightNext, michael@0: bool* aErr) michael@0: { michael@0: NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null."); michael@0: NS_ASSERTION(aRightNext, "Out pointer shouldn't be null."); michael@0: NS_ASSERTION(aErr, "Out pointer shouldn't be null."); michael@0: NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd."); michael@0: NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd."); michael@0: michael@0: uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext); michael@0: if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) { michael@0: *aErr = true; michael@0: return false; michael@0: } michael@0: michael@0: uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext); michael@0: if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) { michael@0: *aErr = true; michael@0: return false; michael@0: } michael@0: michael@0: // Can't have an error past this point. michael@0: *aErr = false; michael@0: michael@0: return leftChar == rightChar; michael@0: } michael@0: michael@0: namespace mozilla { michael@0: michael@0: uint32_t michael@0: HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr) michael@0: { michael@0: uint32_t hash = 0; michael@0: const char* s = aUTF8; michael@0: const char* end = aUTF8 + aLength; michael@0: michael@0: *aErr = false; michael@0: michael@0: while (s < end) michael@0: { michael@0: uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr); michael@0: if (*aErr) { michael@0: return 0; michael@0: } michael@0: michael@0: if (ucs4 < PLANE1_BASE) { michael@0: hash = AddToHash(hash, ucs4); michael@0: } michael@0: else { michael@0: hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4)); michael@0: } michael@0: } michael@0: michael@0: return hash; michael@0: } michael@0: michael@0: } // namespace mozilla