intl/unicharutil/util/nsUnicharUtils.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/unicharutil/util/nsUnicharUtils.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,428 @@
     1.4 +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "nsUnicharUtils.h"
    1.10 +#include "nsXPCOMStrings.h"
    1.11 +#include "nsUTF8Utils.h"
    1.12 +#include "nsUnicodeProperties.h"
    1.13 +#include "mozilla/Likely.h"
    1.14 +#include "mozilla/HashFunctions.h"
    1.15 +
    1.16 +// We map x -> x, except for upper-case letters,
    1.17 +// which we map to their lower-case equivalents.
    1.18 +static const uint8_t gASCIIToLower [128] = {
    1.19 +    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    1.20 +    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    1.21 +    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    1.22 +    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    1.23 +    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    1.24 +    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    1.25 +    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    1.26 +    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
    1.27 +};
    1.28 +
    1.29 +#define IS_ASCII(u)       ((u) < 0x80)
    1.30 +#define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z'))
    1.31 +#define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z'))
    1.32 +#define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
    1.33 +#define IS_ASCII_SPACE(u) (' ' == (u))
    1.34 +
    1.35 +// We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
    1.36 +// when they're called from within the case-insensitive comparators, so we
    1.37 +// define inlined versions.
    1.38 +static MOZ_ALWAYS_INLINE uint32_t
    1.39 +ToLowerCase_inline(uint32_t aChar)
    1.40 +{
    1.41 +  if (IS_ASCII(aChar)) {
    1.42 +    return gASCIIToLower[aChar];
    1.43 +  }
    1.44 +
    1.45 +  return mozilla::unicode::GetLowercase(aChar);
    1.46 +}
    1.47 +
    1.48 +static MOZ_ALWAYS_INLINE uint32_t
    1.49 +ToLowerCaseASCII_inline(const uint32_t aChar)
    1.50 +{
    1.51 +  if (IS_ASCII(aChar)) {
    1.52 +    return gASCIIToLower[aChar];
    1.53 +  }
    1.54 +
    1.55 +  return aChar;
    1.56 +}
    1.57 +
    1.58 +void
    1.59 +ToLowerCase(nsAString& aString)
    1.60 +{
    1.61 +  char16_t *buf = aString.BeginWriting();
    1.62 +  ToLowerCase(buf, buf, aString.Length());
    1.63 +}
    1.64 +
    1.65 +void
    1.66 +ToLowerCase(const nsAString& aSource,
    1.67 +            nsAString& aDest)
    1.68 +{
    1.69 +  const char16_t *in;
    1.70 +  char16_t *out;
    1.71 +  uint32_t len = NS_StringGetData(aSource, &in);
    1.72 +  NS_StringGetMutableData(aDest, len, &out);
    1.73 +  NS_ASSERTION(out, "Uh...");
    1.74 +  ToLowerCase(in, out, len);
    1.75 +}
    1.76 +
    1.77 +uint32_t
    1.78 +ToLowerCaseASCII(const uint32_t aChar)
    1.79 +{
    1.80 +  return ToLowerCaseASCII_inline(aChar);
    1.81 +}
    1.82 +
    1.83 +void
    1.84 +ToUpperCase(nsAString& aString)
    1.85 +{
    1.86 +  char16_t *buf = aString.BeginWriting();
    1.87 +  ToUpperCase(buf, buf, aString.Length());
    1.88 +}
    1.89 +
    1.90 +void
    1.91 +ToUpperCase(const nsAString& aSource,
    1.92 +            nsAString& aDest)
    1.93 +{
    1.94 +  const char16_t *in;
    1.95 +  char16_t *out;
    1.96 +  uint32_t len = NS_StringGetData(aSource, &in);
    1.97 +  NS_StringGetMutableData(aDest, len, &out);
    1.98 +  NS_ASSERTION(out, "Uh...");
    1.99 +  ToUpperCase(in, out, len);
   1.100 +}
   1.101 +
   1.102 +#ifdef MOZILLA_INTERNAL_API
   1.103 +
   1.104 +int32_t
   1.105 +nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
   1.106 +                                              const char16_t* rhs,
   1.107 +                                              uint32_t lLength,
   1.108 +                                              uint32_t rLength) const
   1.109 +{
   1.110 +  return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
   1.111 +         (lLength > rLength) ? 1 : -1;
   1.112 +}
   1.113 +
   1.114 +int32_t
   1.115 +nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
   1.116 +                                                  const char* rhs,
   1.117 +                                                  uint32_t lLength,
   1.118 +                                                  uint32_t rLength) const
   1.119 +{
   1.120 +  return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
   1.121 +}
   1.122 +
   1.123 +int32_t
   1.124 +nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
   1.125 +                                                   const char16_t* rhs,
   1.126 +                                                   uint32_t lLength,
   1.127 +                                                   uint32_t rLength) const
   1.128 +{
   1.129 +  if (lLength != rLength) {
   1.130 +    if (lLength > rLength)
   1.131 +      return 1;
   1.132 +    return -1;
   1.133 +  }
   1.134 +
   1.135 +  while (rLength) {
   1.136 +    // we don't care about surrogates here, because we're only
   1.137 +    // lowercasing the ASCII range
   1.138 +    char16_t l = *lhs++;
   1.139 +    char16_t r = *rhs++;
   1.140 +    if (l != r) {
   1.141 +      l = ToLowerCaseASCII_inline(l);
   1.142 +      r = ToLowerCaseASCII_inline(r);
   1.143 +
   1.144 +      if (l > r)
   1.145 +        return 1;
   1.146 +      else if (r > l)
   1.147 +        return -1;
   1.148 +    }
   1.149 +    rLength--;
   1.150 +  }
   1.151 +
   1.152 +  return 0;
   1.153 +}
   1.154 +
   1.155 +#endif // MOZILLA_INTERNAL_API
   1.156 +
   1.157 +uint32_t
   1.158 +ToLowerCase(uint32_t aChar)
   1.159 +{
   1.160 +  return ToLowerCase_inline(aChar);
   1.161 +}
   1.162 +
   1.163 +void
   1.164 +ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
   1.165 +{
   1.166 +  for (uint32_t i = 0; i < aLen; i++) {
   1.167 +    uint32_t ch = aIn[i];
   1.168 +    if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
   1.169 +        NS_IS_LOW_SURROGATE(aIn[i + 1])) {
   1.170 +      ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
   1.171 +      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
   1.172 +      aOut[i++] = H_SURROGATE(ch);
   1.173 +      aOut[i] = L_SURROGATE(ch);
   1.174 +      continue;
   1.175 +    }
   1.176 +    aOut[i] = ToLowerCase(ch);
   1.177 +  }
   1.178 +}
   1.179 +
   1.180 +uint32_t
   1.181 +ToUpperCase(uint32_t aChar)
   1.182 +{
   1.183 +  if (IS_ASCII(aChar)) {
   1.184 +    if (IS_ASCII_LOWER(aChar)) {
   1.185 +      return aChar - 0x20;
   1.186 +    }
   1.187 +    return aChar;
   1.188 +  }
   1.189 +
   1.190 +  return mozilla::unicode::GetUppercase(aChar);
   1.191 +}
   1.192 +
   1.193 +void
   1.194 +ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
   1.195 +{
   1.196 +  for (uint32_t i = 0; i < aLen; i++) {
   1.197 +    uint32_t ch = aIn[i];
   1.198 +    if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
   1.199 +        NS_IS_LOW_SURROGATE(aIn[i + 1])) {
   1.200 +      ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
   1.201 +      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
   1.202 +      aOut[i++] = H_SURROGATE(ch);
   1.203 +      aOut[i] = L_SURROGATE(ch);
   1.204 +      continue;
   1.205 +    }
   1.206 +    aOut[i] = ToUpperCase(ch);
   1.207 +  }
   1.208 +}
   1.209 +
   1.210 +uint32_t
   1.211 +ToTitleCase(uint32_t aChar)
   1.212 +{
   1.213 +  if (IS_ASCII(aChar)) {
   1.214 +    return ToUpperCase(aChar);
   1.215 +  }
   1.216 +
   1.217 +  return mozilla::unicode::GetTitlecaseForLower(aChar);
   1.218 +}
   1.219 +
   1.220 +int32_t
   1.221 +CaseInsensitiveCompare(const char16_t *a,
   1.222 +                       const char16_t *b,
   1.223 +                       uint32_t len)
   1.224 +{
   1.225 +  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
   1.226 +
   1.227 +  if (len) {
   1.228 +    do {
   1.229 +      uint32_t c1 = *a++;
   1.230 +      uint32_t c2 = *b++;
   1.231 +
   1.232 +      // Unfortunately, we need to check for surrogates BEFORE we check
   1.233 +      // for equality, because we could have identical high surrogates
   1.234 +      // but non-identical characters, so we can't just skip them
   1.235 +
   1.236 +      // If c1 isn't a surrogate, we don't bother to check c2;
   1.237 +      // in the case where it _is_ a surrogate, we're definitely going to get
   1.238 +      // a mismatch, and don't need to interpret and lowercase it
   1.239 +
   1.240 +      if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
   1.241 +        c1 = SURROGATE_TO_UCS4(c1, *a++);
   1.242 +        if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
   1.243 +          c2 = SURROGATE_TO_UCS4(c2, *b++);
   1.244 +        }
   1.245 +        // If c2 wasn't a surrogate, decrementing len means we'd stop
   1.246 +        // short of the end of string b, but that doesn't actually matter
   1.247 +        // because we're going to find a mismatch and return early
   1.248 +        --len;
   1.249 +      }
   1.250 +
   1.251 +      if (c1 != c2) {
   1.252 +        c1 = ToLowerCase_inline(c1);
   1.253 +        c2 = ToLowerCase_inline(c2);
   1.254 +        if (c1 != c2) {
   1.255 +          if (c1 < c2) {
   1.256 +            return -1;
   1.257 +          }
   1.258 +          return 1;
   1.259 +        }
   1.260 +      }
   1.261 +    } while (--len != 0);
   1.262 +  }
   1.263 +  return 0;
   1.264 +}
   1.265 +
   1.266 +// Calculates the codepoint of the UTF8 sequence starting at aStr.  Sets aNext
   1.267 +// to the byte following the end of the sequence.
   1.268 +//
   1.269 +// If the sequence is invalid, or if computing the codepoint would take us off
   1.270 +// the end of the string (as marked by aEnd), returns -1 and does not set
   1.271 +// aNext.  Note that this function doesn't check that aStr < aEnd -- it assumes
   1.272 +// you've done that already.
   1.273 +static MOZ_ALWAYS_INLINE uint32_t
   1.274 +GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
   1.275 +{
   1.276 +  // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
   1.277 +  // sign extend.
   1.278 +  const unsigned char *str = (unsigned char*)aStr;
   1.279 +
   1.280 +  if (UTF8traits::isASCII(str[0])) {
   1.281 +    // It's ASCII; just convert to lower-case and return it.
   1.282 +    *aNext = aStr + 1;
   1.283 +    return gASCIIToLower[*str];
   1.284 +  }
   1.285 +  if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
   1.286 +    // It's a two-byte sequence, so it looks like
   1.287 +    //  110XXXXX 10XXXXXX.
   1.288 +    // This is definitely in the BMP, so we can store straightaway into a
   1.289 +    // uint16_t.
   1.290 +
   1.291 +    uint16_t c;
   1.292 +    c  = (str[0] & 0x1F) << 6;
   1.293 +    c += (str[1] & 0x3F);
   1.294 +
   1.295 +    // we don't go through ToLowerCase here, because we know this isn't
   1.296 +    // an ASCII character so the ASCII fast-path there is useless
   1.297 +    c = mozilla::unicode::GetLowercase(c);
   1.298 +
   1.299 +    *aNext = aStr + 2;
   1.300 +    return c;
   1.301 +  }
   1.302 +  if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
   1.303 +    // It's a three-byte sequence, so it looks like
   1.304 +    //  1110XXXX 10XXXXXX 10XXXXXX.
   1.305 +    // This will just barely fit into 16-bits, so store into a uint16_t.
   1.306 +
   1.307 +    uint16_t c;
   1.308 +    c  = (str[0] & 0x0F) << 12;
   1.309 +    c += (str[1] & 0x3F) << 6;
   1.310 +    c += (str[2] & 0x3F);
   1.311 +
   1.312 +    c = mozilla::unicode::GetLowercase(c);
   1.313 +
   1.314 +    *aNext = aStr + 3;
   1.315 +    return c;
   1.316 +  }
   1.317 +  if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
   1.318 +    // It's a four-byte sequence, so it looks like
   1.319 +    //   11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
   1.320 +
   1.321 +    uint32_t c;
   1.322 +    c  = (str[0] & 0x07) << 18;
   1.323 +    c += (str[1] & 0x3F) << 12;
   1.324 +    c += (str[2] & 0x3F) << 6;
   1.325 +    c += (str[3] & 0x3F);
   1.326 +
   1.327 +    c = mozilla::unicode::GetLowercase(c);
   1.328 +
   1.329 +    *aNext = aStr + 4;
   1.330 +    return c;
   1.331 +  }
   1.332 +
   1.333 +  // Hm, we don't understand this sequence.
   1.334 +  return -1;
   1.335 +}
   1.336 +
   1.337 +int32_t CaseInsensitiveCompare(const char *aLeft,
   1.338 +                               const char *aRight,
   1.339 +                               uint32_t aLeftBytes,
   1.340 +                               uint32_t aRightBytes)
   1.341 +{
   1.342 +  const char *leftEnd = aLeft + aLeftBytes;
   1.343 +  const char *rightEnd = aRight + aRightBytes;
   1.344 +
   1.345 +  while (aLeft < leftEnd && aRight < rightEnd) {
   1.346 +    uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
   1.347 +    if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
   1.348 +      return -1;
   1.349 +
   1.350 +    uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
   1.351 +    if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
   1.352 +      return -1;
   1.353 +
   1.354 +    // Now leftChar and rightChar are lower-case, so we can compare them.
   1.355 +    if (leftChar != rightChar) {
   1.356 +      if (leftChar > rightChar)
   1.357 +        return 1;
   1.358 +      return -1;
   1.359 +    }
   1.360 +  }
   1.361 +
   1.362 +  // Make sure that if one string is longer than the other we return the
   1.363 +  // correct result.
   1.364 +  if (aLeft < leftEnd)
   1.365 +    return 1;
   1.366 +  if (aRight < rightEnd)
   1.367 +    return -1;
   1.368 +
   1.369 +  return 0;
   1.370 +}
   1.371 +
   1.372 +bool
   1.373 +CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
   1.374 +                              const char* aLeftEnd, const char* aRightEnd,
   1.375 +                              const char** aLeftNext, const char** aRightNext,
   1.376 +                              bool* aErr)
   1.377 +{
   1.378 +  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
   1.379 +  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
   1.380 +  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
   1.381 +  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
   1.382 +  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
   1.383 +
   1.384 +  uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
   1.385 +  if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
   1.386 +    *aErr = true;
   1.387 +    return false;
   1.388 +  }
   1.389 +
   1.390 +  uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
   1.391 +  if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
   1.392 +    *aErr = true;
   1.393 +    return false;
   1.394 +  }
   1.395 +
   1.396 +  // Can't have an error past this point.
   1.397 +  *aErr = false;
   1.398 +
   1.399 +  return leftChar == rightChar;
   1.400 +}
   1.401 +
   1.402 +namespace mozilla {
   1.403 +
   1.404 +uint32_t
   1.405 +HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
   1.406 +{
   1.407 +  uint32_t hash = 0;
   1.408 +  const char* s = aUTF8;
   1.409 +  const char* end = aUTF8 + aLength;
   1.410 +
   1.411 +  *aErr = false;
   1.412 +
   1.413 +  while (s < end)
   1.414 +  {
   1.415 +    uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
   1.416 +    if (*aErr) {
   1.417 +      return 0;
   1.418 +    }
   1.419 +
   1.420 +    if (ucs4 < PLANE1_BASE) {
   1.421 +      hash = AddToHash(hash, ucs4);
   1.422 +    }
   1.423 +    else {
   1.424 +      hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
   1.425 +    }
   1.426 +  }
   1.427 +
   1.428 +  return hash;
   1.429 +}
   1.430 +
   1.431 +} // namespace mozilla

mercurial