The Tor Browser: intl/unicharutil/util/nsUnicharUtils.cpp@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

     2 /* This Source Code Form is subject to the terms of the Mozilla Public

     3  * License, v. 2.0. If a copy of the MPL was not distributed with this

     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     6 #include "nsUnicharUtils.h"

     7 #include "nsXPCOMStrings.h"

     8 #include "nsUTF8Utils.h"

     9 #include "nsUnicodeProperties.h"

    10 #include "mozilla/Likely.h"

    11 #include "mozilla/HashFunctions.h"

    13 // We map x -> x, except for upper-case letters,

    14 // which we map to their lower-case equivalents.

    15 static const uint8_t gASCIIToLower [128] = {

    16     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

    17     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,

    18     0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,

    19     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,

    20     0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,

    21     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,

    22     0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,

    23     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,

    24 };

    26 #define IS_ASCII(u)       ((u) < 0x80)

    27 #define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z'))

    28 #define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z'))

    29 #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))

    30 #define IS_ASCII_SPACE(u) (' ' == (u))

    32 // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast

    33 // when they're called from within the case-insensitive comparators, so we

    34 // define inlined versions.

    35 static MOZ_ALWAYS_INLINE uint32_t

    36 ToLowerCase_inline(uint32_t aChar)

    37 {

    38   if (IS_ASCII(aChar)) {

    39     return gASCIIToLower[aChar];

    40   }

    42   return mozilla::unicode::GetLowercase(aChar);

    43 }

    45 static MOZ_ALWAYS_INLINE uint32_t

    46 ToLowerCaseASCII_inline(const uint32_t aChar)

    47 {

    48   if (IS_ASCII(aChar)) {

    49     return gASCIIToLower[aChar];

    50   }

    52   return aChar;

    53 }

    55 void

    56 ToLowerCase(nsAString& aString)

    57 {

    58   char16_t *buf = aString.BeginWriting();

    59   ToLowerCase(buf, buf, aString.Length());

    60 }

    62 void

    63 ToLowerCase(const nsAString& aSource,

    64             nsAString& aDest)

    65 {

    66   const char16_t *in;

    67   char16_t *out;

    68   uint32_t len = NS_StringGetData(aSource, &in);

    69   NS_StringGetMutableData(aDest, len, &out);

    70   NS_ASSERTION(out, "Uh...");

    71   ToLowerCase(in, out, len);

    72 }

    74 uint32_t

    75 ToLowerCaseASCII(const uint32_t aChar)

    76 {

    77   return ToLowerCaseASCII_inline(aChar);

    78 }

    80 void

    81 ToUpperCase(nsAString& aString)

    82 {

    83   char16_t *buf = aString.BeginWriting();

    84   ToUpperCase(buf, buf, aString.Length());

    85 }

    87 void

    88 ToUpperCase(const nsAString& aSource,

    89             nsAString& aDest)

    90 {

    91   const char16_t *in;

    92   char16_t *out;

    93   uint32_t len = NS_StringGetData(aSource, &in);

    94   NS_StringGetMutableData(aDest, len, &out);

    95   NS_ASSERTION(out, "Uh...");

    96   ToUpperCase(in, out, len);

    97 }

    99 #ifdef MOZILLA_INTERNAL_API

   101 int32_t

   102 nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,

   103                                               const char16_t* rhs,

   104                                               uint32_t lLength,

   105                                               uint32_t rLength) const

   106 {

   107   return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :

   108          (lLength > rLength) ? 1 : -1;

   109 }

   111 int32_t

   112 nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,

   113                                                   const char* rhs,

   114                                                   uint32_t lLength,

   115                                                   uint32_t rLength) const

   116 {

   117   return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);

   118 }

   120 int32_t

   121 nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,

   122                                                    const char16_t* rhs,

   123                                                    uint32_t lLength,

   124                                                    uint32_t rLength) const

   125 {

   126   if (lLength != rLength) {

   127     if (lLength > rLength)

   128       return 1;

   129     return -1;

   130   }

   132   while (rLength) {

   133     // we don't care about surrogates here, because we're only

   134     // lowercasing the ASCII range

   135     char16_t l = *lhs++;

   136     char16_t r = *rhs++;

   137     if (l != r) {

   138       l = ToLowerCaseASCII_inline(l);

   139       r = ToLowerCaseASCII_inline(r);

   141       if (l > r)

   142         return 1;

   143       else if (r > l)

   144         return -1;

   145     }

   146     rLength--;

   147   }

   149   return 0;

   150 }

   152 #endif // MOZILLA_INTERNAL_API

   154 uint32_t

   155 ToLowerCase(uint32_t aChar)

   156 {

   157   return ToLowerCase_inline(aChar);

   158 }

   160 void

   161 ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)

   162 {

   163   for (uint32_t i = 0; i < aLen; i++) {

   164     uint32_t ch = aIn[i];

   165     if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&

   166         NS_IS_LOW_SURROGATE(aIn[i + 1])) {

   167       ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));

   168       NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");

   169       aOut[i++] = H_SURROGATE(ch);

   170       aOut[i] = L_SURROGATE(ch);

   171       continue;

   172     }

   173     aOut[i] = ToLowerCase(ch);

   174   }

   175 }

   177 uint32_t

   178 ToUpperCase(uint32_t aChar)

   179 {

   180   if (IS_ASCII(aChar)) {

   181     if (IS_ASCII_LOWER(aChar)) {

   182       return aChar - 0x20;

   183     }

   184     return aChar;

   185   }

   187   return mozilla::unicode::GetUppercase(aChar);

   188 }

   190 void

   191 ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)

   192 {

   193   for (uint32_t i = 0; i < aLen; i++) {

   194     uint32_t ch = aIn[i];

   195     if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&

   196         NS_IS_LOW_SURROGATE(aIn[i + 1])) {

   197       ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));

   198       NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");

   199       aOut[i++] = H_SURROGATE(ch);

   200       aOut[i] = L_SURROGATE(ch);

   201       continue;

   202     }

   203     aOut[i] = ToUpperCase(ch);

   204   }

   205 }

   207 uint32_t

   208 ToTitleCase(uint32_t aChar)

   209 {

   210   if (IS_ASCII(aChar)) {

   211     return ToUpperCase(aChar);

   212   }

   214   return mozilla::unicode::GetTitlecaseForLower(aChar);

   215 }

   217 int32_t

   218 CaseInsensitiveCompare(const char16_t *a,

   219                        const char16_t *b,

   220                        uint32_t len)

   221 {

   222   NS_ASSERTION(a && b, "Do not pass in invalid pointers!");

   224   if (len) {

   225     do {

   226       uint32_t c1 = *a++;

   227       uint32_t c2 = *b++;

   229       // Unfortunately, we need to check for surrogates BEFORE we check

   230       // for equality, because we could have identical high surrogates

   231       // but non-identical characters, so we can't just skip them

   233       // If c1 isn't a surrogate, we don't bother to check c2;

   234       // in the case where it _is_ a surrogate, we're definitely going to get

   235       // a mismatch, and don't need to interpret and lowercase it

   237       if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {

   238         c1 = SURROGATE_TO_UCS4(c1, *a++);

   239         if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {

   240           c2 = SURROGATE_TO_UCS4(c2, *b++);

   241         }

   242         // If c2 wasn't a surrogate, decrementing len means we'd stop

   243         // short of the end of string b, but that doesn't actually matter

   244         // because we're going to find a mismatch and return early

   245         --len;

   246       }

   248       if (c1 != c2) {

   249         c1 = ToLowerCase_inline(c1);

   250         c2 = ToLowerCase_inline(c2);

   251         if (c1 != c2) {

   252           if (c1 < c2) {

   253             return -1;

   254           }

   255           return 1;

   256         }

   257       }

   258     } while (--len != 0);

   259   }

   260   return 0;

   261 }

   263 // Calculates the codepoint of the UTF8 sequence starting at aStr.  Sets aNext

   264 // to the byte following the end of the sequence.

   265 //

   266 // If the sequence is invalid, or if computing the codepoint would take us off

   267 // the end of the string (as marked by aEnd), returns -1 and does not set

   268 // aNext.  Note that this function doesn't check that aStr < aEnd -- it assumes

   269 // you've done that already.

   270 static MOZ_ALWAYS_INLINE uint32_t

   271 GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)

   272 {

   273   // Convert to unsigned char so that stuffing chars into PRUint32s doesn't

   274   // sign extend.

   275   const unsigned char *str = (unsigned char*)aStr;

   277   if (UTF8traits::isASCII(str[0])) {

   278     // It's ASCII; just convert to lower-case and return it.

   279     *aNext = aStr + 1;

   280     return gASCIIToLower[*str];

   281   }

   282   if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {

   283     // It's a two-byte sequence, so it looks like

   284     //  110XXXXX 10XXXXXX.

   285     // This is definitely in the BMP, so we can store straightaway into a

   286     // uint16_t.

   288     uint16_t c;

   289     c  = (str[0] & 0x1F) << 6;

   290     c += (str[1] & 0x3F);

   292     // we don't go through ToLowerCase here, because we know this isn't

   293     // an ASCII character so the ASCII fast-path there is useless

   294     c = mozilla::unicode::GetLowercase(c);

   296     *aNext = aStr + 2;

   297     return c;

   298   }

   299   if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {

   300     // It's a three-byte sequence, so it looks like

   301     //  1110XXXX 10XXXXXX 10XXXXXX.

   302     // This will just barely fit into 16-bits, so store into a uint16_t.

   304     uint16_t c;

   305     c  = (str[0] & 0x0F) << 12;

   306     c += (str[1] & 0x3F) << 6;

   307     c += (str[2] & 0x3F);

   309     c = mozilla::unicode::GetLowercase(c);

   311     *aNext = aStr + 3;

   312     return c;

   313   }

   314   if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {

   315     // It's a four-byte sequence, so it looks like

   316     //   11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.

   318     uint32_t c;

   319     c  = (str[0] & 0x07) << 18;

   320     c += (str[1] & 0x3F) << 12;

   321     c += (str[2] & 0x3F) << 6;

   322     c += (str[3] & 0x3F);

   324     c = mozilla::unicode::GetLowercase(c);

   326     *aNext = aStr + 4;

   327     return c;

   328   }

   330   // Hm, we don't understand this sequence.

   331   return -1;

   332 }

   334 int32_t CaseInsensitiveCompare(const char *aLeft,

   335                                const char *aRight,

   336                                uint32_t aLeftBytes,

   337                                uint32_t aRightBytes)

   338 {

   339   const char *leftEnd = aLeft + aLeftBytes;

   340   const char *rightEnd = aRight + aRightBytes;

   342   while (aLeft < leftEnd && aRight < rightEnd) {

   343     uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);

   344     if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))

   345       return -1;

   347     uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);

   348     if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))

   349       return -1;

   351     // Now leftChar and rightChar are lower-case, so we can compare them.

   352     if (leftChar != rightChar) {

   353       if (leftChar > rightChar)

   354         return 1;

   355       return -1;

   356     }

   357   }

   359   // Make sure that if one string is longer than the other we return the

   360   // correct result.

   361   if (aLeft < leftEnd)

   362     return 1;

   363   if (aRight < rightEnd)

   364     return -1;

   366   return 0;

   367 }

   369 bool

   370 CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,

   371                               const char* aLeftEnd, const char* aRightEnd,

   372                               const char** aLeftNext, const char** aRightNext,

   373                               bool* aErr)

   374 {

   375   NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");

   376   NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");

   377   NS_ASSERTION(aErr, "Out pointer shouldn't be null.");

   378   NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");

   379   NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");

   381   uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);

   382   if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {

   383     *aErr = true;

   384     return false;

   385   }

   387   uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);

   388   if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {

   389     *aErr = true;

   390     return false;

   391   }

   393   // Can't have an error past this point.

   394   *aErr = false;

   396   return leftChar == rightChar;

   397 }

   399 namespace mozilla {

   401 uint32_t

   402 HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)

   403 {

   404   uint32_t hash = 0;

   405   const char* s = aUTF8;

   406   const char* end = aUTF8 + aLength;

   408   *aErr = false;

   410   while (s < end)

   411   {

   412     uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);

   413     if (*aErr) {

   414       return 0;

   415     }

   417     if (ucs4 < PLANE1_BASE) {

   418       hash = AddToHash(hash, ucs4);

   419     }

   420     else {

   421       hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));

   422     }

   423   }

   425   return hash;

   426 }

   428 } // namespace mozilla

The Tor Browser / file revision

intl/unicharutil/util/nsUnicharUtils.cpp@6474c204b198

intl/unicharutil/util/nsUnicharUtils.cpp