The Tor Browser: xpcom/string/public/nsCharTraits.h@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

     2 /* This Source Code Form is subject to the terms of the Mozilla Public

     3  * License, v. 2.0. If a copy of the MPL was not distributed with this

     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     6 #ifndef nsCharTraits_h___

     7 #define nsCharTraits_h___

     9 #include <ctype.h> // for |EOF|, |WEOF|

    10 #include <string.h> // for |memcpy|, et al

    12 #include "nscore.h" // for |char16_t|

    14 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in

    15 // particular the standalone software updater. In that case stub out

    16 // the macros provided by nsDebug.h, which are only usable when linking XPCOM.

       // Each stub below expands to nothing, so its arguments are never evaluated.

    18 #ifdef NS_NO_XPCOM

    19 #define NS_WARNING(msg)

    20 #define NS_ASSERTION(cond, msg)

    21 #define NS_ERROR(msg)

    22 #else

    23 #include "nsDebug.h"  // for NS_ASSERTION

    24 #endif

    26 /*

    27  * Some macros for converting char16_t (UTF-16) to and from Unicode scalar

    28  * values.

    29  *

    30  * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by

    31  * using "surrogate pairs". These consist of a high surrogate, i.e. a code

    32  * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point

    33  * in the range U+DC00 - U+DFFF, like this:

    34  *

    35  *  U+D800 U+DC00 =  U+10000

    36  *  U+D800 U+DC01 =  U+10001

    37  *  ...

    38  *  U+DBFF U+DFFE = U+10FFFE

    39  *  U+DBFF U+DFFF = U+10FFFF

    40  *

    41  * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode

    42  * scalar values and are not well-formed UTF-16 except as high-surrogate /

    43  * low-surrogate pairs.

    44  */

    // PLANE1_BASE is the first value that is no longer in the BMP (see IS_IN_BMP).

    46 #define PLANE1_BASE          uint32_t(0x00010000)

    47 // High surrogates are in the range 0xD800 -- 0xDBFF

    48 #define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800)

    49 // Low surrogates are in the range 0xDC00 -- 0xDFFF

    50 #define NS_IS_LOW_SURROGATE(u)  ((uint32_t(u) & 0xFFFFFC00) == 0xDC00)

    51 // Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE

    52 #define IS_SURROGATE(u)      ((uint32_t(u) & 0xFFFFF800) == 0xD800)

    54 // Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF

    56 // N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)

    57 // I wonder whether we could somehow assert that H is a high surrogate

    58 // and L is a low surrogate

       // Only the low 10 bits of each argument are used; the arguments are not
       // checked for actually being surrogates.

    59 #define SURROGATE_TO_UCS4(h, l) (((uint32_t(h) & 0x03FF) << 10) + \

    60                                  (uint32_t(l) & 0x03FF) + PLANE1_BASE)

    62 // Extract surrogates from a UCS4 char

    63 // Reference: the Unicode standard 4.0, section 3.9

    64 // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and

    65 // 0xD7C0 == 0xD800 - 0x0080,

    66 // ((c - 0x10000) >> 10) + 0xD800 can be simplified to

    67 #define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + \

    68                                  char16_t(0xD7C0))

    69 // where it's to be noted that 0xD7C0 is not bitwise-OR'd

    70 // but added.

    72 // Since 0x10000 & 0x03FF == 0,

    73 // (c - 0x10000) & 0x03FF == c & 0x03FF so that

    74 // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to

    75 #define L_SURROGATE(c) char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | \

    76                                  char16_t(0xDC00))

       // True when the value is below PLANE1_BASE, i.e. fits in one UTF-16 unit.

    78 #define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE)

       // U+FFFD, substituted by ENSURE_VALID_CHAR for values that fail IS_VALID_CHAR.

    79 #define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)

       // One past the largest scalar value (U+10FFFF).

    81 #define UCS_END uint32_t(0x00110000)

       // Valid means: below UCS_END and not a surrogate code point.

    82 #define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c))

       // Yields c when it is valid, otherwise UCS2_REPLACEMENT_CHAR.  The argument
       // is expanded more than once, so do not pass an expression with side effects.

    83 #define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)

    // Primary template: empty.  The traits members are supplied by the
    // explicit specializations for char16_t and char.
    85 template <class CharT> struct nsCharTraits {};

    87 template <>

    88 struct nsCharTraits<char16_t>

    89   {

    90     typedef char16_t char_type;

    91     typedef uint16_t  unsigned_char_type;

    92     typedef char      incompatible_char_type;

    94     static char_type* const sEmptyBuffer;

    96     static

    97     void

    98     assign( char_type& lhs, char_type rhs )

    99       {

   100         lhs = rhs;

   101       }

   104       // integer representation of characters:

   105     typedef int int_type;

   107     static

   108     char_type

   109     to_char_type( int_type c )

   110       {

   111         return char_type(c);

   112       }

   114     static

   115     int_type

   116     to_int_type( char_type c )

   117       {

   118         return int_type( static_cast<unsigned_char_type>(c) );

   119       }

   121     static

   122     bool

   123     eq_int_type( int_type lhs, int_type rhs )

   124       {

   125         return lhs == rhs;

   126       }

   129       // |char_type| comparisons:

   131     static

   132     bool

   133     eq( char_type lhs, char_type rhs )

   134       {

   135         return lhs == rhs;

   136       }

   138     static

   139     bool

   140     lt( char_type lhs, char_type rhs )

   141       {

   142         return lhs < rhs;

   143       }

   146       // operations on s[n] arrays:

   148     static

   149     char_type*

   150     move( char_type* s1, const char_type* s2, size_t n )

   151       {

   152         return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type)));

   153       }

   155     static

   156     char_type*

   157     copy( char_type* s1, const char_type* s2, size_t n )

   158       {

   159         return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type)));

   160       }

   162     static

   163     char_type*

   164     copyASCII( char_type* s1, const char* s2, size_t n )

   165       {

   166         for (char_type* s = s1; n--; ++s, ++s2) {

   167           NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   168           *s = *s2;

   169         }

   170         return s1;

   171       }

   173     static

   174     char_type*

   175     assign( char_type* s, size_t n, char_type c )

   176       {

   177         char_type* result = s;

   178         while ( n-- )

   179           assign(*s++, c);

   180         return result;

   181       }

   183     static

   184     int

   185     compare( const char_type* s1, const char_type* s2, size_t n )

   186       {

   187         for ( ; n--; ++s1, ++s2 )

   188           {

   189             if ( !eq(*s1, *s2) )

   190               return to_int_type(*s1) - to_int_type(*s2);

   191           }

   193         return 0;

   194       }

   196     static

   197     int

   198     compareASCII( const char_type* s1, const char* s2, size_t n )

   199       {

   200         for ( ; n--; ++s1, ++s2 )

   201           {

   202             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   203             if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) )

   204               return to_int_type(*s1) - to_int_type(*s2);

   205           }

   207         return 0;

   208       }

   210     // this version assumes that s2 is null-terminated and s1 has length n.

   211     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,

   212     // we return 1.

   213     static

   214     int

   215     compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )

   216       {

   217         for ( ; n--; ++s1, ++s2 )

   218           {

   219             if ( !*s2 )

   220               return 1;

   221             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   222             if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) )

   223               return to_int_type(*s1) - to_int_type(*s2);

   224           }

   226         if ( *s2 )

   227           return -1;

   229         return 0;

   230       }

   232     /**

   233      * Convert c to its lower-case form, but only if c is in the ASCII

   234      * range. Otherwise leave it alone.

   235      */

   236     static

   237     char_type

   238     ASCIIToLower( char_type c )

   239       {

   240         if (c >= 'A' && c <= 'Z')

   241           return char_type(c + ('a' - 'A'));

   243         return c;

   244       }

   246     static

   247     int

   248     compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n )

   249       {

   250         for ( ; n--; ++s1, ++s2 )

   251           {

   252             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   253             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),

   254                          "Unexpected uppercase character");

   255             char_type lower_s1 = ASCIIToLower(*s1);

   256             if ( lower_s1 != to_char_type(*s2) )

   257               return to_int_type(lower_s1) - to_int_type(*s2);

   258           }

   260         return 0;

   261       }

   263     // this version assumes that s2 is null-terminated and s1 has length n.

   264     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,

   265     // we return 1.

   266     static

   267     int

   268     compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )

   269       {

   270         for ( ; n--; ++s1, ++s2 )

   271           {

   272             if ( !*s2 )

   273               return 1;

   274             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   275             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),

   276                          "Unexpected uppercase character");

   277             char_type lower_s1 = ASCIIToLower(*s1);

   278             if ( lower_s1 != to_char_type(*s2) )

   279               return to_int_type(lower_s1) - to_int_type(*s2);

   280           }

   282         if ( *s2 )

   283           return -1;

   285         return 0;

   286       }

   288     static

   289     size_t

   290     length( const char_type* s )

   291       {

   292         size_t result = 0;

   293         while ( !eq(*s++, char_type(0)) )

   294           ++result;

   295         return result;

   296       }

   298     static

   299     const char_type*

   300     find( const char_type* s, size_t n, char_type c )

   301       {

   302         while ( n-- )

   303           {

   304             if ( eq(*s, c) )

   305               return s;

   306             ++s;

   307           }

   309         return 0;

   310       }

   311   };

   313 template <>

   314 struct nsCharTraits<char>

   315   {

   316     typedef char           char_type;

   317     typedef unsigned char  unsigned_char_type;

   318     typedef char16_t      incompatible_char_type;

   320     static char_type* const sEmptyBuffer;

   322     static

   323     void

   324     assign( char_type& lhs, char_type rhs )

   325       {

   326         lhs = rhs;

   327       }

   330       // integer representation of characters:

   332     typedef int int_type;

   334     static

   335     char_type

   336     to_char_type( int_type c )

   337       {

   338         return char_type(c);

   339       }

   341     static

   342     int_type

   343     to_int_type( char_type c )

   344       {

   345         return int_type( static_cast<unsigned_char_type>(c) );

   346       }

   348     static

   349     bool

   350     eq_int_type( int_type lhs, int_type rhs )

   351       {

   352         return lhs == rhs;

   353       }

   356       // |char_type| comparisons:

   358     static

   359     bool

   360     eq( char_type lhs, char_type rhs )

   361       {

   362         return lhs == rhs;

   363       }

   365     static

   366     bool

   367     lt( char_type lhs, char_type rhs )

   368       {

   369         return lhs < rhs;

   370       }

   373       // operations on s[n] arrays:

   375     static

   376     char_type*

   377     move( char_type* s1, const char_type* s2, size_t n )

   378       {

   379         return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type)));

   380       }

   382     static

   383     char_type*

   384     copy( char_type* s1, const char_type* s2, size_t n )

   385       {

   386         return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type)));

   387       }

   389     static

   390     char_type*

   391     copyASCII( char_type* s1, const char* s2, size_t n )

   392       {

   393         return copy(s1, s2, n);

   394       }

   396     static

   397     char_type*

   398     assign( char_type* s, size_t n, char_type c )

   399       {

   400         return static_cast<char_type*>(memset(s, to_int_type(c), n));

   401       }

   403     static

   404     int

   405     compare( const char_type* s1, const char_type* s2, size_t n )

   406       {

   407         return memcmp(s1, s2, n);

   408       }

   410     static

   411     int

   412     compareASCII( const char_type* s1, const char* s2, size_t n )

   413       {

   414 #ifdef DEBUG

   415         for (size_t i = 0; i < n; ++i)

   416           {

   417             NS_ASSERTION(!(s2[i] & ~0x7F), "Unexpected non-ASCII character");

   418           }

   419 #endif

   420         return compare(s1, s2, n);

   421       }

   423     // this version assumes that s2 is null-terminated and s1 has length n.

   424     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,

   425     // we return 1.

   426     static

   427     int

   428     compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )

   429       {

   430         // can't use strcmp here because we don't want to stop when s1

   431         // contains a null

   432         for ( ; n--; ++s1, ++s2 )

   433           {

   434             if ( !*s2 )

   435               return 1;

   436             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   437             if ( *s1 != *s2 )

   438               return to_int_type(*s1) - to_int_type(*s2);

   439           }

   441         if ( *s2 )

   442           return -1;

   444         return 0;

   445       }

   447     /**

   448      * Convert c to its lower-case form, but only if c is ASCII.

   449      */

   450     static

   451     char_type

   452     ASCIIToLower( char_type c )

   453       {

   454         if (c >= 'A' && c <= 'Z')

   455           return char_type(c + ('a' - 'A'));

   457         return c;

   458       }

   460     static

   461     int

   462     compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n )

   463       {

   464         for ( ; n--; ++s1, ++s2 )

   465           {

   466             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   467             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),

   468                          "Unexpected uppercase character");

   469             char_type lower_s1 = ASCIIToLower(*s1);

   470             if ( lower_s1 != *s2 )

   471               return to_int_type(lower_s1) - to_int_type(*s2);

   472           }

   473         return 0;

   474       }

   476     // this version assumes that s2 is null-terminated and s1 has length n.

   477     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,

   478     // we return 1.

   479     static

   480     int

   481     compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )

   482       {

   483         for ( ; n--; ++s1, ++s2 )

   484           {

   485             if ( !*s2 )

   486               return 1;

   487             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");

   488             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),

   489                          "Unexpected uppercase character");

   490             char_type lower_s1 = ASCIIToLower(*s1);

   491             if ( lower_s1 != *s2 )

   492               return to_int_type(lower_s1) - to_int_type(*s2);

   493           }

   495         if ( *s2 )

   496           return -1;

   498         return 0;

   499       }

   501     static

   502     size_t

   503     length( const char_type* s )

   504       {

   505         return strlen(s);

   506       }

   508     static

   509     const char_type*

   510     find( const char_type* s, size_t n, char_type c )

   511       {

   512         return reinterpret_cast<const char_type*>(memchr(s, to_int_type(c), n));

   513       }

   514   };

   /**
    * Read-side adapter for iterator classes.  The iterator type must provide
    * |difference_type|, |value_type|, |get()| and |advance(n)|.
    */
   template <class InputIterator>
   struct nsCharSourceTraits
   {
     typedef typename InputIterator::difference_type difference_type;

     /** Number of elements between |first| and |last|, as a uint32_t. */
     static uint32_t readable_distance(const InputIterator& first, const InputIterator& last)
     {
       // assumes single fragment
       return uint32_t(last.get() - first.get());
     }

     /** Pointer to the element |iter| currently refers to. */
     static const typename InputIterator::value_type* read(const InputIterator& iter)
     {
       return iter.get();
     }

     /** Move |s| by |n| elements by calling its own advance(). */
     static void advance(InputIterator& s, difference_type n)
     {
       s.advance(n);
     }
   };

   544 template <class CharT>

   545 struct nsCharSourceTraits<CharT*>

   546   {

   547     typedef ptrdiff_t difference_type;

   549     static

   550     uint32_t

   551     readable_distance( CharT* s )

   552       {

   553         return uint32_t(nsCharTraits<CharT>::length(s));

   554 //      return numeric_limits<uint32_t>::max();

   555       }

   557     static

   558     uint32_t

   559     readable_distance( CharT* first, CharT* last )

   560       {

   561         return uint32_t(last-first);

   562       }

   564     static

   565     const CharT*

   566     read( CharT* s )

   567       {

   568         return s;

   569       }

   571     static

   572     void

   573     advance( CharT*& s, difference_type n )

   574       {

   575         s += n;

   576       }

   577   };

   /**
    * Write-side adapter for iterator classes.  The iterator type must provide
    * |value_type| and |write(s, n)|.
    */
   template <class OutputIterator>
   struct nsCharSinkTraits
   {
     /** Hand |n| elements starting at |s| to the iterator's own write(). */
     static void write(OutputIterator& iter, const typename OutputIterator::value_type* s, uint32_t n)
     {
       iter.write(s, n);
     }
   };

   590 template <class CharT>

   591 struct nsCharSinkTraits<CharT*>

   592   {

   593     static

   594     void

   595     write( CharT*& iter, const CharT* s, uint32_t n )

   596       {

   597         nsCharTraits<CharT>::move(iter, s, n);

   598         iter += n;

   599       }

   600   };

   602 #endif // !defined(nsCharTraits_h___)

The Tor Browser / file revision

xpcom/string/public/nsCharTraits.h@b8a032363ba2

xpcom/string/public/nsCharTraits.h