michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: #ifndef nsCharTraits_h___
michael@0: #define nsCharTraits_h___
michael@0: 
michael@0: #include <ctype.h> // for |EOF|, |WEOF|
michael@0: #include <string.h> // for |memcpy|, et al
michael@0: 
michael@0: #include "nscore.h" // for |char16_t|
michael@0: 
michael@0: // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
michael@0: // particular the standalone software updater. In that case stub out
michael@0: // the macros provided by nsDebug.h which are only usable when linking XPCOM
michael@0: 
michael@0: #ifdef NS_NO_XPCOM
michael@0: #define NS_WARNING(msg)
michael@0: #define NS_ASSERTION(cond, msg)
michael@0: #define NS_ERROR(msg)
michael@0: #else
michael@0: #include "nsDebug.h"  // for NS_ASSERTION
michael@0: #endif
michael@0: 
michael@0: /*
michael@0:  * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
michael@0:  * values.
michael@0:  *
michael@0:  * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
michael@0:  * using "surrogate pairs". These consist of a high surrogate, i.e. a code
michael@0:  * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
michael@0:  * in the range U+DC00 - U+DFFF, like this:
michael@0:  *
michael@0:  *  U+D800 U+DC00 =  U+10000
michael@0:  *  U+D800 U+DC01 =  U+10001
michael@0:  *  ...
michael@0:  *  U+DBFF U+DFFE = U+10FFFE
michael@0:  *  U+DBFF U+DFFF = U+10FFFF
michael@0:  *
michael@0:  * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
michael@0:  * scalar values and are not well-formed UTF-16 except as high-surrogate /
michael@0:  * low-surrogate pairs.
michael@0:  */
michael@0: 
// NOTE: everything below is a macro. Some of them (IS_VALID_CHAR,
// ENSURE_VALID_CHAR) expand their argument more than once, so do not pass
// expressions with side effects.

// First code point beyond the Basic Multilingual Plane.
#define PLANE1_BASE          uint32_t(0x10000)

// High surrogates are in the range 0xD800 -- 0xDBFF. That is one aligned
// block of 0x400 values, so membership only depends on the bits above bit 9.
#define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) >> 10) == (0xD800 >> 10))
// Low surrogates are in the range 0xDC00 -- 0xDFFF (the next 0x400 block).
#define NS_IS_LOW_SURROGATE(u)  ((uint32_t(u) >> 10) == (0xDC00 >> 10))
// Either kind of surrogate: the aligned 0x800 block 0xD800 -- 0xDFFF.
// Cheaper than NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE.
#define IS_SURROGATE(u)      ((uint32_t(u) >> 11) == (0xD800 >> 11))

// Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF

// Combine a surrogate pair into the code point it encodes:
//   N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// Only the low ten bits of each argument are used; nothing checks that h
// really is a high surrogate and l really is a low surrogate.
#define SURROGATE_TO_UCS4(h, l) (PLANE1_BASE + \
                                 (((uint32_t(h) & 0x03FF) << 10) | \
                                  (uint32_t(l) & 0x03FF)))

// Extract surrogates from a UCS4 char
// Reference: the Unicode standard 4.0, section 3.9
//
// High surrogate: ((c - 0x10000) >> 10) + 0xD800.
// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0040 and
// 0xD7C0 == 0xD800 - 0x0040, this simplifies to (c >> 10) + 0xD7C0.
// Note that 0xD7C0 is added, not bitwise-OR'd. The sum is truncated to
// 16 bits by the char16_t conversion.
#define H_SURROGATE(c) char16_t((uint32_t(c) >> 10) + 0xD7C0)

// Low surrogate: ((c - 0x10000) & 0x03FF) | 0xDC00.
// Since 0x10000 & 0x03FF == 0, (c - 0x10000) & 0x03FF == c & 0x03FF.
#define L_SURROGATE(c) char16_t(0xDC00 | (uint32_t(c) & 0x03FF))

// True when the value needs no surrogate pair (nothing set above bit 15).
#define IS_IN_BMP(ucs) ((uint32_t(ucs) >> 16) == 0)
#define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)

// One past the last Unicode code point (U+10FFFF).
#define UCS_END uint32_t(0x110000)
// A valid scalar value is below UCS_END and is not a surrogate code point.
#define IS_VALID_CHAR(c) (!((uint32_t(c) >= UCS_END) || IS_SURROGATE(c)))
// Yields c when it is valid, U+FFFD otherwise.
#define ENSURE_VALID_CHAR(c) (!IS_VALID_CHAR(c) ? UCS2_REPLACEMENT_CHAR : (c))
michael@0: 
michael@0: template <class CharT> struct nsCharTraits {};
michael@0: 
michael@0: template <>
michael@0: struct nsCharTraits<char16_t>
michael@0:   {
michael@0:     typedef char16_t char_type;
michael@0:     typedef uint16_t  unsigned_char_type;
michael@0:     typedef char      incompatible_char_type;
michael@0: 
michael@0:     static char_type* const sEmptyBuffer;
michael@0: 
michael@0:     static
michael@0:     void
michael@0:     assign( char_type& lhs, char_type rhs )
michael@0:       {
michael@0:         lhs = rhs;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       // integer representation of characters:
michael@0:     typedef int int_type;
michael@0: 
michael@0:     static
michael@0:     char_type
michael@0:     to_char_type( int_type c )
michael@0:       {
michael@0:         return char_type(c);
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int_type
michael@0:     to_int_type( char_type c )
michael@0:       {
michael@0:         return int_type( static_cast<unsigned_char_type>(c) );
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     bool
michael@0:     eq_int_type( int_type lhs, int_type rhs )
michael@0:       {
michael@0:         return lhs == rhs;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       // |char_type| comparisons:
michael@0: 
michael@0:     static
michael@0:     bool
michael@0:     eq( char_type lhs, char_type rhs )
michael@0:       {
michael@0:         return lhs == rhs;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     bool
michael@0:     lt( char_type lhs, char_type rhs )
michael@0:       {
michael@0:         return lhs < rhs;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       // operations on s[n] arrays:
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     move( char_type* s1, const char_type* s2, size_t n )
michael@0:       {
michael@0:         return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type)));
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     copy( char_type* s1, const char_type* s2, size_t n )
michael@0:       {
michael@0:         return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type)));
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     copyASCII( char_type* s1, const char* s2, size_t n )
michael@0:       {
michael@0:         for (char_type* s = s1; n--; ++s, ++s2) {
michael@0:           NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:           *s = *s2;
michael@0:         }
michael@0:         return s1;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     assign( char_type* s, size_t n, char_type c )
michael@0:       {
michael@0:         char_type* result = s;
michael@0:         while ( n-- )
michael@0:           assign(*s++, c);
michael@0:         return result;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int
michael@0:     compare( const char_type* s1, const char_type* s2, size_t n )
michael@0:       {
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             if ( !eq(*s1, *s2) )
michael@0:               return to_int_type(*s1) - to_int_type(*s2);
michael@0:           }
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int
michael@0:     compareASCII( const char_type* s1, const char* s2, size_t n )
michael@0:       {
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:             if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) )
michael@0:               return to_int_type(*s1) - to_int_type(*s2);
michael@0:           }
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     // this version assumes that s2 is null-terminated and s1 has length n.
michael@0:     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
michael@0:     // we return 1.
michael@0:     static
michael@0:     int
michael@0:     compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
michael@0:       {
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             if ( !*s2 )
michael@0:               return 1;
michael@0:             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:             if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) )
michael@0:               return to_int_type(*s1) - to_int_type(*s2);
michael@0:           }
michael@0: 
michael@0:         if ( *s2 )
michael@0:           return -1;
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     /**
michael@0:      * Convert c to its lower-case form, but only if c is in the ASCII
michael@0:      * range. Otherwise leave it alone.
michael@0:      */
michael@0:     static
michael@0:     char_type
michael@0:     ASCIIToLower( char_type c )
michael@0:       {
michael@0:         if (c >= 'A' && c <= 'Z')
michael@0:           return char_type(c + ('a' - 'A'));
michael@0:           
michael@0:         return c;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int
michael@0:     compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n )
michael@0:       {
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
michael@0:                          "Unexpected uppercase character");
michael@0:             char_type lower_s1 = ASCIIToLower(*s1);
michael@0:             if ( lower_s1 != to_char_type(*s2) )
michael@0:               return to_int_type(lower_s1) - to_int_type(*s2);
michael@0:           }
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     // this version assumes that s2 is null-terminated and s1 has length n.
michael@0:     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
michael@0:     // we return 1.
michael@0:     static
michael@0:     int
michael@0:     compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
michael@0:       {
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             if ( !*s2 )
michael@0:               return 1;
michael@0:             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
michael@0:                          "Unexpected uppercase character");
michael@0:             char_type lower_s1 = ASCIIToLower(*s1);
michael@0:             if ( lower_s1 != to_char_type(*s2) )
michael@0:               return to_int_type(lower_s1) - to_int_type(*s2);
michael@0:           }
michael@0: 
michael@0:         if ( *s2 )
michael@0:           return -1;
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     size_t
michael@0:     length( const char_type* s )
michael@0:       {
michael@0:         size_t result = 0;
michael@0:         while ( !eq(*s++, char_type(0)) )
michael@0:           ++result;
michael@0:         return result;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     const char_type*
michael@0:     find( const char_type* s, size_t n, char_type c )
michael@0:       {
michael@0:         while ( n-- )
michael@0:           {
michael@0:             if ( eq(*s, c) )
michael@0:               return s;
michael@0:             ++s;
michael@0:           }
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0:   };
michael@0: 
michael@0: template <>
michael@0: struct nsCharTraits<char>
michael@0:   {
michael@0:     typedef char           char_type;
michael@0:     typedef unsigned char  unsigned_char_type;
michael@0:     typedef char16_t      incompatible_char_type;
michael@0: 
michael@0:     static char_type* const sEmptyBuffer;
michael@0: 
michael@0:     static
michael@0:     void
michael@0:     assign( char_type& lhs, char_type rhs )
michael@0:       {
michael@0:         lhs = rhs;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       // integer representation of characters:
michael@0: 
michael@0:     typedef int int_type;
michael@0: 
michael@0:     static
michael@0:     char_type
michael@0:     to_char_type( int_type c )
michael@0:       {
michael@0:         return char_type(c);
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int_type
michael@0:     to_int_type( char_type c )
michael@0:       {
michael@0:         return int_type( static_cast<unsigned_char_type>(c) );
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     bool
michael@0:     eq_int_type( int_type lhs, int_type rhs )
michael@0:       {
michael@0:         return lhs == rhs;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       // |char_type| comparisons:
michael@0: 
michael@0:     static
michael@0:     bool
michael@0:     eq( char_type lhs, char_type rhs )
michael@0:       {
michael@0:         return lhs == rhs;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     bool
michael@0:     lt( char_type lhs, char_type rhs )
michael@0:       {
michael@0:         return lhs < rhs;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       // operations on s[n] arrays:
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     move( char_type* s1, const char_type* s2, size_t n )
michael@0:       {
michael@0:         return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type)));
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     copy( char_type* s1, const char_type* s2, size_t n )
michael@0:       {
michael@0:         return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type)));
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     copyASCII( char_type* s1, const char* s2, size_t n )
michael@0:       {
michael@0:         return copy(s1, s2, n);
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     char_type*
michael@0:     assign( char_type* s, size_t n, char_type c )
michael@0:       {
michael@0:         return static_cast<char_type*>(memset(s, to_int_type(c), n));
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int
michael@0:     compare( const char_type* s1, const char_type* s2, size_t n )
michael@0:       {
michael@0:         return memcmp(s1, s2, n);
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int
michael@0:     compareASCII( const char_type* s1, const char* s2, size_t n )
michael@0:       {
michael@0: #ifdef DEBUG
michael@0:         for (size_t i = 0; i < n; ++i)
michael@0:           {
michael@0:             NS_ASSERTION(!(s2[i] & ~0x7F), "Unexpected non-ASCII character");
michael@0:           }
michael@0: #endif
michael@0:         return compare(s1, s2, n);
michael@0:       }
michael@0: 
michael@0:     // this version assumes that s2 is null-terminated and s1 has length n.
michael@0:     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
michael@0:     // we return 1.
michael@0:     static
michael@0:     int
michael@0:     compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
michael@0:       {
michael@0:         // can't use strcmp here because we don't want to stop when s1
michael@0:         // contains a null
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             if ( !*s2 )
michael@0:               return 1;
michael@0:             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:             if ( *s1 != *s2 )
michael@0:               return to_int_type(*s1) - to_int_type(*s2);
michael@0:           }
michael@0: 
michael@0:         if ( *s2 )
michael@0:           return -1;
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     /**
michael@0:      * Convert c to its lower-case form, but only if c is ASCII.
michael@0:      */
michael@0:     static
michael@0:     char_type
michael@0:     ASCIIToLower( char_type c )
michael@0:       {
michael@0:         if (c >= 'A' && c <= 'Z')
michael@0:           return char_type(c + ('a' - 'A'));
michael@0: 
michael@0:         return c;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     int
michael@0:     compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n )
michael@0:       {
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
michael@0:                          "Unexpected uppercase character");
michael@0:             char_type lower_s1 = ASCIIToLower(*s1);
michael@0:             if ( lower_s1 != *s2 )
michael@0:               return to_int_type(lower_s1) - to_int_type(*s2);
michael@0:           }
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     // this version assumes that s2 is null-terminated and s1 has length n.
michael@0:     // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
michael@0:     // we return 1.
michael@0:     static
michael@0:     int
michael@0:     compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
michael@0:       {
michael@0:         for ( ; n--; ++s1, ++s2 )
michael@0:           {
michael@0:             if ( !*s2 )
michael@0:               return 1;
michael@0:             NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
michael@0:             NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
michael@0:                          "Unexpected uppercase character");
michael@0:             char_type lower_s1 = ASCIIToLower(*s1);
michael@0:             if ( lower_s1 != *s2 )
michael@0:               return to_int_type(lower_s1) - to_int_type(*s2);
michael@0:           }
michael@0: 
michael@0:         if ( *s2 )
michael@0:           return -1;
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     size_t
michael@0:     length( const char_type* s )
michael@0:       {
michael@0:         return strlen(s);
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     const char_type*
michael@0:     find( const char_type* s, size_t n, char_type c )
michael@0:       {
michael@0:         return reinterpret_cast<const char_type*>(memchr(s, to_int_type(c), n));
michael@0:       }
michael@0:   };
michael@0: 
// Generic adapter for reading characters through an iterator class. The
// iterator type must supply |difference_type| and |value_type| typedefs plus
// get() and advance(n) members; every operation here forwards to those.
template <class InputIterator>
struct nsCharSourceTraits
{
  typedef typename InputIterator::difference_type difference_type;

  // Number of elements between |first| and |last|, computed by subtracting
  // the values their get() calls return (assumes a single fragment). The
  // result is converted to uint32_t.
  static uint32_t readable_distance(const InputIterator& first,
                                    const InputIterator& last)
  {
    return static_cast<uint32_t>(last.get() - first.get());
  }

  // Pointer to the data at the iterator's current position.
  static const typename InputIterator::value_type* read(const InputIterator& iter)
  {
    return iter.get();
  }

  // Move |s| by |n| positions via its own advance() member.
  static void advance(InputIterator& s, difference_type n)
  {
    s.advance(n);
  }
};
michael@0: 
michael@0: template <class CharT>
michael@0: struct nsCharSourceTraits<CharT*>
michael@0:   {
michael@0:     typedef ptrdiff_t difference_type;
michael@0: 
michael@0:     static
michael@0:     uint32_t
michael@0:     readable_distance( CharT* s )
michael@0:       {
michael@0:         return uint32_t(nsCharTraits<CharT>::length(s));
michael@0: //      return numeric_limits<uint32_t>::max();
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     uint32_t
michael@0:     readable_distance( CharT* first, CharT* last )
michael@0:       {
michael@0:         return uint32_t(last-first);
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     const CharT*
michael@0:     read( CharT* s )
michael@0:       {
michael@0:         return s;
michael@0:       }
michael@0: 
michael@0:     static
michael@0:     void
michael@0:     advance( CharT*& s, difference_type n )
michael@0:       {
michael@0:         s += n;
michael@0:       }
michael@0:   };
michael@0: 
// Generic adapter for writing characters through an output-iterator class.
// The iterator type must supply a |value_type| typedef and a write(s, n)
// member, which does all the work.
template <class OutputIterator>
struct nsCharSinkTraits
{
  // Hand the |n| elements starting at |s| to the iterator's write() member.
  static void write(OutputIterator& iter,
                    const typename OutputIterator::value_type* s,
                    uint32_t n)
  {
    iter.write(s, n);
  }
};
michael@0: 
michael@0: template <class CharT>
michael@0: struct nsCharSinkTraits<CharT*>
michael@0:   {
michael@0:     static
michael@0:     void
michael@0:     write( CharT*& iter, const CharT* s, uint32_t n )
michael@0:       {
michael@0:         nsCharTraits<CharT>::move(iter, s, n);
michael@0:         iter += n;
michael@0:       }
michael@0:   };
michael@0: 
michael@0: #endif // !defined(nsCharTraits_h___)