michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #ifndef nsCharTraits_h___ michael@0: #define nsCharTraits_h___ michael@0: michael@0: #include // for |EOF|, |WEOF| michael@0: #include // for |memcpy|, et al michael@0: michael@0: #include "nscore.h" // for |char16_t| michael@0: michael@0: // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in michael@0: // particular the standalone software updater. In that case stub out michael@0: // the macros provided by nsDebug.h which are only usable when linking XPCOM michael@0: michael@0: #ifdef NS_NO_XPCOM michael@0: #define NS_WARNING(msg) michael@0: #define NS_ASSERTION(cond, msg) michael@0: #define NS_ERROR(msg) michael@0: #else michael@0: #include "nsDebug.h" // for NS_ASSERTION michael@0: #endif michael@0: michael@0: /* michael@0: * Some macros for converting char16_t (UTF-16) to and from Unicode scalar michael@0: * values. michael@0: * michael@0: * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by michael@0: * using "surrogate pairs". These consist of a high surrogate, i.e. a code michael@0: * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point michael@0: * in the range U+DC00 - U+DFFF, like this: michael@0: * michael@0: * U+D800 U+DC00 = U+10000 michael@0: * U+D800 U+DC01 = U+10001 michael@0: * ... michael@0: * U+DBFF U+DFFE = U+10FFFE michael@0: * U+DBFF U+DFFF = U+10FFFF michael@0: * michael@0: * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode michael@0: * scalar values and are not well-formed UTF-16 except as high-surrogate / michael@0: * low-surrogate pairs. michael@0: */ michael@0: michael@0: #define PLANE1_BASE uint32_t(0x00010000) michael@0: // High surrogates are in the range 0xD800 -- OxDBFF michael@0: #define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800) michael@0: // Low surrogates are in the range 0xDC00 -- 0xDFFF michael@0: #define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00) michael@0: // Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE michael@0: #define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800) michael@0: michael@0: // Everything else is not a surrogate: 0x000 -- 0xD7FF, 0xE000 -- 0xFFFF michael@0: michael@0: // N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00) michael@0: // I wonder whether we could somehow assert that H is a high surrogate michael@0: // and L is a low surrogate michael@0: #define SURROGATE_TO_UCS4(h, l) (((uint32_t(h) & 0x03FF) << 10) + \ michael@0: (uint32_t(l) & 0x03FF) + PLANE1_BASE) michael@0: michael@0: // Extract surrogates from a UCS4 char michael@0: // Reference: the Unicode standard 4.0, section 3.9 michael@0: // Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and michael@0: // 0xD7C0 == 0xD800 - 0x0080, michael@0: // ((c - 0x10000) >> 10) + 0xD800 can be simplified to michael@0: #define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + \ michael@0: char16_t(0xD7C0)) michael@0: // where it's to be noted that 0xD7C0 is not bitwise-OR'd michael@0: // but added. michael@0: michael@0: // Since 0x10000 & 0x03FF == 0, michael@0: // (c - 0x10000) & 0x03FF == c & 0x03FF so that michael@0: // ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to michael@0: #define L_SURROGATE(c) char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | \ michael@0: char16_t(0xDC00)) michael@0: michael@0: #define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE) michael@0: #define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD) michael@0: michael@0: #define UCS_END uint32_t(0x00110000) michael@0: #define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c)) michael@0: #define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR) michael@0: michael@0: template struct nsCharTraits {}; michael@0: michael@0: template <> michael@0: struct nsCharTraits michael@0: { michael@0: typedef char16_t char_type; michael@0: typedef uint16_t unsigned_char_type; michael@0: typedef char incompatible_char_type; michael@0: michael@0: static char_type* const sEmptyBuffer; michael@0: michael@0: static michael@0: void michael@0: assign( char_type& lhs, char_type rhs ) michael@0: { michael@0: lhs = rhs; michael@0: } michael@0: michael@0: michael@0: // integer representation of characters: michael@0: typedef int int_type; michael@0: michael@0: static michael@0: char_type michael@0: to_char_type( int_type c ) michael@0: { michael@0: return char_type(c); michael@0: } michael@0: michael@0: static michael@0: int_type michael@0: to_int_type( char_type c ) michael@0: { michael@0: return int_type( static_cast(c) ); michael@0: } michael@0: michael@0: static michael@0: bool michael@0: eq_int_type( int_type lhs, int_type rhs ) michael@0: { michael@0: return lhs == rhs; michael@0: } michael@0: michael@0: michael@0: // |char_type| comparisons: michael@0: michael@0: static michael@0: bool michael@0: eq( char_type lhs, char_type rhs ) michael@0: { michael@0: return lhs == rhs; michael@0: } michael@0: michael@0: static michael@0: bool michael@0: lt( char_type lhs, char_type rhs ) michael@0: { michael@0: return lhs < rhs; michael@0: } michael@0: michael@0: michael@0: // operations on s[n] arrays: michael@0: michael@0: static michael@0: char_type* michael@0: move( char_type* s1, const char_type* s2, size_t n ) michael@0: { michael@0: return static_cast(memmove(s1, s2, n * sizeof(char_type))); michael@0: } michael@0: michael@0: static michael@0: char_type* michael@0: copy( char_type* s1, const char_type* s2, size_t n ) michael@0: { michael@0: return static_cast(memcpy(s1, s2, n * sizeof(char_type))); michael@0: } michael@0: michael@0: static michael@0: char_type* michael@0: copyASCII( char_type* s1, const char* s2, size_t n ) michael@0: { michael@0: for (char_type* s = s1; n--; ++s, ++s2) { michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: *s = *s2; michael@0: } michael@0: return s1; michael@0: } michael@0: michael@0: static michael@0: char_type* michael@0: assign( char_type* s, size_t n, char_type c ) michael@0: { michael@0: char_type* result = s; michael@0: while ( n-- ) michael@0: assign(*s++, c); michael@0: return result; michael@0: } michael@0: michael@0: static michael@0: int michael@0: compare( const char_type* s1, const char_type* s2, size_t n ) michael@0: { michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: if ( !eq(*s1, *s2) ) michael@0: return to_int_type(*s1) - to_int_type(*s2); michael@0: } michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: static michael@0: int michael@0: compareASCII( const char_type* s1, const char* s2, size_t n ) michael@0: { michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) ) michael@0: return to_int_type(*s1) - to_int_type(*s2); michael@0: } michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: // this version assumes that s2 is null-terminated and s1 has length n. michael@0: // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, michael@0: // we return 1. michael@0: static michael@0: int michael@0: compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) michael@0: { michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: if ( !*s2 ) michael@0: return 1; michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) ) michael@0: return to_int_type(*s1) - to_int_type(*s2); michael@0: } michael@0: michael@0: if ( *s2 ) michael@0: return -1; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: /** michael@0: * Convert c to its lower-case form, but only if c is in the ASCII michael@0: * range. Otherwise leave it alone. michael@0: */ michael@0: static michael@0: char_type michael@0: ASCIIToLower( char_type c ) michael@0: { michael@0: if (c >= 'A' && c <= 'Z') michael@0: return char_type(c + ('a' - 'A')); michael@0: michael@0: return c; michael@0: } michael@0: michael@0: static michael@0: int michael@0: compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n ) michael@0: { michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), michael@0: "Unexpected uppercase character"); michael@0: char_type lower_s1 = ASCIIToLower(*s1); michael@0: if ( lower_s1 != to_char_type(*s2) ) michael@0: return to_int_type(lower_s1) - to_int_type(*s2); michael@0: } michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: // this version assumes that s2 is null-terminated and s1 has length n. michael@0: // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, michael@0: // we return 1. michael@0: static michael@0: int michael@0: compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) michael@0: { michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: if ( !*s2 ) michael@0: return 1; michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), michael@0: "Unexpected uppercase character"); michael@0: char_type lower_s1 = ASCIIToLower(*s1); michael@0: if ( lower_s1 != to_char_type(*s2) ) michael@0: return to_int_type(lower_s1) - to_int_type(*s2); michael@0: } michael@0: michael@0: if ( *s2 ) michael@0: return -1; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: static michael@0: size_t michael@0: length( const char_type* s ) michael@0: { michael@0: size_t result = 0; michael@0: while ( !eq(*s++, char_type(0)) ) michael@0: ++result; michael@0: return result; michael@0: } michael@0: michael@0: static michael@0: const char_type* michael@0: find( const char_type* s, size_t n, char_type c ) michael@0: { michael@0: while ( n-- ) michael@0: { michael@0: if ( eq(*s, c) ) michael@0: return s; michael@0: ++s; michael@0: } michael@0: michael@0: return 0; michael@0: } michael@0: }; michael@0: michael@0: template <> michael@0: struct nsCharTraits michael@0: { michael@0: typedef char char_type; michael@0: typedef unsigned char unsigned_char_type; michael@0: typedef char16_t incompatible_char_type; michael@0: michael@0: static char_type* const sEmptyBuffer; michael@0: michael@0: static michael@0: void michael@0: assign( char_type& lhs, char_type rhs ) michael@0: { michael@0: lhs = rhs; michael@0: } michael@0: michael@0: michael@0: // integer representation of characters: michael@0: michael@0: typedef int int_type; michael@0: michael@0: static michael@0: char_type michael@0: to_char_type( int_type c ) michael@0: { michael@0: return char_type(c); michael@0: } michael@0: michael@0: static michael@0: int_type michael@0: to_int_type( char_type c ) michael@0: { michael@0: return int_type( static_cast(c) ); michael@0: } michael@0: michael@0: static michael@0: bool michael@0: eq_int_type( int_type lhs, int_type rhs ) michael@0: { michael@0: return lhs == rhs; michael@0: } michael@0: michael@0: michael@0: // |char_type| comparisons: michael@0: michael@0: static michael@0: bool michael@0: eq( char_type lhs, char_type rhs ) michael@0: { michael@0: return lhs == rhs; michael@0: } michael@0: michael@0: static michael@0: bool michael@0: lt( char_type lhs, char_type rhs ) michael@0: { michael@0: return lhs < rhs; michael@0: } michael@0: michael@0: michael@0: // operations on s[n] arrays: michael@0: michael@0: static michael@0: char_type* michael@0: move( char_type* s1, const char_type* s2, size_t n ) michael@0: { michael@0: return static_cast(memmove(s1, s2, n * sizeof(char_type))); michael@0: } michael@0: michael@0: static michael@0: char_type* michael@0: copy( char_type* s1, const char_type* s2, size_t n ) michael@0: { michael@0: return static_cast(memcpy(s1, s2, n * sizeof(char_type))); michael@0: } michael@0: michael@0: static michael@0: char_type* michael@0: copyASCII( char_type* s1, const char* s2, size_t n ) michael@0: { michael@0: return copy(s1, s2, n); michael@0: } michael@0: michael@0: static michael@0: char_type* michael@0: assign( char_type* s, size_t n, char_type c ) michael@0: { michael@0: return static_cast(memset(s, to_int_type(c), n)); michael@0: } michael@0: michael@0: static michael@0: int michael@0: compare( const char_type* s1, const char_type* s2, size_t n ) michael@0: { michael@0: return memcmp(s1, s2, n); michael@0: } michael@0: michael@0: static michael@0: int michael@0: compareASCII( const char_type* s1, const char* s2, size_t n ) michael@0: { michael@0: #ifdef DEBUG michael@0: for (size_t i = 0; i < n; ++i) michael@0: { michael@0: NS_ASSERTION(!(s2[i] & ~0x7F), "Unexpected non-ASCII character"); michael@0: } michael@0: #endif michael@0: return compare(s1, s2, n); michael@0: } michael@0: michael@0: // this version assumes that s2 is null-terminated and s1 has length n. michael@0: // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, michael@0: // we return 1. michael@0: static michael@0: int michael@0: compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) michael@0: { michael@0: // can't use strcmp here because we don't want to stop when s1 michael@0: // contains a null michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: if ( !*s2 ) michael@0: return 1; michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: if ( *s1 != *s2 ) michael@0: return to_int_type(*s1) - to_int_type(*s2); michael@0: } michael@0: michael@0: if ( *s2 ) michael@0: return -1; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: /** michael@0: * Convert c to its lower-case form, but only if c is ASCII. michael@0: */ michael@0: static michael@0: char_type michael@0: ASCIIToLower( char_type c ) michael@0: { michael@0: if (c >= 'A' && c <= 'Z') michael@0: return char_type(c + ('a' - 'A')); michael@0: michael@0: return c; michael@0: } michael@0: michael@0: static michael@0: int michael@0: compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n ) michael@0: { michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), michael@0: "Unexpected uppercase character"); michael@0: char_type lower_s1 = ASCIIToLower(*s1); michael@0: if ( lower_s1 != *s2 ) michael@0: return to_int_type(lower_s1) - to_int_type(*s2); michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: // this version assumes that s2 is null-terminated and s1 has length n. michael@0: // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, michael@0: // we return 1. michael@0: static michael@0: int michael@0: compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) michael@0: { michael@0: for ( ; n--; ++s1, ++s2 ) michael@0: { michael@0: if ( !*s2 ) michael@0: return 1; michael@0: NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); michael@0: NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), michael@0: "Unexpected uppercase character"); michael@0: char_type lower_s1 = ASCIIToLower(*s1); michael@0: if ( lower_s1 != *s2 ) michael@0: return to_int_type(lower_s1) - to_int_type(*s2); michael@0: } michael@0: michael@0: if ( *s2 ) michael@0: return -1; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: static michael@0: size_t michael@0: length( const char_type* s ) michael@0: { michael@0: return strlen(s); michael@0: } michael@0: michael@0: static michael@0: const char_type* michael@0: find( const char_type* s, size_t n, char_type c ) michael@0: { michael@0: return reinterpret_cast(memchr(s, to_int_type(c), n)); michael@0: } michael@0: }; michael@0: michael@0: template michael@0: struct nsCharSourceTraits michael@0: { michael@0: typedef typename InputIterator::difference_type difference_type; michael@0: michael@0: static michael@0: uint32_t michael@0: readable_distance( const InputIterator& first, const InputIterator& last ) michael@0: { michael@0: // assumes single fragment michael@0: return uint32_t(last.get() - first.get()); michael@0: } michael@0: michael@0: static michael@0: const typename InputIterator::value_type* michael@0: read( const InputIterator& iter ) michael@0: { michael@0: return iter.get(); michael@0: } michael@0: michael@0: static michael@0: void michael@0: advance( InputIterator& s, difference_type n ) michael@0: { michael@0: s.advance(n); michael@0: } michael@0: }; michael@0: michael@0: template michael@0: struct nsCharSourceTraits michael@0: { michael@0: typedef ptrdiff_t difference_type; michael@0: michael@0: static michael@0: uint32_t michael@0: readable_distance( CharT* s ) michael@0: { michael@0: return uint32_t(nsCharTraits::length(s)); michael@0: // return numeric_limits::max(); michael@0: } michael@0: michael@0: static michael@0: uint32_t michael@0: readable_distance( CharT* first, CharT* last ) michael@0: { michael@0: return uint32_t(last-first); michael@0: } michael@0: michael@0: static michael@0: const CharT* michael@0: read( CharT* s ) michael@0: { michael@0: return s; michael@0: } michael@0: michael@0: static michael@0: void michael@0: advance( CharT*& s, difference_type n ) michael@0: { michael@0: s += n; michael@0: } michael@0: }; michael@0: michael@0: template michael@0: struct nsCharSinkTraits michael@0: { michael@0: static michael@0: void michael@0: write( OutputIterator& iter, const typename OutputIterator::value_type* s, uint32_t n ) michael@0: { michael@0: iter.write(s, n); michael@0: } michael@0: }; michael@0: michael@0: template michael@0: struct nsCharSinkTraits michael@0: { michael@0: static michael@0: void michael@0: write( CharT*& iter, const CharT* s, uint32_t n ) michael@0: { michael@0: nsCharTraits::move(iter, s, n); michael@0: iter += n; michael@0: } michael@0: }; michael@0: michael@0: #endif // !defined(nsCharTraits_h___)