1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/xpcom/string/public/nsCharTraits.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,602 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#ifndef nsCharTraits_h___ 1.10 +#define nsCharTraits_h___ 1.11 + 1.12 +#include <ctype.h> // for |EOF|, |WEOF| 1.13 +#include <string.h> // for |memcpy|, et al 1.14 + 1.15 +#include "nscore.h" // for |char16_t| 1.16 + 1.17 +// This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in 1.18 +// particular the standalone software updater. In that case stub out 1.19 +// the macros provided by nsDebug.h which are only usable when linking XPCOM 1.20 + 1.21 +#ifdef NS_NO_XPCOM 1.22 +#define NS_WARNING(msg) 1.23 +#define NS_ASSERTION(cond, msg) 1.24 +#define NS_ERROR(msg) 1.25 +#else 1.26 +#include "nsDebug.h" // for NS_ASSERTION 1.27 +#endif 1.28 + 1.29 +/* 1.30 + * Some macros for converting char16_t (UTF-16) to and from Unicode scalar 1.31 + * values. 1.32 + * 1.33 + * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by 1.34 + * using "surrogate pairs". These consist of a high surrogate, i.e. a code 1.35 + * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point 1.36 + * in the range U+DC00 - U+DFFF, like this: 1.37 + * 1.38 + * U+D800 U+DC00 = U+10000 1.39 + * U+D800 U+DC01 = U+10001 1.40 + * ... 1.41 + * U+DBFF U+DFFE = U+10FFFE 1.42 + * U+DBFF U+DFFF = U+10FFFF 1.43 + * 1.44 + * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode 1.45 + * scalar values and are not well-formed UTF-16 except as high-surrogate / 1.46 + * low-surrogate pairs. 1.47 + */ 1.48 + 1.49 +#define PLANE1_BASE uint32_t(0x00010000) 1.50 +// High surrogates are in the range 0xD800 -- OxDBFF 1.51 +#define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800) 1.52 +// Low surrogates are in the range 0xDC00 -- 0xDFFF 1.53 +#define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00) 1.54 +// Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE 1.55 +#define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800) 1.56 + 1.57 +// Everything else is not a surrogate: 0x000 -- 0xD7FF, 0xE000 -- 0xFFFF 1.58 + 1.59 +// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00) 1.60 +// I wonder whether we could somehow assert that H is a high surrogate 1.61 +// and L is a low surrogate 1.62 +#define SURROGATE_TO_UCS4(h, l) (((uint32_t(h) & 0x03FF) << 10) + \ 1.63 + (uint32_t(l) & 0x03FF) + PLANE1_BASE) 1.64 + 1.65 +// Extract surrogates from a UCS4 char 1.66 +// Reference: the Unicode standard 4.0, section 3.9 1.67 +// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and 1.68 +// 0xD7C0 == 0xD800 - 0x0080, 1.69 +// ((c - 0x10000) >> 10) + 0xD800 can be simplified to 1.70 +#define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + \ 1.71 + char16_t(0xD7C0)) 1.72 +// where it's to be noted that 0xD7C0 is not bitwise-OR'd 1.73 +// but added. 1.74 + 1.75 +// Since 0x10000 & 0x03FF == 0, 1.76 +// (c - 0x10000) & 0x03FF == c & 0x03FF so that 1.77 +// ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to 1.78 +#define L_SURROGATE(c) char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | \ 1.79 + char16_t(0xDC00)) 1.80 + 1.81 +#define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE) 1.82 +#define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD) 1.83 + 1.84 +#define UCS_END uint32_t(0x00110000) 1.85 +#define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c)) 1.86 +#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR) 1.87 + 1.88 +template <class CharT> struct nsCharTraits {}; 1.89 + 1.90 +template <> 1.91 +struct nsCharTraits<char16_t> 1.92 + { 1.93 + typedef char16_t char_type; 1.94 + typedef uint16_t unsigned_char_type; 1.95 + typedef char incompatible_char_type; 1.96 + 1.97 + static char_type* const sEmptyBuffer; 1.98 + 1.99 + static 1.100 + void 1.101 + assign( char_type& lhs, char_type rhs ) 1.102 + { 1.103 + lhs = rhs; 1.104 + } 1.105 + 1.106 + 1.107 + // integer representation of characters: 1.108 + typedef int int_type; 1.109 + 1.110 + static 1.111 + char_type 1.112 + to_char_type( int_type c ) 1.113 + { 1.114 + return char_type(c); 1.115 + } 1.116 + 1.117 + static 1.118 + int_type 1.119 + to_int_type( char_type c ) 1.120 + { 1.121 + return int_type( static_cast<unsigned_char_type>(c) ); 1.122 + } 1.123 + 1.124 + static 1.125 + bool 1.126 + eq_int_type( int_type lhs, int_type rhs ) 1.127 + { 1.128 + return lhs == rhs; 1.129 + } 1.130 + 1.131 + 1.132 + // |char_type| comparisons: 1.133 + 1.134 + static 1.135 + bool 1.136 + eq( char_type lhs, char_type rhs ) 1.137 + { 1.138 + return lhs == rhs; 1.139 + } 1.140 + 1.141 + static 1.142 + bool 1.143 + lt( char_type lhs, char_type rhs ) 1.144 + { 1.145 + return lhs < rhs; 1.146 + } 1.147 + 1.148 + 1.149 + // operations on s[n] arrays: 1.150 + 1.151 + static 1.152 + char_type* 1.153 + move( char_type* s1, const char_type* s2, size_t n ) 1.154 + { 1.155 + return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type))); 1.156 + } 1.157 + 1.158 + static 1.159 + char_type* 1.160 + copy( char_type* s1, const char_type* s2, size_t n ) 1.161 + { 1.162 + return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type))); 1.163 + } 1.164 + 1.165 + static 1.166 + char_type* 1.167 + copyASCII( char_type* s1, const char* s2, size_t n ) 1.168 + { 1.169 + for (char_type* s = s1; n--; ++s, ++s2) { 1.170 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.171 + *s = *s2; 1.172 + } 1.173 + return s1; 1.174 + } 1.175 + 1.176 + static 1.177 + char_type* 1.178 + assign( char_type* s, size_t n, char_type c ) 1.179 + { 1.180 + char_type* result = s; 1.181 + while ( n-- ) 1.182 + assign(*s++, c); 1.183 + return result; 1.184 + } 1.185 + 1.186 + static 1.187 + int 1.188 + compare( const char_type* s1, const char_type* s2, size_t n ) 1.189 + { 1.190 + for ( ; n--; ++s1, ++s2 ) 1.191 + { 1.192 + if ( !eq(*s1, *s2) ) 1.193 + return to_int_type(*s1) - to_int_type(*s2); 1.194 + } 1.195 + 1.196 + return 0; 1.197 + } 1.198 + 1.199 + static 1.200 + int 1.201 + compareASCII( const char_type* s1, const char* s2, size_t n ) 1.202 + { 1.203 + for ( ; n--; ++s1, ++s2 ) 1.204 + { 1.205 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.206 + if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) ) 1.207 + return to_int_type(*s1) - to_int_type(*s2); 1.208 + } 1.209 + 1.210 + return 0; 1.211 + } 1.212 + 1.213 + // this version assumes that s2 is null-terminated and s1 has length n. 1.214 + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 1.215 + // we return 1. 1.216 + static 1.217 + int 1.218 + compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) 1.219 + { 1.220 + for ( ; n--; ++s1, ++s2 ) 1.221 + { 1.222 + if ( !*s2 ) 1.223 + return 1; 1.224 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.225 + if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) ) 1.226 + return to_int_type(*s1) - to_int_type(*s2); 1.227 + } 1.228 + 1.229 + if ( *s2 ) 1.230 + return -1; 1.231 + 1.232 + return 0; 1.233 + } 1.234 + 1.235 + /** 1.236 + * Convert c to its lower-case form, but only if c is in the ASCII 1.237 + * range. Otherwise leave it alone. 1.238 + */ 1.239 + static 1.240 + char_type 1.241 + ASCIIToLower( char_type c ) 1.242 + { 1.243 + if (c >= 'A' && c <= 'Z') 1.244 + return char_type(c + ('a' - 'A')); 1.245 + 1.246 + return c; 1.247 + } 1.248 + 1.249 + static 1.250 + int 1.251 + compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n ) 1.252 + { 1.253 + for ( ; n--; ++s1, ++s2 ) 1.254 + { 1.255 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.256 + NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), 1.257 + "Unexpected uppercase character"); 1.258 + char_type lower_s1 = ASCIIToLower(*s1); 1.259 + if ( lower_s1 != to_char_type(*s2) ) 1.260 + return to_int_type(lower_s1) - to_int_type(*s2); 1.261 + } 1.262 + 1.263 + return 0; 1.264 + } 1.265 + 1.266 + // this version assumes that s2 is null-terminated and s1 has length n. 1.267 + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 1.268 + // we return 1. 1.269 + static 1.270 + int 1.271 + compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) 1.272 + { 1.273 + for ( ; n--; ++s1, ++s2 ) 1.274 + { 1.275 + if ( !*s2 ) 1.276 + return 1; 1.277 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.278 + NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), 1.279 + "Unexpected uppercase character"); 1.280 + char_type lower_s1 = ASCIIToLower(*s1); 1.281 + if ( lower_s1 != to_char_type(*s2) ) 1.282 + return to_int_type(lower_s1) - to_int_type(*s2); 1.283 + } 1.284 + 1.285 + if ( *s2 ) 1.286 + return -1; 1.287 + 1.288 + return 0; 1.289 + } 1.290 + 1.291 + static 1.292 + size_t 1.293 + length( const char_type* s ) 1.294 + { 1.295 + size_t result = 0; 1.296 + while ( !eq(*s++, char_type(0)) ) 1.297 + ++result; 1.298 + return result; 1.299 + } 1.300 + 1.301 + static 1.302 + const char_type* 1.303 + find( const char_type* s, size_t n, char_type c ) 1.304 + { 1.305 + while ( n-- ) 1.306 + { 1.307 + if ( eq(*s, c) ) 1.308 + return s; 1.309 + ++s; 1.310 + } 1.311 + 1.312 + return 0; 1.313 + } 1.314 + }; 1.315 + 1.316 +template <> 1.317 +struct nsCharTraits<char> 1.318 + { 1.319 + typedef char char_type; 1.320 + typedef unsigned char unsigned_char_type; 1.321 + typedef char16_t incompatible_char_type; 1.322 + 1.323 + static char_type* const sEmptyBuffer; 1.324 + 1.325 + static 1.326 + void 1.327 + assign( char_type& lhs, char_type rhs ) 1.328 + { 1.329 + lhs = rhs; 1.330 + } 1.331 + 1.332 + 1.333 + // integer representation of characters: 1.334 + 1.335 + typedef int int_type; 1.336 + 1.337 + static 1.338 + char_type 1.339 + to_char_type( int_type c ) 1.340 + { 1.341 + return char_type(c); 1.342 + } 1.343 + 1.344 + static 1.345 + int_type 1.346 + to_int_type( char_type c ) 1.347 + { 1.348 + return int_type( static_cast<unsigned_char_type>(c) ); 1.349 + } 1.350 + 1.351 + static 1.352 + bool 1.353 + eq_int_type( int_type lhs, int_type rhs ) 1.354 + { 1.355 + return lhs == rhs; 1.356 + } 1.357 + 1.358 + 1.359 + // |char_type| comparisons: 1.360 + 1.361 + static 1.362 + bool 1.363 + eq( char_type lhs, char_type rhs ) 1.364 + { 1.365 + return lhs == rhs; 1.366 + } 1.367 + 1.368 + static 1.369 + bool 1.370 + lt( char_type lhs, char_type rhs ) 1.371 + { 1.372 + return lhs < rhs; 1.373 + } 1.374 + 1.375 + 1.376 + // operations on s[n] arrays: 1.377 + 1.378 + static 1.379 + char_type* 1.380 + move( char_type* s1, const char_type* s2, size_t n ) 1.381 + { 1.382 + return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type))); 1.383 + } 1.384 + 1.385 + static 1.386 + char_type* 1.387 + copy( char_type* s1, const char_type* s2, size_t n ) 1.388 + { 1.389 + return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type))); 1.390 + } 1.391 + 1.392 + static 1.393 + char_type* 1.394 + copyASCII( char_type* s1, const char* s2, size_t n ) 1.395 + { 1.396 + return copy(s1, s2, n); 1.397 + } 1.398 + 1.399 + static 1.400 + char_type* 1.401 + assign( char_type* s, size_t n, char_type c ) 1.402 + { 1.403 + return static_cast<char_type*>(memset(s, to_int_type(c), n)); 1.404 + } 1.405 + 1.406 + static 1.407 + int 1.408 + compare( const char_type* s1, const char_type* s2, size_t n ) 1.409 + { 1.410 + return memcmp(s1, s2, n); 1.411 + } 1.412 + 1.413 + static 1.414 + int 1.415 + compareASCII( const char_type* s1, const char* s2, size_t n ) 1.416 + { 1.417 +#ifdef DEBUG 1.418 + for (size_t i = 0; i < n; ++i) 1.419 + { 1.420 + NS_ASSERTION(!(s2[i] & ~0x7F), "Unexpected non-ASCII character"); 1.421 + } 1.422 +#endif 1.423 + return compare(s1, s2, n); 1.424 + } 1.425 + 1.426 + // this version assumes that s2 is null-terminated and s1 has length n. 1.427 + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 1.428 + // we return 1. 1.429 + static 1.430 + int 1.431 + compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) 1.432 + { 1.433 + // can't use strcmp here because we don't want to stop when s1 1.434 + // contains a null 1.435 + for ( ; n--; ++s1, ++s2 ) 1.436 + { 1.437 + if ( !*s2 ) 1.438 + return 1; 1.439 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.440 + if ( *s1 != *s2 ) 1.441 + return to_int_type(*s1) - to_int_type(*s2); 1.442 + } 1.443 + 1.444 + if ( *s2 ) 1.445 + return -1; 1.446 + 1.447 + return 0; 1.448 + } 1.449 + 1.450 + /** 1.451 + * Convert c to its lower-case form, but only if c is ASCII. 1.452 + */ 1.453 + static 1.454 + char_type 1.455 + ASCIIToLower( char_type c ) 1.456 + { 1.457 + if (c >= 'A' && c <= 'Z') 1.458 + return char_type(c + ('a' - 'A')); 1.459 + 1.460 + return c; 1.461 + } 1.462 + 1.463 + static 1.464 + int 1.465 + compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n ) 1.466 + { 1.467 + for ( ; n--; ++s1, ++s2 ) 1.468 + { 1.469 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.470 + NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), 1.471 + "Unexpected uppercase character"); 1.472 + char_type lower_s1 = ASCIIToLower(*s1); 1.473 + if ( lower_s1 != *s2 ) 1.474 + return to_int_type(lower_s1) - to_int_type(*s2); 1.475 + } 1.476 + return 0; 1.477 + } 1.478 + 1.479 + // this version assumes that s2 is null-terminated and s1 has length n. 1.480 + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, 1.481 + // we return 1. 1.482 + static 1.483 + int 1.484 + compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) 1.485 + { 1.486 + for ( ; n--; ++s1, ++s2 ) 1.487 + { 1.488 + if ( !*s2 ) 1.489 + return 1; 1.490 + NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); 1.491 + NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), 1.492 + "Unexpected uppercase character"); 1.493 + char_type lower_s1 = ASCIIToLower(*s1); 1.494 + if ( lower_s1 != *s2 ) 1.495 + return to_int_type(lower_s1) - to_int_type(*s2); 1.496 + } 1.497 + 1.498 + if ( *s2 ) 1.499 + return -1; 1.500 + 1.501 + return 0; 1.502 + } 1.503 + 1.504 + static 1.505 + size_t 1.506 + length( const char_type* s ) 1.507 + { 1.508 + return strlen(s); 1.509 + } 1.510 + 1.511 + static 1.512 + const char_type* 1.513 + find( const char_type* s, size_t n, char_type c ) 1.514 + { 1.515 + return reinterpret_cast<const char_type*>(memchr(s, to_int_type(c), n)); 1.516 + } 1.517 + }; 1.518 + 1.519 +template <class InputIterator> 1.520 +struct nsCharSourceTraits 1.521 + { 1.522 + typedef typename InputIterator::difference_type difference_type; 1.523 + 1.524 + static 1.525 + uint32_t 1.526 + readable_distance( const InputIterator& first, const InputIterator& last ) 1.527 + { 1.528 + // assumes single fragment 1.529 + return uint32_t(last.get() - first.get()); 1.530 + } 1.531 + 1.532 + static 1.533 + const typename InputIterator::value_type* 1.534 + read( const InputIterator& iter ) 1.535 + { 1.536 + return iter.get(); 1.537 + } 1.538 + 1.539 + static 1.540 + void 1.541 + advance( InputIterator& s, difference_type n ) 1.542 + { 1.543 + s.advance(n); 1.544 + } 1.545 + }; 1.546 + 1.547 +template <class CharT> 1.548 +struct nsCharSourceTraits<CharT*> 1.549 + { 1.550 + typedef ptrdiff_t difference_type; 1.551 + 1.552 + static 1.553 + uint32_t 1.554 + readable_distance( CharT* s ) 1.555 + { 1.556 + return uint32_t(nsCharTraits<CharT>::length(s)); 1.557 +// return numeric_limits<uint32_t>::max(); 1.558 + } 1.559 + 1.560 + static 1.561 + uint32_t 1.562 + readable_distance( CharT* first, CharT* last ) 1.563 + { 1.564 + return uint32_t(last-first); 1.565 + } 1.566 + 1.567 + static 1.568 + const CharT* 1.569 + read( CharT* s ) 1.570 + { 1.571 + return s; 1.572 + } 1.573 + 1.574 + static 1.575 + void 1.576 + advance( CharT*& s, difference_type n ) 1.577 + { 1.578 + s += n; 1.579 + } 1.580 + }; 1.581 + 1.582 +template <class OutputIterator> 1.583 +struct nsCharSinkTraits 1.584 + { 1.585 + static 1.586 + void 1.587 + write( OutputIterator& iter, const typename OutputIterator::value_type* s, uint32_t n ) 1.588 + { 1.589 + iter.write(s, n); 1.590 + } 1.591 + }; 1.592 + 1.593 +template <class CharT> 1.594 +struct nsCharSinkTraits<CharT*> 1.595 + { 1.596 + static 1.597 + void 1.598 + write( CharT*& iter, const CharT* s, uint32_t n ) 1.599 + { 1.600 + nsCharTraits<CharT>::move(iter, s, n); 1.601 + iter += n; 1.602 + } 1.603 + }; 1.604 + 1.605 +#endif // !defined(nsCharTraits_h___)