Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef nsCharTraits_h___
7 #define nsCharTraits_h___
9 #include <ctype.h> // for |EOF|, |WEOF|
10 #include <string.h> // for |memcpy|, et al
12 #include "nscore.h" // for |char16_t|
14 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in
15 // particular the standalone software updater. In that case stub out
16 // the macros provided by nsDebug.h which are only usable when linking XPCOM
18 #ifdef NS_NO_XPCOM
19 #define NS_WARNING(msg)
20 #define NS_ASSERTION(cond, msg)
21 #define NS_ERROR(msg)
22 #else
23 #include "nsDebug.h" // for NS_ASSERTION
24 #endif
26 /*
27 * Some macros for converting char16_t (UTF-16) to and from Unicode scalar
28 * values.
29 *
30 * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by
31 * using "surrogate pairs". These consist of a high surrogate, i.e. a code
32 * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point
33 * in the range U+DC00 - U+DFFF, like this:
34 *
35 * U+D800 U+DC00 = U+10000
36 * U+D800 U+DC01 = U+10001
37 * ...
38 * U+DBFF U+DFFE = U+10FFFE
39 * U+DBFF U+DFFF = U+10FFFF
40 *
41 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode
42 * scalar values and are not well-formed UTF-16 except as high-surrogate /
43 * low-surrogate pairs.
44 */
// First code point past the Basic Multilingual Plane (start of plane 1).
#define PLANE1_BASE ((uint32_t)0x00010000)

// High surrogates are in the range 0xD800 -- 0xDBFF.
#define NS_IS_HIGH_SURROGATE(u) ((((uint32_t)(u)) & 0xFFFFFC00) == 0xD800)

// Low surrogates are in the range 0xDC00 -- 0xDFFF.
#define NS_IS_LOW_SURROGATE(u) ((((uint32_t)(u)) & 0xFFFFFC00) == 0xDC00)

// Matches any surrogate (0xD800 -- 0xDFFF) with a single mask-and-compare,
// which is faster than NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE.
// Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF.
#define IS_SURROGATE(u) ((((uint32_t)(u)) & 0xFFFFF800) == 0xD800)

// Combine a surrogate pair into the code point it encodes:
//   N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// Nothing here checks that |h| really is a high surrogate and |l| a low
// surrogate; only the low ten bits of each argument are used.
#define SURROGATE_TO_UCS4(h, l)                                               \
  ((((((uint32_t)(h)) & 0x03FF)) << 10) + (((uint32_t)(l)) & 0x03FF) +        \
   PLANE1_BASE)

// Extract surrogates from a UCS4 char.
// Reference: the Unicode standard 4.0, section 3.9.
//
// High half.  Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and
// 0xD7C0 == 0xD800 - 0x0080, the expression ((c - 0x10000) >> 10) + 0xD800
// simplifies to (c >> 10) + 0xD7C0.  Note that 0xD7C0 is added, not
// bitwise-OR'd.
#define H_SURROGATE(c)                                                        \
  ((char16_t)((char16_t)(((uint32_t)(c)) >> 10) + (char16_t)0xD7C0))

// Low half.  Since 0x10000 & 0x03FF == 0, (c - 0x10000) & 0x03FF equals
// c & 0x03FF, so ((c - 0x10000) & 0x03FF) | 0xDC00 simplifies to
// (c & 0x03FF) | 0xDC00.
#define L_SURROGATE(c)                                                        \
  ((char16_t)((char16_t)(((uint32_t)(c)) & (uint32_t)0x03FF) |                \
              (char16_t)0xDC00))

// True when |ucs| lies below PLANE1_BASE.
#define IS_IN_BMP(ucs) (((uint32_t)(ucs)) < PLANE1_BASE)

// U+FFFD, substituted by ENSURE_VALID_CHAR for invalid values.
#define UCS2_REPLACEMENT_CHAR ((char16_t)0xFFFD)

// One past the largest code point (U+10FFFF).
#define UCS_END ((uint32_t)0x00110000)

// A valid char is below UCS_END and is not a surrogate.
#define IS_VALID_CHAR(c) ((((uint32_t)(c)) < UCS_END) && !IS_SURROGATE(c))

// Yields |c| when it is valid and UCS2_REPLACEMENT_CHAR otherwise.
// The argument is evaluated more than once.
#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
// Primary template: intentionally empty.  Members are supplied only by the
// explicit specializations of this template.
template <class CharT>
struct nsCharTraits
{
};
87 template <>
88 struct nsCharTraits<char16_t>
89 {
90 typedef char16_t char_type;
91 typedef uint16_t unsigned_char_type;
92 typedef char incompatible_char_type;
94 static char_type* const sEmptyBuffer;
96 static
97 void
98 assign( char_type& lhs, char_type rhs )
99 {
100 lhs = rhs;
101 }
104 // integer representation of characters:
105 typedef int int_type;
107 static
108 char_type
109 to_char_type( int_type c )
110 {
111 return char_type(c);
112 }
114 static
115 int_type
116 to_int_type( char_type c )
117 {
118 return int_type( static_cast<unsigned_char_type>(c) );
119 }
121 static
122 bool
123 eq_int_type( int_type lhs, int_type rhs )
124 {
125 return lhs == rhs;
126 }
129 // |char_type| comparisons:
131 static
132 bool
133 eq( char_type lhs, char_type rhs )
134 {
135 return lhs == rhs;
136 }
138 static
139 bool
140 lt( char_type lhs, char_type rhs )
141 {
142 return lhs < rhs;
143 }
146 // operations on s[n] arrays:
148 static
149 char_type*
150 move( char_type* s1, const char_type* s2, size_t n )
151 {
152 return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type)));
153 }
155 static
156 char_type*
157 copy( char_type* s1, const char_type* s2, size_t n )
158 {
159 return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type)));
160 }
162 static
163 char_type*
164 copyASCII( char_type* s1, const char* s2, size_t n )
165 {
166 for (char_type* s = s1; n--; ++s, ++s2) {
167 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
168 *s = *s2;
169 }
170 return s1;
171 }
173 static
174 char_type*
175 assign( char_type* s, size_t n, char_type c )
176 {
177 char_type* result = s;
178 while ( n-- )
179 assign(*s++, c);
180 return result;
181 }
183 static
184 int
185 compare( const char_type* s1, const char_type* s2, size_t n )
186 {
187 for ( ; n--; ++s1, ++s2 )
188 {
189 if ( !eq(*s1, *s2) )
190 return to_int_type(*s1) - to_int_type(*s2);
191 }
193 return 0;
194 }
196 static
197 int
198 compareASCII( const char_type* s1, const char* s2, size_t n )
199 {
200 for ( ; n--; ++s1, ++s2 )
201 {
202 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
203 if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) )
204 return to_int_type(*s1) - to_int_type(*s2);
205 }
207 return 0;
208 }
210 // this version assumes that s2 is null-terminated and s1 has length n.
211 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
212 // we return 1.
213 static
214 int
215 compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
216 {
217 for ( ; n--; ++s1, ++s2 )
218 {
219 if ( !*s2 )
220 return 1;
221 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
222 if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) )
223 return to_int_type(*s1) - to_int_type(*s2);
224 }
226 if ( *s2 )
227 return -1;
229 return 0;
230 }
232 /**
233 * Convert c to its lower-case form, but only if c is in the ASCII
234 * range. Otherwise leave it alone.
235 */
236 static
237 char_type
238 ASCIIToLower( char_type c )
239 {
240 if (c >= 'A' && c <= 'Z')
241 return char_type(c + ('a' - 'A'));
243 return c;
244 }
246 static
247 int
248 compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n )
249 {
250 for ( ; n--; ++s1, ++s2 )
251 {
252 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
253 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
254 "Unexpected uppercase character");
255 char_type lower_s1 = ASCIIToLower(*s1);
256 if ( lower_s1 != to_char_type(*s2) )
257 return to_int_type(lower_s1) - to_int_type(*s2);
258 }
260 return 0;
261 }
263 // this version assumes that s2 is null-terminated and s1 has length n.
264 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
265 // we return 1.
266 static
267 int
268 compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
269 {
270 for ( ; n--; ++s1, ++s2 )
271 {
272 if ( !*s2 )
273 return 1;
274 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
275 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
276 "Unexpected uppercase character");
277 char_type lower_s1 = ASCIIToLower(*s1);
278 if ( lower_s1 != to_char_type(*s2) )
279 return to_int_type(lower_s1) - to_int_type(*s2);
280 }
282 if ( *s2 )
283 return -1;
285 return 0;
286 }
288 static
289 size_t
290 length( const char_type* s )
291 {
292 size_t result = 0;
293 while ( !eq(*s++, char_type(0)) )
294 ++result;
295 return result;
296 }
298 static
299 const char_type*
300 find( const char_type* s, size_t n, char_type c )
301 {
302 while ( n-- )
303 {
304 if ( eq(*s, c) )
305 return s;
306 ++s;
307 }
309 return 0;
310 }
311 };
313 template <>
314 struct nsCharTraits<char>
315 {
316 typedef char char_type;
317 typedef unsigned char unsigned_char_type;
318 typedef char16_t incompatible_char_type;
320 static char_type* const sEmptyBuffer;
322 static
323 void
324 assign( char_type& lhs, char_type rhs )
325 {
326 lhs = rhs;
327 }
330 // integer representation of characters:
332 typedef int int_type;
334 static
335 char_type
336 to_char_type( int_type c )
337 {
338 return char_type(c);
339 }
341 static
342 int_type
343 to_int_type( char_type c )
344 {
345 return int_type( static_cast<unsigned_char_type>(c) );
346 }
348 static
349 bool
350 eq_int_type( int_type lhs, int_type rhs )
351 {
352 return lhs == rhs;
353 }
356 // |char_type| comparisons:
358 static
359 bool
360 eq( char_type lhs, char_type rhs )
361 {
362 return lhs == rhs;
363 }
365 static
366 bool
367 lt( char_type lhs, char_type rhs )
368 {
369 return lhs < rhs;
370 }
373 // operations on s[n] arrays:
375 static
376 char_type*
377 move( char_type* s1, const char_type* s2, size_t n )
378 {
379 return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type)));
380 }
382 static
383 char_type*
384 copy( char_type* s1, const char_type* s2, size_t n )
385 {
386 return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type)));
387 }
389 static
390 char_type*
391 copyASCII( char_type* s1, const char* s2, size_t n )
392 {
393 return copy(s1, s2, n);
394 }
396 static
397 char_type*
398 assign( char_type* s, size_t n, char_type c )
399 {
400 return static_cast<char_type*>(memset(s, to_int_type(c), n));
401 }
403 static
404 int
405 compare( const char_type* s1, const char_type* s2, size_t n )
406 {
407 return memcmp(s1, s2, n);
408 }
410 static
411 int
412 compareASCII( const char_type* s1, const char* s2, size_t n )
413 {
414 #ifdef DEBUG
415 for (size_t i = 0; i < n; ++i)
416 {
417 NS_ASSERTION(!(s2[i] & ~0x7F), "Unexpected non-ASCII character");
418 }
419 #endif
420 return compare(s1, s2, n);
421 }
423 // this version assumes that s2 is null-terminated and s1 has length n.
424 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
425 // we return 1.
426 static
427 int
428 compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
429 {
430 // can't use strcmp here because we don't want to stop when s1
431 // contains a null
432 for ( ; n--; ++s1, ++s2 )
433 {
434 if ( !*s2 )
435 return 1;
436 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
437 if ( *s1 != *s2 )
438 return to_int_type(*s1) - to_int_type(*s2);
439 }
441 if ( *s2 )
442 return -1;
444 return 0;
445 }
447 /**
448 * Convert c to its lower-case form, but only if c is ASCII.
449 */
450 static
451 char_type
452 ASCIIToLower( char_type c )
453 {
454 if (c >= 'A' && c <= 'Z')
455 return char_type(c + ('a' - 'A'));
457 return c;
458 }
460 static
461 int
462 compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n )
463 {
464 for ( ; n--; ++s1, ++s2 )
465 {
466 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
467 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
468 "Unexpected uppercase character");
469 char_type lower_s1 = ASCIIToLower(*s1);
470 if ( lower_s1 != *s2 )
471 return to_int_type(lower_s1) - to_int_type(*s2);
472 }
473 return 0;
474 }
476 // this version assumes that s2 is null-terminated and s1 has length n.
477 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2,
478 // we return 1.
479 static
480 int
481 compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 )
482 {
483 for ( ; n--; ++s1, ++s2 )
484 {
485 if ( !*s2 )
486 return 1;
487 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character");
488 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'),
489 "Unexpected uppercase character");
490 char_type lower_s1 = ASCIIToLower(*s1);
491 if ( lower_s1 != *s2 )
492 return to_int_type(lower_s1) - to_int_type(*s2);
493 }
495 if ( *s2 )
496 return -1;
498 return 0;
499 }
501 static
502 size_t
503 length( const char_type* s )
504 {
505 return strlen(s);
506 }
508 static
509 const char_type*
510 find( const char_type* s, size_t n, char_type c )
511 {
512 return reinterpret_cast<const char_type*>(memchr(s, to_int_type(c), n));
513 }
514 };
/**
 * Read-side adapter for iterator classes.  |InputIterator| must provide
 * |difference_type|, |value_type|, |get()| and |advance(n)|.
 */
template <class InputIterator>
struct nsCharSourceTraits
{
  typedef typename InputIterator::difference_type difference_type;

  // Distance between the pointers returned by |get()| on each iterator,
  // narrowed to uint32_t.  Assumes a single fragment.
  static uint32_t readable_distance(const InputIterator& first,
                                    const InputIterator& last)
  {
    const difference_type span = last.get() - first.get();
    return uint32_t(span);
  }

  // Pointer to the data at the iterator's current position.
  static const typename InputIterator::value_type* read(
    const InputIterator& iter)
  {
    return iter.get();
  }

  // Forward to the iterator's own |advance|.
  static void advance(InputIterator& s, difference_type n) { s.advance(n); }
};
544 template <class CharT>
545 struct nsCharSourceTraits<CharT*>
546 {
547 typedef ptrdiff_t difference_type;
549 static
550 uint32_t
551 readable_distance( CharT* s )
552 {
553 return uint32_t(nsCharTraits<CharT>::length(s));
554 // return numeric_limits<uint32_t>::max();
555 }
557 static
558 uint32_t
559 readable_distance( CharT* first, CharT* last )
560 {
561 return uint32_t(last-first);
562 }
564 static
565 const CharT*
566 read( CharT* s )
567 {
568 return s;
569 }
571 static
572 void
573 advance( CharT*& s, difference_type n )
574 {
575 s += n;
576 }
577 };
/**
 * Write-side adapter for iterator classes.  |OutputIterator| must provide
 * |value_type| and |write(s, n)|.
 */
template <class OutputIterator>
struct nsCharSinkTraits
{
  typedef typename OutputIterator::value_type sink_char_type;

  // Hand |n| characters starting at |s| to the iterator's own |write|.
  static void write(OutputIterator& iter, const sink_char_type* s, uint32_t n)
  {
    iter.write(s, n);
  }
};
590 template <class CharT>
591 struct nsCharSinkTraits<CharT*>
592 {
593 static
594 void
595 write( CharT*& iter, const CharT* s, uint32_t n )
596 {
597 nsCharTraits<CharT>::move(iter, s, n);
598 iter += n;
599 }
600 };
602 #endif // !defined(nsCharTraits_h___)