|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #ifndef nsCharTraits_h___ |
|
7 #define nsCharTraits_h___ |
|
8 |
|
9 #include <ctype.h> // for |EOF|, |WEOF| |
|
10 #include <string.h> // for |memcpy|, et al |
|
11 |
|
12 #include "nscore.h" // for |char16_t| |
|
13 |
|
14 // This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in |
|
15 // particular the standalone software updater. In that case stub out |
|
16 // the macros provided by nsDebug.h which are only usable when linking XPCOM |
|
17 |
|
18 #ifdef NS_NO_XPCOM |
|
19 #define NS_WARNING(msg) |
|
20 #define NS_ASSERTION(cond, msg) |
|
21 #define NS_ERROR(msg) |
|
22 #else |
|
23 #include "nsDebug.h" // for NS_ASSERTION |
|
24 #endif |
|
25 |
|
26 /* |
|
27 * Some macros for converting char16_t (UTF-16) to and from Unicode scalar |
|
28 * values. |
|
29 * |
|
30 * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by |
|
31 * using "surrogate pairs". These consist of a high surrogate, i.e. a code |
|
32 * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point |
|
33 * in the range U+DC00 - U+DFFF, like this: |
|
34 * |
|
35 * U+D800 U+DC00 = U+10000 |
|
36 * U+D800 U+DC01 = U+10001 |
|
37 * ... |
|
38 * U+DBFF U+DFFE = U+10FFFE |
|
39 * U+DBFF U+DFFF = U+10FFFF |
|
40 * |
|
41 * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode |
|
42 * scalar values and are not well-formed UTF-16 except as high-surrogate / |
|
43 * low-surrogate pairs. |
|
44 */ |
|
45 |
|
// First code point past the Basic Multilingual Plane (U+10000).
#define PLANE1_BASE uint32_t(0x00010000)
// High surrogates are in the range 0xD800 -- 0xDBFF
#define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800)
// Low surrogates are in the range 0xDC00 -- 0xDFFF
#define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00)
// Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE:
// masking off the low 11 bits covers the whole 0xD800 -- 0xDFFF range
// with a single comparison.
#define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800)

// Everything else is not a surrogate: 0x0000 -- 0xD7FF, 0xE000 -- 0xFFFF

// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00)
// I wonder whether we could somehow assert that H is a high surrogate
// and L is a low surrogate
// NOTE: nothing here validates h or l; only their low 10 bits are used.
#define SURROGATE_TO_UCS4(h, l) (((uint32_t(h) & 0x03FF) << 10) + \
                                 (uint32_t(l) & 0x03FF) + PLANE1_BASE)

// Extract surrogates from a UCS4 char
// Reference: the Unicode standard 4.0, section 3.9
// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and
// 0xD7C0 == 0xD800 - 0x0080,
// ((c - 0x10000) >> 10) + 0xD800 can be simplified to
#define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + \
                                char16_t(0xD7C0))
// where it's to be noted that 0xD7C0 is not bitwise-OR'd
// but added.

// Since 0x10000 & 0x03FF == 0,
// (c - 0x10000) & 0x03FF == c & 0x03FF so that
// ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to
#define L_SURROGATE(c) char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | \
                                char16_t(0xDC00))

// True when ucs fits in a single UTF-16 code unit (i.e. is below U+10000).
#define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE)
// U+FFFD, substituted by ENSURE_VALID_CHAR for invalid values.
#define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD)

// One past the largest code point (U+10FFFF).
#define UCS_END uint32_t(0x00110000)
// A value is valid when it is below UCS_END and is not a surrogate.
#define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c))
// Yields c when it is valid, otherwise UCS2_REPLACEMENT_CHAR.
// Caution: the expansion mentions c more than once, so do not pass an
// expression with side effects.
#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR)
|
84 |
|
// Primary template: intentionally empty.  The usable traits are supplied
// by explicit specializations for concrete character types.
template <typename CharT>
struct nsCharTraits
{
};
|
86 |
|
87 template <> |
|
88 struct nsCharTraits<char16_t> |
|
89 { |
|
90 typedef char16_t char_type; |
|
91 typedef uint16_t unsigned_char_type; |
|
92 typedef char incompatible_char_type; |
|
93 |
|
94 static char_type* const sEmptyBuffer; |
|
95 |
|
96 static |
|
97 void |
|
98 assign( char_type& lhs, char_type rhs ) |
|
99 { |
|
100 lhs = rhs; |
|
101 } |
|
102 |
|
103 |
|
104 // integer representation of characters: |
|
105 typedef int int_type; |
|
106 |
|
107 static |
|
108 char_type |
|
109 to_char_type( int_type c ) |
|
110 { |
|
111 return char_type(c); |
|
112 } |
|
113 |
|
114 static |
|
115 int_type |
|
116 to_int_type( char_type c ) |
|
117 { |
|
118 return int_type( static_cast<unsigned_char_type>(c) ); |
|
119 } |
|
120 |
|
121 static |
|
122 bool |
|
123 eq_int_type( int_type lhs, int_type rhs ) |
|
124 { |
|
125 return lhs == rhs; |
|
126 } |
|
127 |
|
128 |
|
129 // |char_type| comparisons: |
|
130 |
|
131 static |
|
132 bool |
|
133 eq( char_type lhs, char_type rhs ) |
|
134 { |
|
135 return lhs == rhs; |
|
136 } |
|
137 |
|
138 static |
|
139 bool |
|
140 lt( char_type lhs, char_type rhs ) |
|
141 { |
|
142 return lhs < rhs; |
|
143 } |
|
144 |
|
145 |
|
146 // operations on s[n] arrays: |
|
147 |
|
148 static |
|
149 char_type* |
|
150 move( char_type* s1, const char_type* s2, size_t n ) |
|
151 { |
|
152 return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type))); |
|
153 } |
|
154 |
|
155 static |
|
156 char_type* |
|
157 copy( char_type* s1, const char_type* s2, size_t n ) |
|
158 { |
|
159 return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type))); |
|
160 } |
|
161 |
|
162 static |
|
163 char_type* |
|
164 copyASCII( char_type* s1, const char* s2, size_t n ) |
|
165 { |
|
166 for (char_type* s = s1; n--; ++s, ++s2) { |
|
167 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
168 *s = *s2; |
|
169 } |
|
170 return s1; |
|
171 } |
|
172 |
|
173 static |
|
174 char_type* |
|
175 assign( char_type* s, size_t n, char_type c ) |
|
176 { |
|
177 char_type* result = s; |
|
178 while ( n-- ) |
|
179 assign(*s++, c); |
|
180 return result; |
|
181 } |
|
182 |
|
183 static |
|
184 int |
|
185 compare( const char_type* s1, const char_type* s2, size_t n ) |
|
186 { |
|
187 for ( ; n--; ++s1, ++s2 ) |
|
188 { |
|
189 if ( !eq(*s1, *s2) ) |
|
190 return to_int_type(*s1) - to_int_type(*s2); |
|
191 } |
|
192 |
|
193 return 0; |
|
194 } |
|
195 |
|
196 static |
|
197 int |
|
198 compareASCII( const char_type* s1, const char* s2, size_t n ) |
|
199 { |
|
200 for ( ; n--; ++s1, ++s2 ) |
|
201 { |
|
202 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
203 if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) ) |
|
204 return to_int_type(*s1) - to_int_type(*s2); |
|
205 } |
|
206 |
|
207 return 0; |
|
208 } |
|
209 |
|
210 // this version assumes that s2 is null-terminated and s1 has length n. |
|
211 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, |
|
212 // we return 1. |
|
213 static |
|
214 int |
|
215 compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) |
|
216 { |
|
217 for ( ; n--; ++s1, ++s2 ) |
|
218 { |
|
219 if ( !*s2 ) |
|
220 return 1; |
|
221 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
222 if ( !eq_int_type(to_int_type(*s1), to_int_type(*s2)) ) |
|
223 return to_int_type(*s1) - to_int_type(*s2); |
|
224 } |
|
225 |
|
226 if ( *s2 ) |
|
227 return -1; |
|
228 |
|
229 return 0; |
|
230 } |
|
231 |
|
232 /** |
|
233 * Convert c to its lower-case form, but only if c is in the ASCII |
|
234 * range. Otherwise leave it alone. |
|
235 */ |
|
236 static |
|
237 char_type |
|
238 ASCIIToLower( char_type c ) |
|
239 { |
|
240 if (c >= 'A' && c <= 'Z') |
|
241 return char_type(c + ('a' - 'A')); |
|
242 |
|
243 return c; |
|
244 } |
|
245 |
|
246 static |
|
247 int |
|
248 compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n ) |
|
249 { |
|
250 for ( ; n--; ++s1, ++s2 ) |
|
251 { |
|
252 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
253 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), |
|
254 "Unexpected uppercase character"); |
|
255 char_type lower_s1 = ASCIIToLower(*s1); |
|
256 if ( lower_s1 != to_char_type(*s2) ) |
|
257 return to_int_type(lower_s1) - to_int_type(*s2); |
|
258 } |
|
259 |
|
260 return 0; |
|
261 } |
|
262 |
|
263 // this version assumes that s2 is null-terminated and s1 has length n. |
|
264 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, |
|
265 // we return 1. |
|
266 static |
|
267 int |
|
268 compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) |
|
269 { |
|
270 for ( ; n--; ++s1, ++s2 ) |
|
271 { |
|
272 if ( !*s2 ) |
|
273 return 1; |
|
274 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
275 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), |
|
276 "Unexpected uppercase character"); |
|
277 char_type lower_s1 = ASCIIToLower(*s1); |
|
278 if ( lower_s1 != to_char_type(*s2) ) |
|
279 return to_int_type(lower_s1) - to_int_type(*s2); |
|
280 } |
|
281 |
|
282 if ( *s2 ) |
|
283 return -1; |
|
284 |
|
285 return 0; |
|
286 } |
|
287 |
|
288 static |
|
289 size_t |
|
290 length( const char_type* s ) |
|
291 { |
|
292 size_t result = 0; |
|
293 while ( !eq(*s++, char_type(0)) ) |
|
294 ++result; |
|
295 return result; |
|
296 } |
|
297 |
|
298 static |
|
299 const char_type* |
|
300 find( const char_type* s, size_t n, char_type c ) |
|
301 { |
|
302 while ( n-- ) |
|
303 { |
|
304 if ( eq(*s, c) ) |
|
305 return s; |
|
306 ++s; |
|
307 } |
|
308 |
|
309 return 0; |
|
310 } |
|
311 }; |
|
312 |
|
313 template <> |
|
314 struct nsCharTraits<char> |
|
315 { |
|
316 typedef char char_type; |
|
317 typedef unsigned char unsigned_char_type; |
|
318 typedef char16_t incompatible_char_type; |
|
319 |
|
320 static char_type* const sEmptyBuffer; |
|
321 |
|
322 static |
|
323 void |
|
324 assign( char_type& lhs, char_type rhs ) |
|
325 { |
|
326 lhs = rhs; |
|
327 } |
|
328 |
|
329 |
|
330 // integer representation of characters: |
|
331 |
|
332 typedef int int_type; |
|
333 |
|
334 static |
|
335 char_type |
|
336 to_char_type( int_type c ) |
|
337 { |
|
338 return char_type(c); |
|
339 } |
|
340 |
|
341 static |
|
342 int_type |
|
343 to_int_type( char_type c ) |
|
344 { |
|
345 return int_type( static_cast<unsigned_char_type>(c) ); |
|
346 } |
|
347 |
|
348 static |
|
349 bool |
|
350 eq_int_type( int_type lhs, int_type rhs ) |
|
351 { |
|
352 return lhs == rhs; |
|
353 } |
|
354 |
|
355 |
|
356 // |char_type| comparisons: |
|
357 |
|
358 static |
|
359 bool |
|
360 eq( char_type lhs, char_type rhs ) |
|
361 { |
|
362 return lhs == rhs; |
|
363 } |
|
364 |
|
365 static |
|
366 bool |
|
367 lt( char_type lhs, char_type rhs ) |
|
368 { |
|
369 return lhs < rhs; |
|
370 } |
|
371 |
|
372 |
|
373 // operations on s[n] arrays: |
|
374 |
|
375 static |
|
376 char_type* |
|
377 move( char_type* s1, const char_type* s2, size_t n ) |
|
378 { |
|
379 return static_cast<char_type*>(memmove(s1, s2, n * sizeof(char_type))); |
|
380 } |
|
381 |
|
382 static |
|
383 char_type* |
|
384 copy( char_type* s1, const char_type* s2, size_t n ) |
|
385 { |
|
386 return static_cast<char_type*>(memcpy(s1, s2, n * sizeof(char_type))); |
|
387 } |
|
388 |
|
389 static |
|
390 char_type* |
|
391 copyASCII( char_type* s1, const char* s2, size_t n ) |
|
392 { |
|
393 return copy(s1, s2, n); |
|
394 } |
|
395 |
|
396 static |
|
397 char_type* |
|
398 assign( char_type* s, size_t n, char_type c ) |
|
399 { |
|
400 return static_cast<char_type*>(memset(s, to_int_type(c), n)); |
|
401 } |
|
402 |
|
403 static |
|
404 int |
|
405 compare( const char_type* s1, const char_type* s2, size_t n ) |
|
406 { |
|
407 return memcmp(s1, s2, n); |
|
408 } |
|
409 |
|
410 static |
|
411 int |
|
412 compareASCII( const char_type* s1, const char* s2, size_t n ) |
|
413 { |
|
414 #ifdef DEBUG |
|
415 for (size_t i = 0; i < n; ++i) |
|
416 { |
|
417 NS_ASSERTION(!(s2[i] & ~0x7F), "Unexpected non-ASCII character"); |
|
418 } |
|
419 #endif |
|
420 return compare(s1, s2, n); |
|
421 } |
|
422 |
|
423 // this version assumes that s2 is null-terminated and s1 has length n. |
|
424 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, |
|
425 // we return 1. |
|
426 static |
|
427 int |
|
428 compareASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) |
|
429 { |
|
430 // can't use strcmp here because we don't want to stop when s1 |
|
431 // contains a null |
|
432 for ( ; n--; ++s1, ++s2 ) |
|
433 { |
|
434 if ( !*s2 ) |
|
435 return 1; |
|
436 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
437 if ( *s1 != *s2 ) |
|
438 return to_int_type(*s1) - to_int_type(*s2); |
|
439 } |
|
440 |
|
441 if ( *s2 ) |
|
442 return -1; |
|
443 |
|
444 return 0; |
|
445 } |
|
446 |
|
447 /** |
|
448 * Convert c to its lower-case form, but only if c is ASCII. |
|
449 */ |
|
450 static |
|
451 char_type |
|
452 ASCIIToLower( char_type c ) |
|
453 { |
|
454 if (c >= 'A' && c <= 'Z') |
|
455 return char_type(c + ('a' - 'A')); |
|
456 |
|
457 return c; |
|
458 } |
|
459 |
|
460 static |
|
461 int |
|
462 compareLowerCaseToASCII( const char_type* s1, const char* s2, size_t n ) |
|
463 { |
|
464 for ( ; n--; ++s1, ++s2 ) |
|
465 { |
|
466 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
467 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), |
|
468 "Unexpected uppercase character"); |
|
469 char_type lower_s1 = ASCIIToLower(*s1); |
|
470 if ( lower_s1 != *s2 ) |
|
471 return to_int_type(lower_s1) - to_int_type(*s2); |
|
472 } |
|
473 return 0; |
|
474 } |
|
475 |
|
476 // this version assumes that s2 is null-terminated and s1 has length n. |
|
477 // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, |
|
478 // we return 1. |
|
479 static |
|
480 int |
|
481 compareLowerCaseToASCIINullTerminated( const char_type* s1, size_t n, const char* s2 ) |
|
482 { |
|
483 for ( ; n--; ++s1, ++s2 ) |
|
484 { |
|
485 if ( !*s2 ) |
|
486 return 1; |
|
487 NS_ASSERTION(!(*s2 & ~0x7F), "Unexpected non-ASCII character"); |
|
488 NS_ASSERTION(!(*s2 >= 'A' && *s2 <= 'Z'), |
|
489 "Unexpected uppercase character"); |
|
490 char_type lower_s1 = ASCIIToLower(*s1); |
|
491 if ( lower_s1 != *s2 ) |
|
492 return to_int_type(lower_s1) - to_int_type(*s2); |
|
493 } |
|
494 |
|
495 if ( *s2 ) |
|
496 return -1; |
|
497 |
|
498 return 0; |
|
499 } |
|
500 |
|
501 static |
|
502 size_t |
|
503 length( const char_type* s ) |
|
504 { |
|
505 return strlen(s); |
|
506 } |
|
507 |
|
508 static |
|
509 const char_type* |
|
510 find( const char_type* s, size_t n, char_type c ) |
|
511 { |
|
512 return reinterpret_cast<const char_type*>(memchr(s, to_int_type(c), n)); |
|
513 } |
|
514 }; |
|
515 |
|
/**
 * Read-side adapter for iterator classes.  The iterator type is expected to
 * provide |difference_type|, |value_type|, |get()| and |advance(n)|; each
 * helper here simply forwards to those members.
 */
template <class InputIterator>
struct nsCharSourceTraits
{
  typedef typename InputIterator::difference_type difference_type;

  // Distance between the positions reported by |get()| on both iterators.
  static uint32_t readable_distance(const InputIterator& first, const InputIterator& last)
  {
    // assumes single fragment
    return static_cast<uint32_t>(last.get() - first.get());
  }

  // The iterator's current position, as returned by |get()|.
  static const typename InputIterator::value_type* read(const InputIterator& iter)
  {
    return iter.get();
  }

  // Step |s| by |n| using the iterator's own |advance()|.
  static void advance(InputIterator& s, difference_type n)
  {
    s.advance(n);
  }
};
|
543 |
|
544 template <class CharT> |
|
545 struct nsCharSourceTraits<CharT*> |
|
546 { |
|
547 typedef ptrdiff_t difference_type; |
|
548 |
|
549 static |
|
550 uint32_t |
|
551 readable_distance( CharT* s ) |
|
552 { |
|
553 return uint32_t(nsCharTraits<CharT>::length(s)); |
|
554 // return numeric_limits<uint32_t>::max(); |
|
555 } |
|
556 |
|
557 static |
|
558 uint32_t |
|
559 readable_distance( CharT* first, CharT* last ) |
|
560 { |
|
561 return uint32_t(last-first); |
|
562 } |
|
563 |
|
564 static |
|
565 const CharT* |
|
566 read( CharT* s ) |
|
567 { |
|
568 return s; |
|
569 } |
|
570 |
|
571 static |
|
572 void |
|
573 advance( CharT*& s, difference_type n ) |
|
574 { |
|
575 s += n; |
|
576 } |
|
577 }; |
|
578 |
|
/**
 * Write-side adapter for iterator classes.  The iterator type is expected
 * to provide |value_type| and |write(s, n)|.
 */
template <class OutputIterator>
struct nsCharSinkTraits
{
  typedef typename OutputIterator::value_type sink_value_type;

  // Forward |n| characters starting at |s| to the iterator's own |write()|.
  static void write(OutputIterator& iter, const sink_value_type* s, uint32_t n)
  {
    iter.write(s, n);
  }
};
|
589 |
|
590 template <class CharT> |
|
591 struct nsCharSinkTraits<CharT*> |
|
592 { |
|
593 static |
|
594 void |
|
595 write( CharT*& iter, const CharT* s, uint32_t n ) |
|
596 { |
|
597 nsCharTraits<CharT>::move(iter, s, n); |
|
598 iter += n; |
|
599 } |
|
600 }; |
|
601 |
|
602 #endif // !defined(nsCharTraits_h___) |