|
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "nsUnicharUtils.h" |
|
7 #include "nsXPCOMStrings.h" |
|
8 #include "nsUTF8Utils.h" |
|
9 #include "nsUnicodeProperties.h" |
|
10 #include "mozilla/Likely.h" |
|
11 #include "mozilla/HashFunctions.h" |
|
12 |
|
13 // We map x -> x, except for upper-case letters, |
|
14 // which we map to their lower-case equivalents. |
|
15 static const uint8_t gASCIIToLower [128] = { |
|
16 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
|
17 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, |
|
18 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, |
|
19 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, |
|
20 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, |
|
21 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, |
|
22 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, |
|
23 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, |
|
24 }; |
|
25 |
|
26 #define IS_ASCII(u) ((u) < 0x80) |
|
27 #define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z')) |
|
28 #define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z')) |
|
29 #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u)) |
|
30 #define IS_ASCII_SPACE(u) (' ' == (u)) |
|
31 |
|
32 // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast |
|
33 // when they're called from within the case-insensitive comparators, so we |
|
34 // define inlined versions. |
|
35 static MOZ_ALWAYS_INLINE uint32_t |
|
36 ToLowerCase_inline(uint32_t aChar) |
|
37 { |
|
38 if (IS_ASCII(aChar)) { |
|
39 return gASCIIToLower[aChar]; |
|
40 } |
|
41 |
|
42 return mozilla::unicode::GetLowercase(aChar); |
|
43 } |
|
44 |
|
45 static MOZ_ALWAYS_INLINE uint32_t |
|
46 ToLowerCaseASCII_inline(const uint32_t aChar) |
|
47 { |
|
48 if (IS_ASCII(aChar)) { |
|
49 return gASCIIToLower[aChar]; |
|
50 } |
|
51 |
|
52 return aChar; |
|
53 } |
|
54 |
|
55 void |
|
56 ToLowerCase(nsAString& aString) |
|
57 { |
|
58 char16_t *buf = aString.BeginWriting(); |
|
59 ToLowerCase(buf, buf, aString.Length()); |
|
60 } |
|
61 |
|
62 void |
|
63 ToLowerCase(const nsAString& aSource, |
|
64 nsAString& aDest) |
|
65 { |
|
66 const char16_t *in; |
|
67 char16_t *out; |
|
68 uint32_t len = NS_StringGetData(aSource, &in); |
|
69 NS_StringGetMutableData(aDest, len, &out); |
|
70 NS_ASSERTION(out, "Uh..."); |
|
71 ToLowerCase(in, out, len); |
|
72 } |
|
73 |
|
74 uint32_t |
|
75 ToLowerCaseASCII(const uint32_t aChar) |
|
76 { |
|
77 return ToLowerCaseASCII_inline(aChar); |
|
78 } |
|
79 |
|
80 void |
|
81 ToUpperCase(nsAString& aString) |
|
82 { |
|
83 char16_t *buf = aString.BeginWriting(); |
|
84 ToUpperCase(buf, buf, aString.Length()); |
|
85 } |
|
86 |
|
87 void |
|
88 ToUpperCase(const nsAString& aSource, |
|
89 nsAString& aDest) |
|
90 { |
|
91 const char16_t *in; |
|
92 char16_t *out; |
|
93 uint32_t len = NS_StringGetData(aSource, &in); |
|
94 NS_StringGetMutableData(aDest, len, &out); |
|
95 NS_ASSERTION(out, "Uh..."); |
|
96 ToUpperCase(in, out, len); |
|
97 } |
|
98 |
|
99 #ifdef MOZILLA_INTERNAL_API |
|
100 |
|
101 int32_t |
|
102 nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs, |
|
103 const char16_t* rhs, |
|
104 uint32_t lLength, |
|
105 uint32_t rLength) const |
|
106 { |
|
107 return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) : |
|
108 (lLength > rLength) ? 1 : -1; |
|
109 } |
|
110 |
|
111 int32_t |
|
112 nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs, |
|
113 const char* rhs, |
|
114 uint32_t lLength, |
|
115 uint32_t rLength) const |
|
116 { |
|
117 return CaseInsensitiveCompare(lhs, rhs, lLength, rLength); |
|
118 } |
|
119 |
|
120 int32_t |
|
121 nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs, |
|
122 const char16_t* rhs, |
|
123 uint32_t lLength, |
|
124 uint32_t rLength) const |
|
125 { |
|
126 if (lLength != rLength) { |
|
127 if (lLength > rLength) |
|
128 return 1; |
|
129 return -1; |
|
130 } |
|
131 |
|
132 while (rLength) { |
|
133 // we don't care about surrogates here, because we're only |
|
134 // lowercasing the ASCII range |
|
135 char16_t l = *lhs++; |
|
136 char16_t r = *rhs++; |
|
137 if (l != r) { |
|
138 l = ToLowerCaseASCII_inline(l); |
|
139 r = ToLowerCaseASCII_inline(r); |
|
140 |
|
141 if (l > r) |
|
142 return 1; |
|
143 else if (r > l) |
|
144 return -1; |
|
145 } |
|
146 rLength--; |
|
147 } |
|
148 |
|
149 return 0; |
|
150 } |
|
151 |
|
152 #endif // MOZILLA_INTERNAL_API |
|
153 |
|
154 uint32_t |
|
155 ToLowerCase(uint32_t aChar) |
|
156 { |
|
157 return ToLowerCase_inline(aChar); |
|
158 } |
|
159 |
|
160 void |
|
161 ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen) |
|
162 { |
|
163 for (uint32_t i = 0; i < aLen; i++) { |
|
164 uint32_t ch = aIn[i]; |
|
165 if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 && |
|
166 NS_IS_LOW_SURROGATE(aIn[i + 1])) { |
|
167 ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); |
|
168 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); |
|
169 aOut[i++] = H_SURROGATE(ch); |
|
170 aOut[i] = L_SURROGATE(ch); |
|
171 continue; |
|
172 } |
|
173 aOut[i] = ToLowerCase(ch); |
|
174 } |
|
175 } |
|
176 |
|
177 uint32_t |
|
178 ToUpperCase(uint32_t aChar) |
|
179 { |
|
180 if (IS_ASCII(aChar)) { |
|
181 if (IS_ASCII_LOWER(aChar)) { |
|
182 return aChar - 0x20; |
|
183 } |
|
184 return aChar; |
|
185 } |
|
186 |
|
187 return mozilla::unicode::GetUppercase(aChar); |
|
188 } |
|
189 |
|
190 void |
|
191 ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen) |
|
192 { |
|
193 for (uint32_t i = 0; i < aLen; i++) { |
|
194 uint32_t ch = aIn[i]; |
|
195 if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 && |
|
196 NS_IS_LOW_SURROGATE(aIn[i + 1])) { |
|
197 ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); |
|
198 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); |
|
199 aOut[i++] = H_SURROGATE(ch); |
|
200 aOut[i] = L_SURROGATE(ch); |
|
201 continue; |
|
202 } |
|
203 aOut[i] = ToUpperCase(ch); |
|
204 } |
|
205 } |
|
206 |
|
207 uint32_t |
|
208 ToTitleCase(uint32_t aChar) |
|
209 { |
|
210 if (IS_ASCII(aChar)) { |
|
211 return ToUpperCase(aChar); |
|
212 } |
|
213 |
|
214 return mozilla::unicode::GetTitlecaseForLower(aChar); |
|
215 } |
|
216 |
|
217 int32_t |
|
218 CaseInsensitiveCompare(const char16_t *a, |
|
219 const char16_t *b, |
|
220 uint32_t len) |
|
221 { |
|
222 NS_ASSERTION(a && b, "Do not pass in invalid pointers!"); |
|
223 |
|
224 if (len) { |
|
225 do { |
|
226 uint32_t c1 = *a++; |
|
227 uint32_t c2 = *b++; |
|
228 |
|
229 // Unfortunately, we need to check for surrogates BEFORE we check |
|
230 // for equality, because we could have identical high surrogates |
|
231 // but non-identical characters, so we can't just skip them |
|
232 |
|
233 // If c1 isn't a surrogate, we don't bother to check c2; |
|
234 // in the case where it _is_ a surrogate, we're definitely going to get |
|
235 // a mismatch, and don't need to interpret and lowercase it |
|
236 |
|
237 if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) { |
|
238 c1 = SURROGATE_TO_UCS4(c1, *a++); |
|
239 if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) { |
|
240 c2 = SURROGATE_TO_UCS4(c2, *b++); |
|
241 } |
|
242 // If c2 wasn't a surrogate, decrementing len means we'd stop |
|
243 // short of the end of string b, but that doesn't actually matter |
|
244 // because we're going to find a mismatch and return early |
|
245 --len; |
|
246 } |
|
247 |
|
248 if (c1 != c2) { |
|
249 c1 = ToLowerCase_inline(c1); |
|
250 c2 = ToLowerCase_inline(c2); |
|
251 if (c1 != c2) { |
|
252 if (c1 < c2) { |
|
253 return -1; |
|
254 } |
|
255 return 1; |
|
256 } |
|
257 } |
|
258 } while (--len != 0); |
|
259 } |
|
260 return 0; |
|
261 } |
|
262 |
|
263 // Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext |
|
264 // to the byte following the end of the sequence. |
|
265 // |
|
266 // If the sequence is invalid, or if computing the codepoint would take us off |
|
267 // the end of the string (as marked by aEnd), returns -1 and does not set |
|
268 // aNext. Note that this function doesn't check that aStr < aEnd -- it assumes |
|
269 // you've done that already. |
|
270 static MOZ_ALWAYS_INLINE uint32_t |
|
271 GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext) |
|
272 { |
|
273 // Convert to unsigned char so that stuffing chars into PRUint32s doesn't |
|
274 // sign extend. |
|
275 const unsigned char *str = (unsigned char*)aStr; |
|
276 |
|
277 if (UTF8traits::isASCII(str[0])) { |
|
278 // It's ASCII; just convert to lower-case and return it. |
|
279 *aNext = aStr + 1; |
|
280 return gASCIIToLower[*str]; |
|
281 } |
|
282 if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) { |
|
283 // It's a two-byte sequence, so it looks like |
|
284 // 110XXXXX 10XXXXXX. |
|
285 // This is definitely in the BMP, so we can store straightaway into a |
|
286 // uint16_t. |
|
287 |
|
288 uint16_t c; |
|
289 c = (str[0] & 0x1F) << 6; |
|
290 c += (str[1] & 0x3F); |
|
291 |
|
292 // we don't go through ToLowerCase here, because we know this isn't |
|
293 // an ASCII character so the ASCII fast-path there is useless |
|
294 c = mozilla::unicode::GetLowercase(c); |
|
295 |
|
296 *aNext = aStr + 2; |
|
297 return c; |
|
298 } |
|
299 if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) { |
|
300 // It's a three-byte sequence, so it looks like |
|
301 // 1110XXXX 10XXXXXX 10XXXXXX. |
|
302 // This will just barely fit into 16-bits, so store into a uint16_t. |
|
303 |
|
304 uint16_t c; |
|
305 c = (str[0] & 0x0F) << 12; |
|
306 c += (str[1] & 0x3F) << 6; |
|
307 c += (str[2] & 0x3F); |
|
308 |
|
309 c = mozilla::unicode::GetLowercase(c); |
|
310 |
|
311 *aNext = aStr + 3; |
|
312 return c; |
|
313 } |
|
314 if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) { |
|
315 // It's a four-byte sequence, so it looks like |
|
316 // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX. |
|
317 |
|
318 uint32_t c; |
|
319 c = (str[0] & 0x07) << 18; |
|
320 c += (str[1] & 0x3F) << 12; |
|
321 c += (str[2] & 0x3F) << 6; |
|
322 c += (str[3] & 0x3F); |
|
323 |
|
324 c = mozilla::unicode::GetLowercase(c); |
|
325 |
|
326 *aNext = aStr + 4; |
|
327 return c; |
|
328 } |
|
329 |
|
330 // Hm, we don't understand this sequence. |
|
331 return -1; |
|
332 } |
|
333 |
|
334 int32_t CaseInsensitiveCompare(const char *aLeft, |
|
335 const char *aRight, |
|
336 uint32_t aLeftBytes, |
|
337 uint32_t aRightBytes) |
|
338 { |
|
339 const char *leftEnd = aLeft + aLeftBytes; |
|
340 const char *rightEnd = aRight + aRightBytes; |
|
341 |
|
342 while (aLeft < leftEnd && aRight < rightEnd) { |
|
343 uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft); |
|
344 if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) |
|
345 return -1; |
|
346 |
|
347 uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight); |
|
348 if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) |
|
349 return -1; |
|
350 |
|
351 // Now leftChar and rightChar are lower-case, so we can compare them. |
|
352 if (leftChar != rightChar) { |
|
353 if (leftChar > rightChar) |
|
354 return 1; |
|
355 return -1; |
|
356 } |
|
357 } |
|
358 |
|
359 // Make sure that if one string is longer than the other we return the |
|
360 // correct result. |
|
361 if (aLeft < leftEnd) |
|
362 return 1; |
|
363 if (aRight < rightEnd) |
|
364 return -1; |
|
365 |
|
366 return 0; |
|
367 } |
|
368 |
|
369 bool |
|
370 CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight, |
|
371 const char* aLeftEnd, const char* aRightEnd, |
|
372 const char** aLeftNext, const char** aRightNext, |
|
373 bool* aErr) |
|
374 { |
|
375 NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null."); |
|
376 NS_ASSERTION(aRightNext, "Out pointer shouldn't be null."); |
|
377 NS_ASSERTION(aErr, "Out pointer shouldn't be null."); |
|
378 NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd."); |
|
379 NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd."); |
|
380 |
|
381 uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext); |
|
382 if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) { |
|
383 *aErr = true; |
|
384 return false; |
|
385 } |
|
386 |
|
387 uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext); |
|
388 if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) { |
|
389 *aErr = true; |
|
390 return false; |
|
391 } |
|
392 |
|
393 // Can't have an error past this point. |
|
394 *aErr = false; |
|
395 |
|
396 return leftChar == rightChar; |
|
397 } |
|
398 |
|
399 namespace mozilla { |
|
400 |
|
401 uint32_t |
|
402 HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr) |
|
403 { |
|
404 uint32_t hash = 0; |
|
405 const char* s = aUTF8; |
|
406 const char* end = aUTF8 + aLength; |
|
407 |
|
408 *aErr = false; |
|
409 |
|
410 while (s < end) |
|
411 { |
|
412 uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr); |
|
413 if (*aErr) { |
|
414 return 0; |
|
415 } |
|
416 |
|
417 if (ucs4 < PLANE1_BASE) { |
|
418 hash = AddToHash(hash, ucs4); |
|
419 } |
|
420 else { |
|
421 hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4)); |
|
422 } |
|
423 } |
|
424 |
|
425 return hash; |
|
426 } |
|
427 |
|
428 } // namespace mozilla |