intl/unicharutil/util/nsUnicharUtils.cpp

branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
equal deleted inserted replaced
-1:000000000000 0:8e173de686d6
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #include "nsUnicharUtils.h"
7 #include "nsXPCOMStrings.h"
8 #include "nsUTF8Utils.h"
9 #include "nsUnicodeProperties.h"
10 #include "mozilla/Likely.h"
11 #include "mozilla/HashFunctions.h"
12
13 // We map x -> x, except for upper-case letters,
14 // which we map to their lower-case equivalents.
15 static const uint8_t gASCIIToLower [128] = {
16 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
17 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
18 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
19 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
20 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
21 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
22 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
23 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
24 };
25
26 #define IS_ASCII(u) ((u) < 0x80)
27 #define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z'))
28 #define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z'))
29 #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
30 #define IS_ASCII_SPACE(u) (' ' == (u))
31
32 // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
33 // when they're called from within the case-insensitive comparators, so we
34 // define inlined versions.
35 static MOZ_ALWAYS_INLINE uint32_t
36 ToLowerCase_inline(uint32_t aChar)
37 {
38 if (IS_ASCII(aChar)) {
39 return gASCIIToLower[aChar];
40 }
41
42 return mozilla::unicode::GetLowercase(aChar);
43 }
44
45 static MOZ_ALWAYS_INLINE uint32_t
46 ToLowerCaseASCII_inline(const uint32_t aChar)
47 {
48 if (IS_ASCII(aChar)) {
49 return gASCIIToLower[aChar];
50 }
51
52 return aChar;
53 }
54
55 void
56 ToLowerCase(nsAString& aString)
57 {
58 char16_t *buf = aString.BeginWriting();
59 ToLowerCase(buf, buf, aString.Length());
60 }
61
62 void
63 ToLowerCase(const nsAString& aSource,
64 nsAString& aDest)
65 {
66 const char16_t *in;
67 char16_t *out;
68 uint32_t len = NS_StringGetData(aSource, &in);
69 NS_StringGetMutableData(aDest, len, &out);
70 NS_ASSERTION(out, "Uh...");
71 ToLowerCase(in, out, len);
72 }
73
74 uint32_t
75 ToLowerCaseASCII(const uint32_t aChar)
76 {
77 return ToLowerCaseASCII_inline(aChar);
78 }
79
80 void
81 ToUpperCase(nsAString& aString)
82 {
83 char16_t *buf = aString.BeginWriting();
84 ToUpperCase(buf, buf, aString.Length());
85 }
86
87 void
88 ToUpperCase(const nsAString& aSource,
89 nsAString& aDest)
90 {
91 const char16_t *in;
92 char16_t *out;
93 uint32_t len = NS_StringGetData(aSource, &in);
94 NS_StringGetMutableData(aDest, len, &out);
95 NS_ASSERTION(out, "Uh...");
96 ToUpperCase(in, out, len);
97 }
98
99 #ifdef MOZILLA_INTERNAL_API
100
101 int32_t
102 nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
103 const char16_t* rhs,
104 uint32_t lLength,
105 uint32_t rLength) const
106 {
107 return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
108 (lLength > rLength) ? 1 : -1;
109 }
110
111 int32_t
112 nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
113 const char* rhs,
114 uint32_t lLength,
115 uint32_t rLength) const
116 {
117 return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
118 }
119
120 int32_t
121 nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
122 const char16_t* rhs,
123 uint32_t lLength,
124 uint32_t rLength) const
125 {
126 if (lLength != rLength) {
127 if (lLength > rLength)
128 return 1;
129 return -1;
130 }
131
132 while (rLength) {
133 // we don't care about surrogates here, because we're only
134 // lowercasing the ASCII range
135 char16_t l = *lhs++;
136 char16_t r = *rhs++;
137 if (l != r) {
138 l = ToLowerCaseASCII_inline(l);
139 r = ToLowerCaseASCII_inline(r);
140
141 if (l > r)
142 return 1;
143 else if (r > l)
144 return -1;
145 }
146 rLength--;
147 }
148
149 return 0;
150 }
151
152 #endif // MOZILLA_INTERNAL_API
153
154 uint32_t
155 ToLowerCase(uint32_t aChar)
156 {
157 return ToLowerCase_inline(aChar);
158 }
159
160 void
161 ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
162 {
163 for (uint32_t i = 0; i < aLen; i++) {
164 uint32_t ch = aIn[i];
165 if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
166 NS_IS_LOW_SURROGATE(aIn[i + 1])) {
167 ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
168 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
169 aOut[i++] = H_SURROGATE(ch);
170 aOut[i] = L_SURROGATE(ch);
171 continue;
172 }
173 aOut[i] = ToLowerCase(ch);
174 }
175 }
176
177 uint32_t
178 ToUpperCase(uint32_t aChar)
179 {
180 if (IS_ASCII(aChar)) {
181 if (IS_ASCII_LOWER(aChar)) {
182 return aChar - 0x20;
183 }
184 return aChar;
185 }
186
187 return mozilla::unicode::GetUppercase(aChar);
188 }
189
190 void
191 ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
192 {
193 for (uint32_t i = 0; i < aLen; i++) {
194 uint32_t ch = aIn[i];
195 if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
196 NS_IS_LOW_SURROGATE(aIn[i + 1])) {
197 ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
198 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
199 aOut[i++] = H_SURROGATE(ch);
200 aOut[i] = L_SURROGATE(ch);
201 continue;
202 }
203 aOut[i] = ToUpperCase(ch);
204 }
205 }
206
207 uint32_t
208 ToTitleCase(uint32_t aChar)
209 {
210 if (IS_ASCII(aChar)) {
211 return ToUpperCase(aChar);
212 }
213
214 return mozilla::unicode::GetTitlecaseForLower(aChar);
215 }
216
217 int32_t
218 CaseInsensitiveCompare(const char16_t *a,
219 const char16_t *b,
220 uint32_t len)
221 {
222 NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
223
224 if (len) {
225 do {
226 uint32_t c1 = *a++;
227 uint32_t c2 = *b++;
228
229 // Unfortunately, we need to check for surrogates BEFORE we check
230 // for equality, because we could have identical high surrogates
231 // but non-identical characters, so we can't just skip them
232
233 // If c1 isn't a surrogate, we don't bother to check c2;
234 // in the case where it _is_ a surrogate, we're definitely going to get
235 // a mismatch, and don't need to interpret and lowercase it
236
237 if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
238 c1 = SURROGATE_TO_UCS4(c1, *a++);
239 if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
240 c2 = SURROGATE_TO_UCS4(c2, *b++);
241 }
242 // If c2 wasn't a surrogate, decrementing len means we'd stop
243 // short of the end of string b, but that doesn't actually matter
244 // because we're going to find a mismatch and return early
245 --len;
246 }
247
248 if (c1 != c2) {
249 c1 = ToLowerCase_inline(c1);
250 c2 = ToLowerCase_inline(c2);
251 if (c1 != c2) {
252 if (c1 < c2) {
253 return -1;
254 }
255 return 1;
256 }
257 }
258 } while (--len != 0);
259 }
260 return 0;
261 }
262
263 // Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext
264 // to the byte following the end of the sequence.
265 //
266 // If the sequence is invalid, or if computing the codepoint would take us off
267 // the end of the string (as marked by aEnd), returns -1 and does not set
268 // aNext. Note that this function doesn't check that aStr < aEnd -- it assumes
269 // you've done that already.
270 static MOZ_ALWAYS_INLINE uint32_t
271 GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
272 {
273 // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
274 // sign extend.
275 const unsigned char *str = (unsigned char*)aStr;
276
277 if (UTF8traits::isASCII(str[0])) {
278 // It's ASCII; just convert to lower-case and return it.
279 *aNext = aStr + 1;
280 return gASCIIToLower[*str];
281 }
282 if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
283 // It's a two-byte sequence, so it looks like
284 // 110XXXXX 10XXXXXX.
285 // This is definitely in the BMP, so we can store straightaway into a
286 // uint16_t.
287
288 uint16_t c;
289 c = (str[0] & 0x1F) << 6;
290 c += (str[1] & 0x3F);
291
292 // we don't go through ToLowerCase here, because we know this isn't
293 // an ASCII character so the ASCII fast-path there is useless
294 c = mozilla::unicode::GetLowercase(c);
295
296 *aNext = aStr + 2;
297 return c;
298 }
299 if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
300 // It's a three-byte sequence, so it looks like
301 // 1110XXXX 10XXXXXX 10XXXXXX.
302 // This will just barely fit into 16-bits, so store into a uint16_t.
303
304 uint16_t c;
305 c = (str[0] & 0x0F) << 12;
306 c += (str[1] & 0x3F) << 6;
307 c += (str[2] & 0x3F);
308
309 c = mozilla::unicode::GetLowercase(c);
310
311 *aNext = aStr + 3;
312 return c;
313 }
314 if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
315 // It's a four-byte sequence, so it looks like
316 // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
317
318 uint32_t c;
319 c = (str[0] & 0x07) << 18;
320 c += (str[1] & 0x3F) << 12;
321 c += (str[2] & 0x3F) << 6;
322 c += (str[3] & 0x3F);
323
324 c = mozilla::unicode::GetLowercase(c);
325
326 *aNext = aStr + 4;
327 return c;
328 }
329
330 // Hm, we don't understand this sequence.
331 return -1;
332 }
333
334 int32_t CaseInsensitiveCompare(const char *aLeft,
335 const char *aRight,
336 uint32_t aLeftBytes,
337 uint32_t aRightBytes)
338 {
339 const char *leftEnd = aLeft + aLeftBytes;
340 const char *rightEnd = aRight + aRightBytes;
341
342 while (aLeft < leftEnd && aRight < rightEnd) {
343 uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
344 if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
345 return -1;
346
347 uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
348 if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
349 return -1;
350
351 // Now leftChar and rightChar are lower-case, so we can compare them.
352 if (leftChar != rightChar) {
353 if (leftChar > rightChar)
354 return 1;
355 return -1;
356 }
357 }
358
359 // Make sure that if one string is longer than the other we return the
360 // correct result.
361 if (aLeft < leftEnd)
362 return 1;
363 if (aRight < rightEnd)
364 return -1;
365
366 return 0;
367 }
368
369 bool
370 CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
371 const char* aLeftEnd, const char* aRightEnd,
372 const char** aLeftNext, const char** aRightNext,
373 bool* aErr)
374 {
375 NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
376 NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
377 NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
378 NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
379 NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
380
381 uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
382 if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
383 *aErr = true;
384 return false;
385 }
386
387 uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
388 if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
389 *aErr = true;
390 return false;
391 }
392
393 // Can't have an error past this point.
394 *aErr = false;
395
396 return leftChar == rightChar;
397 }
398
399 namespace mozilla {
400
401 uint32_t
402 HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
403 {
404 uint32_t hash = 0;
405 const char* s = aUTF8;
406 const char* end = aUTF8 + aLength;
407
408 *aErr = false;
409
410 while (s < end)
411 {
412 uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
413 if (*aErr) {
414 return 0;
415 }
416
417 if (ucs4 < PLANE1_BASE) {
418 hash = AddToHash(hash, ucs4);
419 }
420 else {
421 hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
422 }
423 }
424
425 return hash;
426 }
427
428 } // namespace mozilla

mercurial