|
1 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
4 |
|
5 #include "xpcom-private.h" |
|
6 |
|
7 //----------------------------------------------------------------------------- |
|
8 // XP_MACOSX or ANDROID |
|
9 //----------------------------------------------------------------------------- |
|
10 #if defined(XP_MACOSX) || defined(ANDROID) |
|
11 |
|
12 #include "nsAString.h" |
|
13 #include "nsReadableUtils.h" |
|
14 #include "nsString.h" |
|
15 |
|
16 nsresult |
|
17 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) |
|
18 { |
|
19 CopyUTF8toUTF16(input, output); |
|
20 return NS_OK; |
|
21 } |
|
22 |
|
23 nsresult |
|
24 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) |
|
25 { |
|
26 CopyUTF16toUTF8(input, output); |
|
27 return NS_OK; |
|
28 } |
|
29 |
|
/**
 * Startup hook for the native charset utilities.
 * Nothing needs to be set up in this configuration, so this is a no-op.
 */
void
NS_StartupNativeCharsetUtils()
{
    // intentionally empty
}
|
34 |
|
/**
 * Shutdown hook for the native charset utilities.
 * Nothing needs to be torn down in this configuration, so this is a no-op.
 */
void
NS_ShutdownNativeCharsetUtils()
{
    // intentionally empty
}
|
39 |
|
40 |
|
41 //----------------------------------------------------------------------------- |
|
42 // XP_UNIX |
|
43 //----------------------------------------------------------------------------- |
|
44 #elif defined(XP_UNIX) |
|
45 |
|
46 #include <stdlib.h> // mbtowc, wctomb |
|
47 #include <locale.h> // setlocale |
|
48 #include "mozilla/Mutex.h" |
|
49 #include "nscore.h" |
|
50 #include "nsAString.h" |
|
51 #include "nsReadableUtils.h" |
|
52 |
|
53 using namespace mozilla; |
|
54 |
|
55 // |
|
56 // choose a conversion library. we used to use mbrtowc/wcrtomb under Linux, |
|
57 // but that doesn't work for non-BMP characters whether we use '-fshort-wchar' |
|
58 // or not (see bug 206811 and |
|
59 // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use |
|
60 // iconv for all platforms where nl_types.h and langinfo.h are present |
|
61 // along with iconv. |
|
62 // |
|
63 #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET) |
|
64 #define USE_ICONV 1 |
|
65 #else |
|
66 #define USE_STDCONV 1 |
|
67 #endif |
|
68 |
|
/**
 * Widens bytes to UTF-16 code units one-for-one (each byte is zero-extended).
 *
 * Copies until either the input or the output is exhausted.  All four
 * in/out parameters are advanced/decremented to reflect what was consumed
 * and produced.
 *
 * @param input       in/out: cursor into the byte input.
 * @param inputLeft   in/out: number of input bytes remaining.
 * @param output      in/out: cursor into the UTF-16 output.
 * @param outputLeft  in/out: number of output code units remaining.
 */
static void
isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft)
{
    for (; *inputLeft != 0 && *outputLeft != 0; --*inputLeft, --*outputLeft) {
        // go through unsigned char so bytes >= 0x80 are not sign-extended
        const unsigned char latin1 = (unsigned char) **input;
        **output = latin1;
        ++*input;
        ++*output;
    }
}
|
80 |
|
/**
 * Narrows UTF-16 code units to bytes one-for-one by truncating each code
 * unit to its low 8 bits.
 *
 * Copies until either the input or the output is exhausted.  All four
 * in/out parameters are advanced/decremented to reflect what was consumed
 * and produced.
 *
 * @param input       in/out: cursor into the UTF-16 input.
 * @param inputLeft   in/out: number of input code units remaining.
 * @param output      in/out: cursor into the byte output.
 * @param outputLeft  in/out: number of output bytes remaining.
 */
static void
utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft)
{
    while (*inputLeft != 0 && *outputLeft != 0) {
        // lossy: anything above U+00FF keeps only its low byte
        *(*output)++ = (unsigned char) *(*input)++;
        --*inputLeft;
        --*outputLeft;
    }
}
|
92 |
|
93 //----------------------------------------------------------------------------- |
|
94 // conversion using iconv |
|
95 //----------------------------------------------------------------------------- |
|
96 #if defined(USE_ICONV) |
|
97 #include <nl_types.h> // CODESET |
|
98 #include <langinfo.h> // nl_langinfo |
|
99 #include <iconv.h> // iconv_open, iconv, iconv_close |
|
100 #include <errno.h> |
|
101 #include "plstr.h" |
|
102 |
|
103 #if defined(HAVE_ICONV_WITH_CONST_INPUT) |
|
104 #define ICONV_INPUT(x) (x) |
|
105 #else |
|
106 #define ICONV_INPUT(x) ((char **)x) |
|
107 #endif |
|
108 |
|
109 // solaris definitely needs this, but we'll enable it by default |
|
110 // just in case... but we know for sure that iconv(3) in glibc |
|
111 // doesn't need this. |
|
112 #if !defined(__GLIBC__) |
|
113 #define ENABLE_UTF8_FALLBACK_SUPPORT |
|
114 #endif |
|
115 |
|
116 #define INVALID_ICONV_T ((iconv_t) -1) |
|
117 |
|
118 static inline size_t |
|
119 xp_iconv(iconv_t converter, |
|
120 const char **input, |
|
121 size_t *inputLeft, |
|
122 char **output, |
|
123 size_t *outputLeft) |
|
124 { |
|
125 size_t res, outputAvail = outputLeft ? *outputLeft : 0; |
|
126 res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft); |
|
127 if (res == (size_t) -1) { |
|
128 // on some platforms (e.g., linux) iconv will fail with |
|
129 // E2BIG if it cannot convert _all_ of its input. it'll |
|
130 // still adjust all of the in/out params correctly, so we |
|
131 // can ignore this error. the assumption is that we will |
|
132 // be called again to complete the conversion. |
|
133 if ((errno == E2BIG) && (*outputLeft < outputAvail)) |
|
134 res = 0; |
|
135 } |
|
136 return res; |
|
137 } |
|
138 |
|
139 static inline void |
|
140 xp_iconv_reset(iconv_t converter) |
|
141 { |
|
142 // NOTE: the man pages on Solaris claim that you can pass nullptr |
|
143 // for all parameter to reset the converter, but beware the |
|
144 // evil Solaris crash if you go down this route >:-) |
|
145 |
|
146 const char *zero_char_in_ptr = nullptr; |
|
147 char *zero_char_out_ptr = nullptr; |
|
148 size_t zero_size_in = 0, |
|
149 zero_size_out = 0; |
|
150 |
|
151 xp_iconv(converter, &zero_char_in_ptr, |
|
152 &zero_size_in, |
|
153 &zero_char_out_ptr, |
|
154 &zero_size_out); |
|
155 } |
|
156 |
|
157 static inline iconv_t |
|
158 xp_iconv_open(const char **to_list, const char **from_list) |
|
159 { |
|
160 iconv_t res; |
|
161 const char **from_name; |
|
162 const char **to_name; |
|
163 |
|
164 // try all possible combinations to locate a converter. |
|
165 to_name = to_list; |
|
166 while (*to_name) { |
|
167 if (**to_name) { |
|
168 from_name = from_list; |
|
169 while (*from_name) { |
|
170 if (**from_name) { |
|
171 res = iconv_open(*to_name, *from_name); |
|
172 if (res != INVALID_ICONV_T) |
|
173 return res; |
|
174 } |
|
175 from_name++; |
|
176 } |
|
177 } |
|
178 to_name++; |
|
179 } |
|
180 |
|
181 return INVALID_ICONV_T; |
|
182 } |
|
183 |
|
184 /* |
|
185 * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we |
|
186 * have to use UTF-16 with iconv(3) on platforms where it's supported. |
|
187 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms |
|
188 * and implementations of iconv(3). On Tru64, it also depends on the environment |
|
189 * variable. To avoid the trouble arising from byte-swapping |
|
190 * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling |
|
191 * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 |
|
192 * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness, |
|
193 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE' |
|
194 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment |
|
195 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much |
|
196 * can be done other than adding a note in the release notes. (bug 206811) |
|
197 */ |
|
// Candidate iconv names for UTF-16 / UCS-2, in order of preference.
// Explicit-endianness names matching the build's byte order come first;
// the endianness-neutral spellings are tried last.  nullptr-terminated.
static const char *UTF_16_NAMES[] = {
#if defined(IS_LITTLE_ENDIAN)
    "UTF-16LE",
#if defined(__GLIBC__)
    "UNICODELITTLE",
#endif
    "UCS-2LE",
#else
    "UTF-16BE",
#if defined(__GLIBC__)
    "UNICODEBIG",
#endif
    "UCS-2BE",
#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
    "UCS_2",
    "ucs-2",
    "ucs2",
    "ucs_2",
    nullptr
};
|
221 |
|
222 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
// Candidate iconv names for UTF-8 (spelling variants), nullptr-terminated.
static const char *UTF_8_NAMES[] = {
    "UTF-8",
    "UTF8",
    "UTF_8",
    "utf-8",
    "utf8",
    "utf_8",
    nullptr
};
|
232 #endif |
|
233 |
|
// Candidate iconv names for ISO-8859-1, nullptr-terminated.  Only the
// canonical spelling is listed when building against glibc; the other
// spelling variants are compiled in elsewhere.
static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
#if !defined(__GLIBC__)
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
    "ISO8859_1",
    "iso-8859-1",
    "iso8859-1",
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
#endif
    nullptr
};
|
249 |
|
250 class nsNativeCharsetConverter |
|
251 { |
|
252 public: |
|
253 nsNativeCharsetConverter(); |
|
254 ~nsNativeCharsetConverter(); |
|
255 |
|
256 nsresult NativeToUnicode(const char **input , uint32_t *inputLeft, |
|
257 char16_t **output, uint32_t *outputLeft); |
|
258 nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft, |
|
259 char **output, uint32_t *outputLeft); |
|
260 |
|
261 static void GlobalInit(); |
|
262 static void GlobalShutdown(); |
|
263 static bool IsNativeUTF8(); |
|
264 |
|
265 private: |
|
266 static iconv_t gNativeToUnicode; |
|
267 static iconv_t gUnicodeToNative; |
|
268 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
269 static iconv_t gNativeToUTF8; |
|
270 static iconv_t gUTF8ToNative; |
|
271 static iconv_t gUnicodeToUTF8; |
|
272 static iconv_t gUTF8ToUnicode; |
|
273 #endif |
|
274 static Mutex *gLock; |
|
275 static bool gInitialized; |
|
276 static bool gIsNativeUTF8; |
|
277 |
|
278 static void LazyInit(); |
|
279 |
|
280 static void Lock() { if (gLock) gLock->Lock(); } |
|
281 static void Unlock() { if (gLock) gLock->Unlock(); } |
|
282 }; |
|
283 |
|
284 iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T; |
|
285 iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T; |
|
286 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
287 iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T; |
|
288 iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T; |
|
289 iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T; |
|
290 iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T; |
|
291 #endif |
|
292 Mutex *nsNativeCharsetConverter::gLock = nullptr; |
|
293 bool nsNativeCharsetConverter::gInitialized = false; |
|
294 bool nsNativeCharsetConverter::gIsNativeUTF8 = false; |
|
295 |
|
296 void |
|
297 nsNativeCharsetConverter::LazyInit() |
|
298 { |
|
299 // LazyInit may be called before NS_StartupNativeCharsetUtils, but |
|
300 // the setlocale it does has to be called before nl_langinfo. Like in |
|
301 // NS_StartupNativeCharsetUtils, assume we are called early enough that |
|
302 // we are the first to care about the locale's charset. |
|
303 if (!gLock) |
|
304 setlocale(LC_CTYPE, ""); |
|
305 const char *blank_list[] = { "", nullptr }; |
|
306 const char **native_charset_list = blank_list; |
|
307 const char *native_charset = nl_langinfo(CODESET); |
|
308 if (native_charset == nullptr) { |
|
309 NS_ERROR("native charset is unknown"); |
|
310 // fallback to ISO-8859-1 |
|
311 native_charset_list = ISO_8859_1_NAMES; |
|
312 } |
|
313 else |
|
314 native_charset_list[0] = native_charset; |
|
315 |
|
316 // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET) |
|
317 // return 'UTF-8' (or 'utf-8') |
|
318 if (!PL_strcasecmp(native_charset, "UTF-8")) |
|
319 gIsNativeUTF8 = true; |
|
320 |
|
321 gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list); |
|
322 gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES); |
|
323 |
|
324 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
325 if (gNativeToUnicode == INVALID_ICONV_T) { |
|
326 gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list); |
|
327 gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES); |
|
328 NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter"); |
|
329 NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter"); |
|
330 } |
|
331 if (gUnicodeToNative == INVALID_ICONV_T) { |
|
332 gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES); |
|
333 gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES); |
|
334 NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter"); |
|
335 NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter"); |
|
336 } |
|
337 #else |
|
338 NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter"); |
|
339 NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter"); |
|
340 #endif |
|
341 |
|
342 /* |
|
343 * On Solaris 8 (and newer?), the iconv modules converting to UCS-2 |
|
344 * prepend a byte order mark unicode character (BOM, u+FEFF) during |
|
345 * the first use of the iconv converter. The same is the case of |
|
346 * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used. |
|
347 * However, we use 'UTF-16LE/BE' in both cases, instead so that we |
|
348 * should be safe. But just in case... |
|
349 * |
|
350 * This dummy conversion gets rid of the BOMs and fixes bug 153562. |
|
351 */ |
|
352 char dummy_input[1] = { ' ' }; |
|
353 char dummy_output[4]; |
|
354 |
|
355 if (gNativeToUnicode != INVALID_ICONV_T) { |
|
356 const char *input = dummy_input; |
|
357 size_t input_left = sizeof(dummy_input); |
|
358 char *output = dummy_output; |
|
359 size_t output_left = sizeof(dummy_output); |
|
360 |
|
361 xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left); |
|
362 } |
|
363 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
364 if (gUTF8ToUnicode != INVALID_ICONV_T) { |
|
365 const char *input = dummy_input; |
|
366 size_t input_left = sizeof(dummy_input); |
|
367 char *output = dummy_output; |
|
368 size_t output_left = sizeof(dummy_output); |
|
369 |
|
370 xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left); |
|
371 } |
|
372 #endif |
|
373 |
|
374 gInitialized = true; |
|
375 } |
|
376 |
|
377 void |
|
378 nsNativeCharsetConverter::GlobalInit() |
|
379 { |
|
380 gLock = new Mutex("nsNativeCharsetConverter.gLock"); |
|
381 } |
|
382 |
|
383 void |
|
384 nsNativeCharsetConverter::GlobalShutdown() |
|
385 { |
|
386 if (gLock) { |
|
387 delete gLock; |
|
388 gLock = nullptr; |
|
389 } |
|
390 |
|
391 if (gNativeToUnicode != INVALID_ICONV_T) { |
|
392 iconv_close(gNativeToUnicode); |
|
393 gNativeToUnicode = INVALID_ICONV_T; |
|
394 } |
|
395 |
|
396 if (gUnicodeToNative != INVALID_ICONV_T) { |
|
397 iconv_close(gUnicodeToNative); |
|
398 gUnicodeToNative = INVALID_ICONV_T; |
|
399 } |
|
400 |
|
401 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
402 if (gNativeToUTF8 != INVALID_ICONV_T) { |
|
403 iconv_close(gNativeToUTF8); |
|
404 gNativeToUTF8 = INVALID_ICONV_T; |
|
405 } |
|
406 if (gUTF8ToNative != INVALID_ICONV_T) { |
|
407 iconv_close(gUTF8ToNative); |
|
408 gUTF8ToNative = INVALID_ICONV_T; |
|
409 } |
|
410 if (gUnicodeToUTF8 != INVALID_ICONV_T) { |
|
411 iconv_close(gUnicodeToUTF8); |
|
412 gUnicodeToUTF8 = INVALID_ICONV_T; |
|
413 } |
|
414 if (gUTF8ToUnicode != INVALID_ICONV_T) { |
|
415 iconv_close(gUTF8ToUnicode); |
|
416 gUTF8ToUnicode = INVALID_ICONV_T; |
|
417 } |
|
418 #endif |
|
419 |
|
420 gInitialized = false; |
|
421 } |
|
422 |
|
423 nsNativeCharsetConverter::nsNativeCharsetConverter() |
|
424 { |
|
425 Lock(); |
|
426 if (!gInitialized) |
|
427 LazyInit(); |
|
428 } |
|
429 |
|
430 nsNativeCharsetConverter::~nsNativeCharsetConverter() |
|
431 { |
|
432 // reset converters for next time |
|
433 if (gNativeToUnicode != INVALID_ICONV_T) |
|
434 xp_iconv_reset(gNativeToUnicode); |
|
435 if (gUnicodeToNative != INVALID_ICONV_T) |
|
436 xp_iconv_reset(gUnicodeToNative); |
|
437 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
438 if (gNativeToUTF8 != INVALID_ICONV_T) |
|
439 xp_iconv_reset(gNativeToUTF8); |
|
440 if (gUTF8ToNative != INVALID_ICONV_T) |
|
441 xp_iconv_reset(gUTF8ToNative); |
|
442 if (gUnicodeToUTF8 != INVALID_ICONV_T) |
|
443 xp_iconv_reset(gUnicodeToUTF8); |
|
444 if (gUTF8ToUnicode != INVALID_ICONV_T) |
|
445 xp_iconv_reset(gUTF8ToUnicode); |
|
446 #endif |
|
447 Unlock(); |
|
448 } |
|
449 |
|
450 nsresult |
|
451 nsNativeCharsetConverter::NativeToUnicode(const char **input, |
|
452 uint32_t *inputLeft, |
|
453 char16_t **output, |
|
454 uint32_t *outputLeft) |
|
455 { |
|
456 size_t res = 0; |
|
457 size_t inLeft = (size_t) *inputLeft; |
|
458 size_t outLeft = (size_t) *outputLeft * 2; |
|
459 |
|
460 if (gNativeToUnicode != INVALID_ICONV_T) { |
|
461 |
|
462 res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft); |
|
463 |
|
464 *inputLeft = inLeft; |
|
465 *outputLeft = outLeft / 2; |
|
466 if (res != (size_t) -1) |
|
467 return NS_OK; |
|
468 |
|
469 NS_WARNING("conversion from native to utf-16 failed"); |
|
470 |
|
471 // reset converter |
|
472 xp_iconv_reset(gNativeToUnicode); |
|
473 } |
|
474 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
475 else if ((gNativeToUTF8 != INVALID_ICONV_T) && |
|
476 (gUTF8ToUnicode != INVALID_ICONV_T)) { |
|
477 // convert first to UTF8, then from UTF8 to UCS2 |
|
478 const char *in = *input; |
|
479 |
|
480 char ubuf[1024]; |
|
481 |
|
482 // we assume we're always called with enough space in |output|, |
|
483 // so convert many chars at a time... |
|
484 while (inLeft) { |
|
485 char *p = ubuf; |
|
486 size_t n = sizeof(ubuf); |
|
487 res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n); |
|
488 if (res == (size_t) -1) { |
|
489 NS_ERROR("conversion from native to utf-8 failed"); |
|
490 break; |
|
491 } |
|
492 NS_ASSERTION(outLeft > 0, "bad assumption"); |
|
493 p = ubuf; |
|
494 n = sizeof(ubuf) - n; |
|
495 res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft); |
|
496 if (res == (size_t) -1) { |
|
497 NS_ERROR("conversion from utf-8 to utf-16 failed"); |
|
498 break; |
|
499 } |
|
500 } |
|
501 |
|
502 (*input) += (*inputLeft - inLeft); |
|
503 *inputLeft = inLeft; |
|
504 *outputLeft = outLeft / 2; |
|
505 |
|
506 if (res != (size_t) -1) |
|
507 return NS_OK; |
|
508 |
|
509 // reset converters |
|
510 xp_iconv_reset(gNativeToUTF8); |
|
511 xp_iconv_reset(gUTF8ToUnicode); |
|
512 } |
|
513 #endif |
|
514 |
|
515 // fallback: zero-pad and hope for the best |
|
516 // XXX This is lame and we have to do better. |
|
517 isolatin1_to_utf16(input, inputLeft, output, outputLeft); |
|
518 |
|
519 return NS_OK; |
|
520 } |
|
521 |
|
522 nsresult |
|
523 nsNativeCharsetConverter::UnicodeToNative(const char16_t **input, |
|
524 uint32_t *inputLeft, |
|
525 char **output, |
|
526 uint32_t *outputLeft) |
|
527 { |
|
528 size_t res = 0; |
|
529 size_t inLeft = (size_t) *inputLeft * 2; |
|
530 size_t outLeft = (size_t) *outputLeft; |
|
531 |
|
532 if (gUnicodeToNative != INVALID_ICONV_T) { |
|
533 res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft); |
|
534 |
|
535 *inputLeft = inLeft / 2; |
|
536 *outputLeft = outLeft; |
|
537 if (res != (size_t) -1) { |
|
538 return NS_OK; |
|
539 } |
|
540 |
|
541 NS_ERROR("iconv failed"); |
|
542 |
|
543 // reset converter |
|
544 xp_iconv_reset(gUnicodeToNative); |
|
545 } |
|
546 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) |
|
547 else if ((gUnicodeToUTF8 != INVALID_ICONV_T) && |
|
548 (gUTF8ToNative != INVALID_ICONV_T)) { |
|
549 const char *in = (const char *) *input; |
|
550 |
|
551 char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes) |
|
552 |
|
553 // convert one uchar at a time... |
|
554 while (inLeft && outLeft) { |
|
555 char *p = ubuf; |
|
556 size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t); |
|
557 res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n); |
|
558 if (res == (size_t) -1) { |
|
559 NS_ERROR("conversion from utf-16 to utf-8 failed"); |
|
560 break; |
|
561 } |
|
562 p = ubuf; |
|
563 n = sizeof(ubuf) - n; |
|
564 res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft); |
|
565 if (res == (size_t) -1) { |
|
566 if (errno == E2BIG) { |
|
567 // not enough room for last uchar... back up and return. |
|
568 in -= sizeof(char16_t); |
|
569 res = 0; |
|
570 } |
|
571 else |
|
572 NS_ERROR("conversion from utf-8 to native failed"); |
|
573 break; |
|
574 } |
|
575 inLeft -= sizeof(char16_t); |
|
576 } |
|
577 |
|
578 (*input) += (*inputLeft - inLeft / 2); |
|
579 *inputLeft = inLeft / 2; |
|
580 *outputLeft = outLeft; |
|
581 if (res != (size_t) -1) { |
|
582 return NS_OK; |
|
583 } |
|
584 |
|
585 // reset converters |
|
586 xp_iconv_reset(gUnicodeToUTF8); |
|
587 xp_iconv_reset(gUTF8ToNative); |
|
588 } |
|
589 #endif |
|
590 |
|
591 // fallback: truncate and hope for the best |
|
592 // XXX This is lame and we have to do better. |
|
593 utf16_to_isolatin1(input, inputLeft, output, outputLeft); |
|
594 |
|
595 return NS_OK; |
|
596 } |
|
597 |
|
598 bool |
|
599 nsNativeCharsetConverter::IsNativeUTF8() |
|
600 { |
|
601 if (!gInitialized) { |
|
602 Lock(); |
|
603 if (!gInitialized) |
|
604 LazyInit(); |
|
605 Unlock(); |
|
606 } |
|
607 return gIsNativeUTF8; |
|
608 } |
|
609 |
|
610 #endif // USE_ICONV |
|
611 |
|
612 //----------------------------------------------------------------------------- |
|
613 // conversion using mb[r]towc/wc[r]tomb |
|
614 //----------------------------------------------------------------------------- |
|
615 #if defined(USE_STDCONV) |
|
616 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) |
|
617 #include <wchar.h> // mbrtowc, wcrtomb |
|
618 #endif |
|
619 |
|
620 class nsNativeCharsetConverter |
|
621 { |
|
622 public: |
|
623 nsNativeCharsetConverter(); |
|
624 |
|
625 nsresult NativeToUnicode(const char **input , uint32_t *inputLeft, |
|
626 char16_t **output, uint32_t *outputLeft); |
|
627 nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft, |
|
628 char **output, uint32_t *outputLeft); |
|
629 |
|
630 static void GlobalInit(); |
|
631 static void GlobalShutdown() { } |
|
632 static bool IsNativeUTF8(); |
|
633 |
|
634 private: |
|
635 static bool gWCharIsUnicode; |
|
636 |
|
637 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) |
|
638 mbstate_t ps; |
|
639 #endif |
|
640 }; |
|
641 |
|
642 bool nsNativeCharsetConverter::gWCharIsUnicode = false; |
|
643 |
|
644 nsNativeCharsetConverter::nsNativeCharsetConverter() |
|
645 { |
|
646 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) |
|
647 memset(&ps, 0, sizeof(ps)); |
|
648 #endif |
|
649 } |
|
650 |
|
651 void |
|
652 nsNativeCharsetConverter::GlobalInit() |
|
653 { |
|
654 // verify that wchar_t for the current locale is actually unicode. |
|
655 // if it is not, then we should avoid calling mbtowc/wctomb and |
|
656 // just fallback on zero-pad/truncation conversion. |
|
657 // |
|
658 // this test cannot be done at build time because the encoding of |
|
659 // wchar_t may depend on the runtime locale. sad, but true!! |
|
660 // |
|
661 // so, if wchar_t is unicode then converting an ASCII character |
|
662 // to wchar_t should not change its numeric value. we'll just |
|
663 // check what happens with the ASCII 'a' character. |
|
664 // |
|
665 // this test is not perfect... obviously, it could yield false |
|
666 // positives, but then at least ASCII text would be converted |
|
667 // properly (or maybe just the 'a' character) -- oh well :( |
|
668 |
|
669 char a = 'a'; |
|
670 unsigned int w = 0; |
|
671 |
|
672 int res = mbtowc((wchar_t *) &w, &a, 1); |
|
673 |
|
674 gWCharIsUnicode = (res != -1 && w == 'a'); |
|
675 |
|
676 #ifdef DEBUG |
|
677 if (!gWCharIsUnicode) |
|
678 NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)"); |
|
679 #endif |
|
680 } |
|
681 |
|
682 nsresult |
|
683 nsNativeCharsetConverter::NativeToUnicode(const char **input, |
|
684 uint32_t *inputLeft, |
|
685 char16_t **output, |
|
686 uint32_t *outputLeft) |
|
687 { |
|
688 if (gWCharIsUnicode) { |
|
689 int incr; |
|
690 |
|
691 // cannot use wchar_t here since it may have been redefined (e.g., |
|
692 // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP. |
|
693 unsigned int tmp = 0; |
|
694 while (*inputLeft && *outputLeft) { |
|
695 #ifdef HAVE_MBRTOWC |
|
696 incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps); |
|
697 #else |
|
698 // XXX is this thread-safe? |
|
699 incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft); |
|
700 #endif |
|
701 if (incr < 0) { |
|
702 NS_WARNING("mbtowc failed: possible charset mismatch"); |
|
703 // zero-pad and hope for the best |
|
704 tmp = (unsigned char) **input; |
|
705 incr = 1; |
|
706 } |
|
707 **output = (char16_t) tmp; |
|
708 (*input) += incr; |
|
709 (*inputLeft) -= incr; |
|
710 (*output)++; |
|
711 (*outputLeft)--; |
|
712 } |
|
713 } |
|
714 else { |
|
715 // wchar_t isn't unicode, so the best we can do is treat the |
|
716 // input as if it is isolatin1 :( |
|
717 isolatin1_to_utf16(input, inputLeft, output, outputLeft); |
|
718 } |
|
719 |
|
720 return NS_OK; |
|
721 } |
|
722 |
|
723 nsresult |
|
724 nsNativeCharsetConverter::UnicodeToNative(const char16_t **input, |
|
725 uint32_t *inputLeft, |
|
726 char **output, |
|
727 uint32_t *outputLeft) |
|
728 { |
|
729 if (gWCharIsUnicode) { |
|
730 int incr; |
|
731 |
|
732 while (*inputLeft && *outputLeft >= MB_CUR_MAX) { |
|
733 #ifdef HAVE_WCRTOMB |
|
734 incr = (int) wcrtomb(*output, (wchar_t) **input, &ps); |
|
735 #else |
|
736 // XXX is this thread-safe? |
|
737 incr = (int) wctomb(*output, (wchar_t) **input); |
|
738 #endif |
|
739 if (incr < 0) { |
|
740 NS_WARNING("mbtowc failed: possible charset mismatch"); |
|
741 **output = (unsigned char) **input; // truncate |
|
742 incr = 1; |
|
743 } |
|
744 // most likely we're dead anyways if this assertion should fire |
|
745 NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string"); |
|
746 (*output) += incr; |
|
747 (*outputLeft) -= incr; |
|
748 (*input)++; |
|
749 (*inputLeft)--; |
|
750 } |
|
751 } |
|
752 else { |
|
753 // wchar_t isn't unicode, so the best we can do is treat the |
|
754 // input as if it is isolatin1 :( |
|
755 utf16_to_isolatin1(input, inputLeft, output, outputLeft); |
|
756 } |
|
757 |
|
758 return NS_OK; |
|
759 } |
|
760 |
|
761 // XXX : for now, return false |
|
762 bool |
|
763 nsNativeCharsetConverter::IsNativeUTF8() |
|
764 { |
|
765 return false; |
|
766 } |
|
767 |
|
768 #endif // USE_STDCONV |
|
769 |
|
770 //----------------------------------------------------------------------------- |
|
771 // API implementation |
|
772 //----------------------------------------------------------------------------- |
|
773 |
|
774 nsresult |
|
775 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) |
|
776 { |
|
777 output.Truncate(); |
|
778 |
|
779 uint32_t inputLen = input.Length(); |
|
780 |
|
781 nsACString::const_iterator iter; |
|
782 input.BeginReading(iter); |
|
783 |
|
784 // |
|
785 // OPTIMIZATION: preallocate space for largest possible result; convert |
|
786 // directly into the result buffer to avoid intermediate buffer copy. |
|
787 // |
|
788 // this will generally result in a larger allocation, but that seems |
|
789 // better than an extra buffer copy. |
|
790 // |
|
791 if (!output.SetLength(inputLen, fallible_t())) |
|
792 return NS_ERROR_OUT_OF_MEMORY; |
|
793 nsAString::iterator out_iter; |
|
794 output.BeginWriting(out_iter); |
|
795 |
|
796 char16_t *result = out_iter.get(); |
|
797 uint32_t resultLeft = inputLen; |
|
798 |
|
799 const char *buf = iter.get(); |
|
800 uint32_t bufLeft = inputLen; |
|
801 |
|
802 nsNativeCharsetConverter conv; |
|
803 nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft); |
|
804 if (NS_SUCCEEDED(rv)) { |
|
805 NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer"); |
|
806 output.SetLength(inputLen - resultLeft); |
|
807 } |
|
808 return rv; |
|
809 } |
|
810 |
|
811 nsresult |
|
812 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) |
|
813 { |
|
814 output.Truncate(); |
|
815 |
|
816 nsAString::const_iterator iter, end; |
|
817 input.BeginReading(iter); |
|
818 input.EndReading(end); |
|
819 |
|
820 // cannot easily avoid intermediate buffer copy. |
|
821 char temp[4096]; |
|
822 |
|
823 nsNativeCharsetConverter conv; |
|
824 |
|
825 const char16_t *buf = iter.get(); |
|
826 uint32_t bufLeft = Distance(iter, end); |
|
827 while (bufLeft) { |
|
828 char *p = temp; |
|
829 uint32_t tempLeft = sizeof(temp); |
|
830 |
|
831 nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft); |
|
832 if (NS_FAILED(rv)) return rv; |
|
833 |
|
834 if (tempLeft < sizeof(temp)) |
|
835 output.Append(temp, sizeof(temp) - tempLeft); |
|
836 } |
|
837 return NS_OK; |
|
838 } |
|
839 |
|
840 bool |
|
841 NS_IsNativeUTF8() |
|
842 { |
|
843 return nsNativeCharsetConverter::IsNativeUTF8(); |
|
844 } |
|
845 |
|
846 void |
|
847 NS_StartupNativeCharsetUtils() |
|
848 { |
|
849 // |
|
850 // need to initialize the locale or else charset conversion will fail. |
|
851 // better not delay this in case some other component alters the locale |
|
852 // settings. |
|
853 // |
|
854 // XXX we assume that we are called early enough that we should |
|
855 // always be the first to care about the locale's charset. |
|
856 // |
|
857 setlocale(LC_CTYPE, ""); |
|
858 |
|
859 nsNativeCharsetConverter::GlobalInit(); |
|
860 } |
|
861 |
|
862 void |
|
863 NS_ShutdownNativeCharsetUtils() |
|
864 { |
|
865 nsNativeCharsetConverter::GlobalShutdown(); |
|
866 } |
|
867 |
|
868 //----------------------------------------------------------------------------- |
|
869 // XP_WIN |
|
870 //----------------------------------------------------------------------------- |
|
871 #elif defined(XP_WIN) |
|
872 |
|
873 #include <windows.h> |
|
874 #include "nsString.h" |
|
875 #include "nsAString.h" |
|
876 #include "nsReadableUtils.h" |
|
877 |
|
878 using namespace mozilla; |
|
879 |
|
880 nsresult |
|
881 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) |
|
882 { |
|
883 uint32_t inputLen = input.Length(); |
|
884 |
|
885 nsACString::const_iterator iter; |
|
886 input.BeginReading(iter); |
|
887 |
|
888 const char *buf = iter.get(); |
|
889 |
|
890 // determine length of result |
|
891 uint32_t resultLen = 0; |
|
892 int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0); |
|
893 if (n > 0) |
|
894 resultLen += n; |
|
895 |
|
896 // allocate sufficient space |
|
897 if (!output.SetLength(resultLen, fallible_t())) |
|
898 return NS_ERROR_OUT_OF_MEMORY; |
|
899 if (resultLen > 0) { |
|
900 nsAString::iterator out_iter; |
|
901 output.BeginWriting(out_iter); |
|
902 |
|
903 char16_t *result = out_iter.get(); |
|
904 |
|
905 ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, wwc(result), resultLen); |
|
906 } |
|
907 return NS_OK; |
|
908 } |
|
909 |
|
910 nsresult |
|
911 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) |
|
912 { |
|
913 uint32_t inputLen = input.Length(); |
|
914 |
|
915 nsAString::const_iterator iter; |
|
916 input.BeginReading(iter); |
|
917 |
|
918 char16ptr_t buf = iter.get(); |
|
919 |
|
920 // determine length of result |
|
921 uint32_t resultLen = 0; |
|
922 |
|
923 int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0, |
|
924 nullptr, nullptr); |
|
925 if (n > 0) |
|
926 resultLen += n; |
|
927 |
|
928 // allocate sufficient space |
|
929 if (!output.SetLength(resultLen, fallible_t())) |
|
930 return NS_ERROR_OUT_OF_MEMORY; |
|
931 if (resultLen > 0) { |
|
932 nsACString::iterator out_iter; |
|
933 output.BeginWriting(out_iter); |
|
934 |
|
935 // default "defaultChar" is '?', which is an illegal character on windows |
|
936 // file system. That will cause file uncreatable. Change it to '_' |
|
937 const char defaultChar = '_'; |
|
938 |
|
939 char *result = out_iter.get(); |
|
940 |
|
941 ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen, |
|
942 &defaultChar, nullptr); |
|
943 } |
|
944 return NS_OK; |
|
945 } |
|
946 |
|
947 // moved from widget/windows/nsToolkit.cpp |
|
948 int32_t |
|
949 NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW) |
|
950 { |
|
951 return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, wwc(aStrOutW), aBufferSize); |
|
952 } |
|
953 |
|
954 int32_t |
|
955 NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut, |
|
956 char *aStrOutA, const char *aDefault) |
|
957 { |
|
958 if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0)) |
|
959 return 0; |
|
960 |
|
961 int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1, |
|
962 aStrOutA, aBufferSizeOut, |
|
963 aDefault, nullptr); |
|
964 |
|
965 if (!numCharsConverted) { |
|
966 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { |
|
967 // Overflow, add missing null termination but return 0 |
|
968 aStrOutA[aBufferSizeOut-1] = '\0'; |
|
969 } |
|
970 else { |
|
971 // Other error, clear string and return 0 |
|
972 aStrOutA[0] = '\0'; |
|
973 } |
|
974 } |
|
975 else if (numCharsConverted < aBufferSizeOut) { |
|
976 // Add 2nd null (really necessary?) |
|
977 aStrOutA[numCharsConverted] = '\0'; |
|
978 } |
|
979 |
|
980 return numCharsConverted; |
|
981 } |
|
982 |
|
983 #else |
|
984 |
|
985 #include "nsReadableUtils.h" |
|
986 |
|
987 nsresult |
|
988 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) |
|
989 { |
|
990 CopyASCIItoUTF16(input, output); |
|
991 return NS_OK; |
|
992 } |
|
993 |
|
994 nsresult |
|
995 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) |
|
996 { |
|
997 LossyCopyUTF16toASCII(input, output); |
|
998 return NS_OK; |
|
999 } |
|
1000 |
|
/**
 * Startup hook for the native charset utilities.
 * Nothing needs to be set up in this configuration, so this is a no-op.
 */
void
NS_StartupNativeCharsetUtils()
{
    // intentionally empty
}
|
1005 |
|
/**
 * Shutdown hook for the native charset utilities.
 * Nothing needs to be torn down in this configuration, so this is a no-op.
 */
void
NS_ShutdownNativeCharsetUtils()
{
    // intentionally empty
}
|
1010 |
|
1011 #endif |