1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/xpcom/io/nsNativeCharsetUtils.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1011 @@ 1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.7 + 1.8 +#include "xpcom-private.h" 1.9 + 1.10 +//----------------------------------------------------------------------------- 1.11 +// XP_MACOSX or ANDROID 1.12 +//----------------------------------------------------------------------------- 1.13 +#if defined(XP_MACOSX) || defined(ANDROID) 1.14 + 1.15 +#include "nsAString.h" 1.16 +#include "nsReadableUtils.h" 1.17 +#include "nsString.h" 1.18 + 1.19 +nsresult 1.20 +NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) 1.21 +{ 1.22 + CopyUTF8toUTF16(input, output); 1.23 + return NS_OK; 1.24 +} 1.25 + 1.26 +nsresult 1.27 +NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) 1.28 +{ 1.29 + CopyUTF16toUTF8(input, output); 1.30 + return NS_OK; 1.31 +} 1.32 + 1.33 +void 1.34 +NS_StartupNativeCharsetUtils() 1.35 +{ 1.36 +} 1.37 + 1.38 +void 1.39 +NS_ShutdownNativeCharsetUtils() 1.40 +{ 1.41 +} 1.42 + 1.43 + 1.44 +//----------------------------------------------------------------------------- 1.45 +// XP_UNIX 1.46 +//----------------------------------------------------------------------------- 1.47 +#elif defined(XP_UNIX) 1.48 + 1.49 +#include <stdlib.h> // mbtowc, wctomb 1.50 +#include <locale.h> // setlocale 1.51 +#include "mozilla/Mutex.h" 1.52 +#include "nscore.h" 1.53 +#include "nsAString.h" 1.54 +#include "nsReadableUtils.h" 1.55 + 1.56 +using namespace mozilla; 1.57 + 1.58 +// 1.59 +// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux, 1.60 +// but that doesn't work for non-BMP characters whether we use '-fshort-wchar' 1.61 +// or not (see bug 206811 and 1.62 +// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use 1.63 +// iconv for all platforms where nltypes.h and nllanginfo.h are present 1.64 +// along with iconv. 1.65 +// 1.66 +#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET) 1.67 +#define USE_ICONV 1 1.68 +#else 1.69 +#define USE_STDCONV 1 1.70 +#endif 1.71 + 1.72 +static void 1.73 +isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft) 1.74 +{ 1.75 + while (*inputLeft && *outputLeft) { 1.76 + **output = (unsigned char) **input; 1.77 + (*input)++; 1.78 + (*inputLeft)--; 1.79 + (*output)++; 1.80 + (*outputLeft)--; 1.81 + } 1.82 +} 1.83 + 1.84 +static void 1.85 +utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft) 1.86 +{ 1.87 + while (*inputLeft && *outputLeft) { 1.88 + **output = (unsigned char) **input; 1.89 + (*input)++; 1.90 + (*inputLeft)--; 1.91 + (*output)++; 1.92 + (*outputLeft)--; 1.93 + } 1.94 +} 1.95 + 1.96 +//----------------------------------------------------------------------------- 1.97 +// conversion using iconv 1.98 +//----------------------------------------------------------------------------- 1.99 +#if defined(USE_ICONV) 1.100 +#include <nl_types.h> // CODESET 1.101 +#include <langinfo.h> // nl_langinfo 1.102 +#include <iconv.h> // iconv_open, iconv, iconv_close 1.103 +#include <errno.h> 1.104 +#include "plstr.h" 1.105 + 1.106 +#if defined(HAVE_ICONV_WITH_CONST_INPUT) 1.107 +#define ICONV_INPUT(x) (x) 1.108 +#else 1.109 +#define ICONV_INPUT(x) ((char **)x) 1.110 +#endif 1.111 + 1.112 +// solaris definitely needs this, but we'll enable it by default 1.113 +// just in case... but we know for sure that iconv(3) in glibc 1.114 +// doesn't need this. 1.115 +#if !defined(__GLIBC__) 1.116 +#define ENABLE_UTF8_FALLBACK_SUPPORT 1.117 +#endif 1.118 + 1.119 +#define INVALID_ICONV_T ((iconv_t) -1) 1.120 + 1.121 +static inline size_t 1.122 +xp_iconv(iconv_t converter, 1.123 + const char **input, 1.124 + size_t *inputLeft, 1.125 + char **output, 1.126 + size_t *outputLeft) 1.127 +{ 1.128 + size_t res, outputAvail = outputLeft ? *outputLeft : 0; 1.129 + res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft); 1.130 + if (res == (size_t) -1) { 1.131 + // on some platforms (e.g., linux) iconv will fail with 1.132 + // E2BIG if it cannot convert _all_ of its input. it'll 1.133 + // still adjust all of the in/out params correctly, so we 1.134 + // can ignore this error. the assumption is that we will 1.135 + // be called again to complete the conversion. 1.136 + if ((errno == E2BIG) && (*outputLeft < outputAvail)) 1.137 + res = 0; 1.138 + } 1.139 + return res; 1.140 +} 1.141 + 1.142 +static inline void 1.143 +xp_iconv_reset(iconv_t converter) 1.144 +{ 1.145 + // NOTE: the man pages on Solaris claim that you can pass nullptr 1.146 + // for all parameter to reset the converter, but beware the 1.147 + // evil Solaris crash if you go down this route >:-) 1.148 + 1.149 + const char *zero_char_in_ptr = nullptr; 1.150 + char *zero_char_out_ptr = nullptr; 1.151 + size_t zero_size_in = 0, 1.152 + zero_size_out = 0; 1.153 + 1.154 + xp_iconv(converter, &zero_char_in_ptr, 1.155 + &zero_size_in, 1.156 + &zero_char_out_ptr, 1.157 + &zero_size_out); 1.158 +} 1.159 + 1.160 +static inline iconv_t 1.161 +xp_iconv_open(const char **to_list, const char **from_list) 1.162 +{ 1.163 + iconv_t res; 1.164 + const char **from_name; 1.165 + const char **to_name; 1.166 + 1.167 + // try all possible combinations to locate a converter. 1.168 + to_name = to_list; 1.169 + while (*to_name) { 1.170 + if (**to_name) { 1.171 + from_name = from_list; 1.172 + while (*from_name) { 1.173 + if (**from_name) { 1.174 + res = iconv_open(*to_name, *from_name); 1.175 + if (res != INVALID_ICONV_T) 1.176 + return res; 1.177 + } 1.178 + from_name++; 1.179 + } 1.180 + } 1.181 + to_name++; 1.182 + } 1.183 + 1.184 + return INVALID_ICONV_T; 1.185 +} 1.186 + 1.187 +/* 1.188 + * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we 1.189 + * have to use UTF-16 with iconv(3) on platforms where it's supported. 1.190 + * However, the way UTF-16 and UCS-2 are interpreted varies across platforms 1.191 + * and implementations of iconv(3). On Tru64, it also depends on the environment 1.192 + * variable. To avoid the trouble arising from byte-swapping 1.193 + * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling 1.194 + * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 1.195 + * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness, 1.196 + * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE' 1.197 + * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment 1.198 + * variable ICONV_BYTEORDER is set to 'big-endian', about which not much 1.199 + * can be done other than adding a note in the release notes. (bug 206811) 1.200 + */ 1.201 +static const char *UTF_16_NAMES[] = { 1.202 +#if defined(IS_LITTLE_ENDIAN) 1.203 + "UTF-16LE", 1.204 +#if defined(__GLIBC__) 1.205 + "UNICODELITTLE", 1.206 +#endif 1.207 + "UCS-2LE", 1.208 +#else 1.209 + "UTF-16BE", 1.210 +#if defined(__GLIBC__) 1.211 + "UNICODEBIG", 1.212 +#endif 1.213 + "UCS-2BE", 1.214 +#endif 1.215 + "UTF-16", 1.216 + "UCS-2", 1.217 + "UCS2", 1.218 + "UCS_2", 1.219 + "ucs-2", 1.220 + "ucs2", 1.221 + "ucs_2", 1.222 + nullptr 1.223 +}; 1.224 + 1.225 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.226 +static const char *UTF_8_NAMES[] = { 1.227 + "UTF-8", 1.228 + "UTF8", 1.229 + "UTF_8", 1.230 + "utf-8", 1.231 + "utf8", 1.232 + "utf_8", 1.233 + nullptr 1.234 +}; 1.235 +#endif 1.236 + 1.237 +static const char *ISO_8859_1_NAMES[] = { 1.238 + "ISO-8859-1", 1.239 +#if !defined(__GLIBC__) 1.240 + "ISO8859-1", 1.241 + "ISO88591", 1.242 + "ISO_8859_1", 1.243 + "ISO8859_1", 1.244 + "iso-8859-1", 1.245 + "iso8859-1", 1.246 + "iso88591", 1.247 + "iso_8859_1", 1.248 + "iso8859_1", 1.249 +#endif 1.250 + nullptr 1.251 +}; 1.252 + 1.253 +class nsNativeCharsetConverter 1.254 +{ 1.255 +public: 1.256 + nsNativeCharsetConverter(); 1.257 + ~nsNativeCharsetConverter(); 1.258 + 1.259 + nsresult NativeToUnicode(const char **input , uint32_t *inputLeft, 1.260 + char16_t **output, uint32_t *outputLeft); 1.261 + nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft, 1.262 + char **output, uint32_t *outputLeft); 1.263 + 1.264 + static void GlobalInit(); 1.265 + static void GlobalShutdown(); 1.266 + static bool IsNativeUTF8(); 1.267 + 1.268 +private: 1.269 + static iconv_t gNativeToUnicode; 1.270 + static iconv_t gUnicodeToNative; 1.271 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.272 + static iconv_t gNativeToUTF8; 1.273 + static iconv_t gUTF8ToNative; 1.274 + static iconv_t gUnicodeToUTF8; 1.275 + static iconv_t gUTF8ToUnicode; 1.276 +#endif 1.277 + static Mutex *gLock; 1.278 + static bool gInitialized; 1.279 + static bool gIsNativeUTF8; 1.280 + 1.281 + static void LazyInit(); 1.282 + 1.283 + static void Lock() { if (gLock) gLock->Lock(); } 1.284 + static void Unlock() { if (gLock) gLock->Unlock(); } 1.285 +}; 1.286 + 1.287 +iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T; 1.288 +iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T; 1.289 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.290 +iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T; 1.291 +iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T; 1.292 +iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T; 1.293 +iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T; 1.294 +#endif 1.295 +Mutex *nsNativeCharsetConverter::gLock = nullptr; 1.296 +bool nsNativeCharsetConverter::gInitialized = false; 1.297 +bool nsNativeCharsetConverter::gIsNativeUTF8 = false; 1.298 + 1.299 +void 1.300 +nsNativeCharsetConverter::LazyInit() 1.301 +{ 1.302 + // LazyInit may be called before NS_StartupNativeCharsetUtils, but 1.303 + // the setlocale it does has to be called before nl_langinfo. Like in 1.304 + // NS_StartupNativeCharsetUtils, assume we are called early enough that 1.305 + // we are the first to care about the locale's charset. 1.306 + if (!gLock) 1.307 + setlocale(LC_CTYPE, ""); 1.308 + const char *blank_list[] = { "", nullptr }; 1.309 + const char **native_charset_list = blank_list; 1.310 + const char *native_charset = nl_langinfo(CODESET); 1.311 + if (native_charset == nullptr) { 1.312 + NS_ERROR("native charset is unknown"); 1.313 + // fallback to ISO-8859-1 1.314 + native_charset_list = ISO_8859_1_NAMES; 1.315 + } 1.316 + else 1.317 + native_charset_list[0] = native_charset; 1.318 + 1.319 + // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET) 1.320 + // return 'UTF-8' (or 'utf-8') 1.321 + if (!PL_strcasecmp(native_charset, "UTF-8")) 1.322 + gIsNativeUTF8 = true; 1.323 + 1.324 + gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list); 1.325 + gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES); 1.326 + 1.327 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.328 + if (gNativeToUnicode == INVALID_ICONV_T) { 1.329 + gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list); 1.330 + gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES); 1.331 + NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter"); 1.332 + NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter"); 1.333 + } 1.334 + if (gUnicodeToNative == INVALID_ICONV_T) { 1.335 + gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES); 1.336 + gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES); 1.337 + NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter"); 1.338 + NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter"); 1.339 + } 1.340 +#else 1.341 + NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter"); 1.342 + NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter"); 1.343 +#endif 1.344 + 1.345 + /* 1.346 + * On Solaris 8 (and newer?), the iconv modules converting to UCS-2 1.347 + * prepend a byte order mark unicode character (BOM, u+FEFF) during 1.348 + * the first use of the iconv converter. The same is the case of 1.349 + * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used. 1.350 + * However, we use 'UTF-16LE/BE' in both cases, instead so that we 1.351 + * should be safe. But just in case... 1.352 + * 1.353 + * This dummy conversion gets rid of the BOMs and fixes bug 153562. 1.354 + */ 1.355 + char dummy_input[1] = { ' ' }; 1.356 + char dummy_output[4]; 1.357 + 1.358 + if (gNativeToUnicode != INVALID_ICONV_T) { 1.359 + const char *input = dummy_input; 1.360 + size_t input_left = sizeof(dummy_input); 1.361 + char *output = dummy_output; 1.362 + size_t output_left = sizeof(dummy_output); 1.363 + 1.364 + xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left); 1.365 + } 1.366 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.367 + if (gUTF8ToUnicode != INVALID_ICONV_T) { 1.368 + const char *input = dummy_input; 1.369 + size_t input_left = sizeof(dummy_input); 1.370 + char *output = dummy_output; 1.371 + size_t output_left = sizeof(dummy_output); 1.372 + 1.373 + xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left); 1.374 + } 1.375 +#endif 1.376 + 1.377 + gInitialized = true; 1.378 +} 1.379 + 1.380 +void 1.381 +nsNativeCharsetConverter::GlobalInit() 1.382 +{ 1.383 + gLock = new Mutex("nsNativeCharsetConverter.gLock"); 1.384 +} 1.385 + 1.386 +void 1.387 +nsNativeCharsetConverter::GlobalShutdown() 1.388 +{ 1.389 + if (gLock) { 1.390 + delete gLock; 1.391 + gLock = nullptr; 1.392 + } 1.393 + 1.394 + if (gNativeToUnicode != INVALID_ICONV_T) { 1.395 + iconv_close(gNativeToUnicode); 1.396 + gNativeToUnicode = INVALID_ICONV_T; 1.397 + } 1.398 + 1.399 + if (gUnicodeToNative != INVALID_ICONV_T) { 1.400 + iconv_close(gUnicodeToNative); 1.401 + gUnicodeToNative = INVALID_ICONV_T; 1.402 + } 1.403 + 1.404 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.405 + if (gNativeToUTF8 != INVALID_ICONV_T) { 1.406 + iconv_close(gNativeToUTF8); 1.407 + gNativeToUTF8 = INVALID_ICONV_T; 1.408 + } 1.409 + if (gUTF8ToNative != INVALID_ICONV_T) { 1.410 + iconv_close(gUTF8ToNative); 1.411 + gUTF8ToNative = INVALID_ICONV_T; 1.412 + } 1.413 + if (gUnicodeToUTF8 != INVALID_ICONV_T) { 1.414 + iconv_close(gUnicodeToUTF8); 1.415 + gUnicodeToUTF8 = INVALID_ICONV_T; 1.416 + } 1.417 + if (gUTF8ToUnicode != INVALID_ICONV_T) { 1.418 + iconv_close(gUTF8ToUnicode); 1.419 + gUTF8ToUnicode = INVALID_ICONV_T; 1.420 + } 1.421 +#endif 1.422 + 1.423 + gInitialized = false; 1.424 +} 1.425 + 1.426 +nsNativeCharsetConverter::nsNativeCharsetConverter() 1.427 +{ 1.428 + Lock(); 1.429 + if (!gInitialized) 1.430 + LazyInit(); 1.431 +} 1.432 + 1.433 +nsNativeCharsetConverter::~nsNativeCharsetConverter() 1.434 +{ 1.435 + // reset converters for next time 1.436 + if (gNativeToUnicode != INVALID_ICONV_T) 1.437 + xp_iconv_reset(gNativeToUnicode); 1.438 + if (gUnicodeToNative != INVALID_ICONV_T) 1.439 + xp_iconv_reset(gUnicodeToNative); 1.440 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.441 + if (gNativeToUTF8 != INVALID_ICONV_T) 1.442 + xp_iconv_reset(gNativeToUTF8); 1.443 + if (gUTF8ToNative != INVALID_ICONV_T) 1.444 + xp_iconv_reset(gUTF8ToNative); 1.445 + if (gUnicodeToUTF8 != INVALID_ICONV_T) 1.446 + xp_iconv_reset(gUnicodeToUTF8); 1.447 + if (gUTF8ToUnicode != INVALID_ICONV_T) 1.448 + xp_iconv_reset(gUTF8ToUnicode); 1.449 +#endif 1.450 + Unlock(); 1.451 +} 1.452 + 1.453 +nsresult 1.454 +nsNativeCharsetConverter::NativeToUnicode(const char **input, 1.455 + uint32_t *inputLeft, 1.456 + char16_t **output, 1.457 + uint32_t *outputLeft) 1.458 +{ 1.459 + size_t res = 0; 1.460 + size_t inLeft = (size_t) *inputLeft; 1.461 + size_t outLeft = (size_t) *outputLeft * 2; 1.462 + 1.463 + if (gNativeToUnicode != INVALID_ICONV_T) { 1.464 + 1.465 + res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft); 1.466 + 1.467 + *inputLeft = inLeft; 1.468 + *outputLeft = outLeft / 2; 1.469 + if (res != (size_t) -1) 1.470 + return NS_OK; 1.471 + 1.472 + NS_WARNING("conversion from native to utf-16 failed"); 1.473 + 1.474 + // reset converter 1.475 + xp_iconv_reset(gNativeToUnicode); 1.476 + } 1.477 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.478 + else if ((gNativeToUTF8 != INVALID_ICONV_T) && 1.479 + (gUTF8ToUnicode != INVALID_ICONV_T)) { 1.480 + // convert first to UTF8, then from UTF8 to UCS2 1.481 + const char *in = *input; 1.482 + 1.483 + char ubuf[1024]; 1.484 + 1.485 + // we assume we're always called with enough space in |output|, 1.486 + // so convert many chars at a time... 1.487 + while (inLeft) { 1.488 + char *p = ubuf; 1.489 + size_t n = sizeof(ubuf); 1.490 + res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n); 1.491 + if (res == (size_t) -1) { 1.492 + NS_ERROR("conversion from native to utf-8 failed"); 1.493 + break; 1.494 + } 1.495 + NS_ASSERTION(outLeft > 0, "bad assumption"); 1.496 + p = ubuf; 1.497 + n = sizeof(ubuf) - n; 1.498 + res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft); 1.499 + if (res == (size_t) -1) { 1.500 + NS_ERROR("conversion from utf-8 to utf-16 failed"); 1.501 + break; 1.502 + } 1.503 + } 1.504 + 1.505 + (*input) += (*inputLeft - inLeft); 1.506 + *inputLeft = inLeft; 1.507 + *outputLeft = outLeft / 2; 1.508 + 1.509 + if (res != (size_t) -1) 1.510 + return NS_OK; 1.511 + 1.512 + // reset converters 1.513 + xp_iconv_reset(gNativeToUTF8); 1.514 + xp_iconv_reset(gUTF8ToUnicode); 1.515 + } 1.516 +#endif 1.517 + 1.518 + // fallback: zero-pad and hope for the best 1.519 + // XXX This is lame and we have to do better. 1.520 + isolatin1_to_utf16(input, inputLeft, output, outputLeft); 1.521 + 1.522 + return NS_OK; 1.523 +} 1.524 + 1.525 +nsresult 1.526 +nsNativeCharsetConverter::UnicodeToNative(const char16_t **input, 1.527 + uint32_t *inputLeft, 1.528 + char **output, 1.529 + uint32_t *outputLeft) 1.530 +{ 1.531 + size_t res = 0; 1.532 + size_t inLeft = (size_t) *inputLeft * 2; 1.533 + size_t outLeft = (size_t) *outputLeft; 1.534 + 1.535 + if (gUnicodeToNative != INVALID_ICONV_T) { 1.536 + res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft); 1.537 + 1.538 + *inputLeft = inLeft / 2; 1.539 + *outputLeft = outLeft; 1.540 + if (res != (size_t) -1) { 1.541 + return NS_OK; 1.542 + } 1.543 + 1.544 + NS_ERROR("iconv failed"); 1.545 + 1.546 + // reset converter 1.547 + xp_iconv_reset(gUnicodeToNative); 1.548 + } 1.549 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) 1.550 + else if ((gUnicodeToUTF8 != INVALID_ICONV_T) && 1.551 + (gUTF8ToNative != INVALID_ICONV_T)) { 1.552 + const char *in = (const char *) *input; 1.553 + 1.554 + char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes) 1.555 + 1.556 + // convert one uchar at a time... 1.557 + while (inLeft && outLeft) { 1.558 + char *p = ubuf; 1.559 + size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t); 1.560 + res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n); 1.561 + if (res == (size_t) -1) { 1.562 + NS_ERROR("conversion from utf-16 to utf-8 failed"); 1.563 + break; 1.564 + } 1.565 + p = ubuf; 1.566 + n = sizeof(ubuf) - n; 1.567 + res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft); 1.568 + if (res == (size_t) -1) { 1.569 + if (errno == E2BIG) { 1.570 + // not enough room for last uchar... back up and return. 1.571 + in -= sizeof(char16_t); 1.572 + res = 0; 1.573 + } 1.574 + else 1.575 + NS_ERROR("conversion from utf-8 to native failed"); 1.576 + break; 1.577 + } 1.578 + inLeft -= sizeof(char16_t); 1.579 + } 1.580 + 1.581 + (*input) += (*inputLeft - inLeft / 2); 1.582 + *inputLeft = inLeft / 2; 1.583 + *outputLeft = outLeft; 1.584 + if (res != (size_t) -1) { 1.585 + return NS_OK; 1.586 + } 1.587 + 1.588 + // reset converters 1.589 + xp_iconv_reset(gUnicodeToUTF8); 1.590 + xp_iconv_reset(gUTF8ToNative); 1.591 + } 1.592 +#endif 1.593 + 1.594 + // fallback: truncate and hope for the best 1.595 + // XXX This is lame and we have to do better. 1.596 + utf16_to_isolatin1(input, inputLeft, output, outputLeft); 1.597 + 1.598 + return NS_OK; 1.599 +} 1.600 + 1.601 +bool 1.602 +nsNativeCharsetConverter::IsNativeUTF8() 1.603 +{ 1.604 + if (!gInitialized) { 1.605 + Lock(); 1.606 + if (!gInitialized) 1.607 + LazyInit(); 1.608 + Unlock(); 1.609 + } 1.610 + return gIsNativeUTF8; 1.611 +} 1.612 + 1.613 +#endif // USE_ICONV 1.614 + 1.615 +//----------------------------------------------------------------------------- 1.616 +// conversion using mb[r]towc/wc[r]tomb 1.617 +//----------------------------------------------------------------------------- 1.618 +#if defined(USE_STDCONV) 1.619 +#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) 1.620 +#include <wchar.h> // mbrtowc, wcrtomb 1.621 +#endif 1.622 + 1.623 +class nsNativeCharsetConverter 1.624 +{ 1.625 +public: 1.626 + nsNativeCharsetConverter(); 1.627 + 1.628 + nsresult NativeToUnicode(const char **input , uint32_t *inputLeft, 1.629 + char16_t **output, uint32_t *outputLeft); 1.630 + nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft, 1.631 + char **output, uint32_t *outputLeft); 1.632 + 1.633 + static void GlobalInit(); 1.634 + static void GlobalShutdown() { } 1.635 + static bool IsNativeUTF8(); 1.636 + 1.637 +private: 1.638 + static bool gWCharIsUnicode; 1.639 + 1.640 +#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) 1.641 + mbstate_t ps; 1.642 +#endif 1.643 +}; 1.644 + 1.645 +bool nsNativeCharsetConverter::gWCharIsUnicode = false; 1.646 + 1.647 +nsNativeCharsetConverter::nsNativeCharsetConverter() 1.648 +{ 1.649 +#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) 1.650 + memset(&ps, 0, sizeof(ps)); 1.651 +#endif 1.652 +} 1.653 + 1.654 +void 1.655 +nsNativeCharsetConverter::GlobalInit() 1.656 +{ 1.657 + // verify that wchar_t for the current locale is actually unicode. 1.658 + // if it is not, then we should avoid calling mbtowc/wctomb and 1.659 + // just fallback on zero-pad/truncation conversion. 1.660 + // 1.661 + // this test cannot be done at build time because the encoding of 1.662 + // wchar_t may depend on the runtime locale. sad, but true!! 1.663 + // 1.664 + // so, if wchar_t is unicode then converting an ASCII character 1.665 + // to wchar_t should not change its numeric value. we'll just 1.666 + // check what happens with the ASCII 'a' character. 1.667 + // 1.668 + // this test is not perfect... obviously, it could yield false 1.669 + // positives, but then at least ASCII text would be converted 1.670 + // properly (or maybe just the 'a' character) -- oh well :( 1.671 + 1.672 + char a = 'a'; 1.673 + unsigned int w = 0; 1.674 + 1.675 + int res = mbtowc((wchar_t *) &w, &a, 1); 1.676 + 1.677 + gWCharIsUnicode = (res != -1 && w == 'a'); 1.678 + 1.679 +#ifdef DEBUG 1.680 + if (!gWCharIsUnicode) 1.681 + NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)"); 1.682 +#endif 1.683 +} 1.684 + 1.685 +nsresult 1.686 +nsNativeCharsetConverter::NativeToUnicode(const char **input, 1.687 + uint32_t *inputLeft, 1.688 + char16_t **output, 1.689 + uint32_t *outputLeft) 1.690 +{ 1.691 + if (gWCharIsUnicode) { 1.692 + int incr; 1.693 + 1.694 + // cannot use wchar_t here since it may have been redefined (e.g., 1.695 + // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP. 1.696 + unsigned int tmp = 0; 1.697 + while (*inputLeft && *outputLeft) { 1.698 +#ifdef HAVE_MBRTOWC 1.699 + incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps); 1.700 +#else 1.701 + // XXX is this thread-safe? 1.702 + incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft); 1.703 +#endif 1.704 + if (incr < 0) { 1.705 + NS_WARNING("mbtowc failed: possible charset mismatch"); 1.706 + // zero-pad and hope for the best 1.707 + tmp = (unsigned char) **input; 1.708 + incr = 1; 1.709 + } 1.710 + **output = (char16_t) tmp; 1.711 + (*input) += incr; 1.712 + (*inputLeft) -= incr; 1.713 + (*output)++; 1.714 + (*outputLeft)--; 1.715 + } 1.716 + } 1.717 + else { 1.718 + // wchar_t isn't unicode, so the best we can do is treat the 1.719 + // input as if it is isolatin1 :( 1.720 + isolatin1_to_utf16(input, inputLeft, output, outputLeft); 1.721 + } 1.722 + 1.723 + return NS_OK; 1.724 +} 1.725 + 1.726 +nsresult 1.727 +nsNativeCharsetConverter::UnicodeToNative(const char16_t **input, 1.728 + uint32_t *inputLeft, 1.729 + char **output, 1.730 + uint32_t *outputLeft) 1.731 +{ 1.732 + if (gWCharIsUnicode) { 1.733 + int incr; 1.734 + 1.735 + while (*inputLeft && *outputLeft >= MB_CUR_MAX) { 1.736 +#ifdef HAVE_WCRTOMB 1.737 + incr = (int) wcrtomb(*output, (wchar_t) **input, &ps); 1.738 +#else 1.739 + // XXX is this thread-safe? 1.740 + incr = (int) wctomb(*output, (wchar_t) **input); 1.741 +#endif 1.742 + if (incr < 0) { 1.743 + NS_WARNING("mbtowc failed: possible charset mismatch"); 1.744 + **output = (unsigned char) **input; // truncate 1.745 + incr = 1; 1.746 + } 1.747 + // most likely we're dead anyways if this assertion should fire 1.748 + NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string"); 1.749 + (*output) += incr; 1.750 + (*outputLeft) -= incr; 1.751 + (*input)++; 1.752 + (*inputLeft)--; 1.753 + } 1.754 + } 1.755 + else { 1.756 + // wchar_t isn't unicode, so the best we can do is treat the 1.757 + // input as if it is isolatin1 :( 1.758 + utf16_to_isolatin1(input, inputLeft, output, outputLeft); 1.759 + } 1.760 + 1.761 + return NS_OK; 1.762 +} 1.763 + 1.764 +// XXX : for now, return false 1.765 +bool 1.766 +nsNativeCharsetConverter::IsNativeUTF8() 1.767 +{ 1.768 + return false; 1.769 +} 1.770 + 1.771 +#endif // USE_STDCONV 1.772 + 1.773 +//----------------------------------------------------------------------------- 1.774 +// API implementation 1.775 +//----------------------------------------------------------------------------- 1.776 + 1.777 +nsresult 1.778 +NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) 1.779 +{ 1.780 + output.Truncate(); 1.781 + 1.782 + uint32_t inputLen = input.Length(); 1.783 + 1.784 + nsACString::const_iterator iter; 1.785 + input.BeginReading(iter); 1.786 + 1.787 + // 1.788 + // OPTIMIZATION: preallocate space for largest possible result; convert 1.789 + // directly into the result buffer to avoid intermediate buffer copy. 1.790 + // 1.791 + // this will generally result in a larger allocation, but that seems 1.792 + // better than an extra buffer copy. 1.793 + // 1.794 + if (!output.SetLength(inputLen, fallible_t())) 1.795 + return NS_ERROR_OUT_OF_MEMORY; 1.796 + nsAString::iterator out_iter; 1.797 + output.BeginWriting(out_iter); 1.798 + 1.799 + char16_t *result = out_iter.get(); 1.800 + uint32_t resultLeft = inputLen; 1.801 + 1.802 + const char *buf = iter.get(); 1.803 + uint32_t bufLeft = inputLen; 1.804 + 1.805 + nsNativeCharsetConverter conv; 1.806 + nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft); 1.807 + if (NS_SUCCEEDED(rv)) { 1.808 + NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer"); 1.809 + output.SetLength(inputLen - resultLeft); 1.810 + } 1.811 + return rv; 1.812 +} 1.813 + 1.814 +nsresult 1.815 +NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) 1.816 +{ 1.817 + output.Truncate(); 1.818 + 1.819 + nsAString::const_iterator iter, end; 1.820 + input.BeginReading(iter); 1.821 + input.EndReading(end); 1.822 + 1.823 + // cannot easily avoid intermediate buffer copy. 1.824 + char temp[4096]; 1.825 + 1.826 + nsNativeCharsetConverter conv; 1.827 + 1.828 + const char16_t *buf = iter.get(); 1.829 + uint32_t bufLeft = Distance(iter, end); 1.830 + while (bufLeft) { 1.831 + char *p = temp; 1.832 + uint32_t tempLeft = sizeof(temp); 1.833 + 1.834 + nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft); 1.835 + if (NS_FAILED(rv)) return rv; 1.836 + 1.837 + if (tempLeft < sizeof(temp)) 1.838 + output.Append(temp, sizeof(temp) - tempLeft); 1.839 + } 1.840 + return NS_OK; 1.841 +} 1.842 + 1.843 +bool 1.844 +NS_IsNativeUTF8() 1.845 +{ 1.846 + return nsNativeCharsetConverter::IsNativeUTF8(); 1.847 +} 1.848 + 1.849 +void 1.850 +NS_StartupNativeCharsetUtils() 1.851 +{ 1.852 + // 1.853 + // need to initialize the locale or else charset conversion will fail. 1.854 + // better not delay this in case some other component alters the locale 1.855 + // settings. 1.856 + // 1.857 + // XXX we assume that we are called early enough that we should 1.858 + // always be the first to care about the locale's charset. 1.859 + // 1.860 + setlocale(LC_CTYPE, ""); 1.861 + 1.862 + nsNativeCharsetConverter::GlobalInit(); 1.863 +} 1.864 + 1.865 +void 1.866 +NS_ShutdownNativeCharsetUtils() 1.867 +{ 1.868 + nsNativeCharsetConverter::GlobalShutdown(); 1.869 +} 1.870 + 1.871 +//----------------------------------------------------------------------------- 1.872 +// XP_WIN 1.873 +//----------------------------------------------------------------------------- 1.874 +#elif defined(XP_WIN) 1.875 + 1.876 +#include <windows.h> 1.877 +#include "nsString.h" 1.878 +#include "nsAString.h" 1.879 +#include "nsReadableUtils.h" 1.880 + 1.881 +using namespace mozilla; 1.882 + 1.883 +nsresult 1.884 +NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) 1.885 +{ 1.886 + uint32_t inputLen = input.Length(); 1.887 + 1.888 + nsACString::const_iterator iter; 1.889 + input.BeginReading(iter); 1.890 + 1.891 + const char *buf = iter.get(); 1.892 + 1.893 + // determine length of result 1.894 + uint32_t resultLen = 0; 1.895 + int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0); 1.896 + if (n > 0) 1.897 + resultLen += n; 1.898 + 1.899 + // allocate sufficient space 1.900 + if (!output.SetLength(resultLen, fallible_t())) 1.901 + return NS_ERROR_OUT_OF_MEMORY; 1.902 + if (resultLen > 0) { 1.903 + nsAString::iterator out_iter; 1.904 + output.BeginWriting(out_iter); 1.905 + 1.906 + char16_t *result = out_iter.get(); 1.907 + 1.908 + ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, wwc(result), resultLen); 1.909 + } 1.910 + return NS_OK; 1.911 +} 1.912 + 1.913 +nsresult 1.914 +NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) 1.915 +{ 1.916 + uint32_t inputLen = input.Length(); 1.917 + 1.918 + nsAString::const_iterator iter; 1.919 + input.BeginReading(iter); 1.920 + 1.921 + char16ptr_t buf = iter.get(); 1.922 + 1.923 + // determine length of result 1.924 + uint32_t resultLen = 0; 1.925 + 1.926 + int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0, 1.927 + nullptr, nullptr); 1.928 + if (n > 0) 1.929 + resultLen += n; 1.930 + 1.931 + // allocate sufficient space 1.932 + if (!output.SetLength(resultLen, fallible_t())) 1.933 + return NS_ERROR_OUT_OF_MEMORY; 1.934 + if (resultLen > 0) { 1.935 + nsACString::iterator out_iter; 1.936 + output.BeginWriting(out_iter); 1.937 + 1.938 + // default "defaultChar" is '?', which is an illegal character on windows 1.939 + // file system. That will cause file uncreatable. Change it to '_' 1.940 + const char defaultChar = '_'; 1.941 + 1.942 + char *result = out_iter.get(); 1.943 + 1.944 + ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen, 1.945 + &defaultChar, nullptr); 1.946 + } 1.947 + return NS_OK; 1.948 +} 1.949 + 1.950 +// moved from widget/windows/nsToolkit.cpp 1.951 +int32_t 1.952 +NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW) 1.953 +{ 1.954 + return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, wwc(aStrOutW), aBufferSize); 1.955 +} 1.956 + 1.957 +int32_t 1.958 +NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut, 1.959 + char *aStrOutA, const char *aDefault) 1.960 +{ 1.961 + if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0)) 1.962 + return 0; 1.963 + 1.964 + int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1, 1.965 + aStrOutA, aBufferSizeOut, 1.966 + aDefault, nullptr); 1.967 + 1.968 + if (!numCharsConverted) { 1.969 + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 1.970 + // Overflow, add missing null termination but return 0 1.971 + aStrOutA[aBufferSizeOut-1] = '\0'; 1.972 + } 1.973 + else { 1.974 + // Other error, clear string and return 0 1.975 + aStrOutA[0] = '\0'; 1.976 + } 1.977 + } 1.978 + else if (numCharsConverted < aBufferSizeOut) { 1.979 + // Add 2nd null (really necessary?) 1.980 + aStrOutA[numCharsConverted] = '\0'; 1.981 + } 1.982 + 1.983 + return numCharsConverted; 1.984 +} 1.985 + 1.986 +#else 1.987 + 1.988 +#include "nsReadableUtils.h" 1.989 + 1.990 +nsresult 1.991 +NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) 1.992 +{ 1.993 + CopyASCIItoUTF16(input, output); 1.994 + return NS_OK; 1.995 +} 1.996 + 1.997 +nsresult 1.998 +NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) 1.999 +{ 1.1000 + LossyCopyUTF16toASCII(input, output); 1.1001 + return NS_OK; 1.1002 +} 1.1003 + 1.1004 +void 1.1005 +NS_StartupNativeCharsetUtils() 1.1006 +{ 1.1007 +} 1.1008 + 1.1009 +void 1.1010 +NS_ShutdownNativeCharsetUtils() 1.1011 +{ 1.1012 +} 1.1013 + 1.1014 +#endif