xpcom/io/nsNativeCharsetUtils.cpp

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with disk avoidance as documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 4
michael@0 5 #include "xpcom-private.h"
michael@0 6
michael@0 7 //-----------------------------------------------------------------------------
michael@0 8 // XP_MACOSX or ANDROID
michael@0 9 //-----------------------------------------------------------------------------
michael@0 10 #if defined(XP_MACOSX) || defined(ANDROID)
michael@0 11
michael@0 12 #include "nsAString.h"
michael@0 13 #include "nsReadableUtils.h"
michael@0 14 #include "nsString.h"
michael@0 15
michael@0 16 nsresult
michael@0 17 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
michael@0 18 {
michael@0 19 CopyUTF8toUTF16(input, output);
michael@0 20 return NS_OK;
michael@0 21 }
michael@0 22
michael@0 23 nsresult
michael@0 24 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
michael@0 25 {
michael@0 26 CopyUTF16toUTF8(input, output);
michael@0 27 return NS_OK;
michael@0 28 }
michael@0 29
/**
 * XP_MACOSX / ANDROID variant: there is no converter state to set up, so
 * this is intentionally a no-op.
 */
void
NS_StartupNativeCharsetUtils()
{
    // nothing to initialize in this configuration
}
michael@0 34
/**
 * XP_MACOSX / ANDROID variant: there is no converter state to tear down, so
 * this is intentionally a no-op.
 */
void
NS_ShutdownNativeCharsetUtils()
{
    // nothing to release in this configuration
}
michael@0 39
michael@0 40
michael@0 41 //-----------------------------------------------------------------------------
michael@0 42 // XP_UNIX
michael@0 43 //-----------------------------------------------------------------------------
michael@0 44 #elif defined(XP_UNIX)
michael@0 45
michael@0 46 #include <stdlib.h> // mbtowc, wctomb
michael@0 47 #include <locale.h> // setlocale
michael@0 48 #include "mozilla/Mutex.h"
michael@0 49 #include "nscore.h"
michael@0 50 #include "nsAString.h"
michael@0 51 #include "nsReadableUtils.h"
michael@0 52
michael@0 53 using namespace mozilla;
michael@0 54
//
// choose a conversion library.  we used to use mbrtowc/wcrtomb under Linux,
// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
// or not (see bug 206811 and
// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com).  we now use
// iconv on all platforms where <nl_types.h> and nl_langinfo(CODESET) are
// available along with iconv.
//
michael@0 63 #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
michael@0 64 #define USE_ICONV 1
michael@0 65 #else
michael@0 66 #define USE_STDCONV 1
michael@0 67 #endif
michael@0 68
/**
 * Widen bytes to UTF-16 code units one-for-one.  Each input byte is read as
 * an unsigned value, so bytes >= 0x80 map to U+0080..U+00FF (ISO-8859-1).
 *
 * Runs until either the input or the output count reaches zero.  All four
 * in/out parameters are updated to reflect what was consumed and produced.
 */
static void
isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft)
{
    for (; *inputLeft != 0 && *outputLeft != 0; --*inputLeft, --*outputLeft) {
        // go through unsigned char so high bytes are not sign-extended
        const unsigned char byte = (unsigned char) *(*input)++;
        *(*output)++ = byte;
    }
}
michael@0 80
/**
 * Narrow UTF-16 code units to bytes one-for-one by keeping only the low
 * eight bits of each unit (lossy for anything above U+00FF).
 *
 * Runs until either the input or the output count reaches zero.  All four
 * in/out parameters are updated to reflect what was consumed and produced.
 */
static void
utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft)
{
    for (; *inputLeft != 0 && *outputLeft != 0; --*inputLeft, --*outputLeft) {
        // truncate to the low byte
        const unsigned char low = (unsigned char) *(*input)++;
        *(*output)++ = low;
    }
}
michael@0 92
michael@0 93 //-----------------------------------------------------------------------------
michael@0 94 // conversion using iconv
michael@0 95 //-----------------------------------------------------------------------------
michael@0 96 #if defined(USE_ICONV)
michael@0 97 #include <nl_types.h> // CODESET
michael@0 98 #include <langinfo.h> // nl_langinfo
michael@0 99 #include <iconv.h> // iconv_open, iconv, iconv_close
michael@0 100 #include <errno.h>
michael@0 101 #include "plstr.h"
michael@0 102
// ICONV_INPUT adapts the |const char **| input argument used by xp_iconv to
// whatever the platform's iconv() prototype expects: passed through unchanged
// when HAVE_ICONV_WITH_CONST_INPUT is defined, otherwise cast to |char **|.
// The macro argument is parenthesized so that the cast applies to the whole
// argument expression rather than only to its first operand.
#if defined(HAVE_ICONV_WITH_CONST_INPUT)
#define ICONV_INPUT(x) (x)
#else
#define ICONV_INPUT(x) ((char **)(x))
#endif
michael@0 108
michael@0 109 // solaris definitely needs this, but we'll enable it by default
michael@0 110 // just in case... but we know for sure that iconv(3) in glibc
michael@0 111 // doesn't need this.
michael@0 112 #if !defined(__GLIBC__)
michael@0 113 #define ENABLE_UTF8_FALLBACK_SUPPORT
michael@0 114 #endif
michael@0 115
michael@0 116 #define INVALID_ICONV_T ((iconv_t) -1)
michael@0 117
michael@0 118 static inline size_t
michael@0 119 xp_iconv(iconv_t converter,
michael@0 120 const char **input,
michael@0 121 size_t *inputLeft,
michael@0 122 char **output,
michael@0 123 size_t *outputLeft)
michael@0 124 {
michael@0 125 size_t res, outputAvail = outputLeft ? *outputLeft : 0;
michael@0 126 res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
michael@0 127 if (res == (size_t) -1) {
michael@0 128 // on some platforms (e.g., linux) iconv will fail with
michael@0 129 // E2BIG if it cannot convert _all_ of its input. it'll
michael@0 130 // still adjust all of the in/out params correctly, so we
michael@0 131 // can ignore this error. the assumption is that we will
michael@0 132 // be called again to complete the conversion.
michael@0 133 if ((errno == E2BIG) && (*outputLeft < outputAvail))
michael@0 134 res = 0;
michael@0 135 }
michael@0 136 return res;
michael@0 137 }
michael@0 138
michael@0 139 static inline void
michael@0 140 xp_iconv_reset(iconv_t converter)
michael@0 141 {
michael@0 142 // NOTE: the man pages on Solaris claim that you can pass nullptr
michael@0 143 // for all parameter to reset the converter, but beware the
michael@0 144 // evil Solaris crash if you go down this route >:-)
michael@0 145
michael@0 146 const char *zero_char_in_ptr = nullptr;
michael@0 147 char *zero_char_out_ptr = nullptr;
michael@0 148 size_t zero_size_in = 0,
michael@0 149 zero_size_out = 0;
michael@0 150
michael@0 151 xp_iconv(converter, &zero_char_in_ptr,
michael@0 152 &zero_size_in,
michael@0 153 &zero_char_out_ptr,
michael@0 154 &zero_size_out);
michael@0 155 }
michael@0 156
michael@0 157 static inline iconv_t
michael@0 158 xp_iconv_open(const char **to_list, const char **from_list)
michael@0 159 {
michael@0 160 iconv_t res;
michael@0 161 const char **from_name;
michael@0 162 const char **to_name;
michael@0 163
michael@0 164 // try all possible combinations to locate a converter.
michael@0 165 to_name = to_list;
michael@0 166 while (*to_name) {
michael@0 167 if (**to_name) {
michael@0 168 from_name = from_list;
michael@0 169 while (*from_name) {
michael@0 170 if (**from_name) {
michael@0 171 res = iconv_open(*to_name, *from_name);
michael@0 172 if (res != INVALID_ICONV_T)
michael@0 173 return res;
michael@0 174 }
michael@0 175 from_name++;
michael@0 176 }
michael@0 177 }
michael@0 178 to_name++;
michael@0 179 }
michael@0 180
michael@0 181 return INVALID_ICONV_T;
michael@0 182 }
michael@0 183
michael@0 184 /*
michael@0 185 * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
michael@0 186 * have to use UTF-16 with iconv(3) on platforms where it's supported.
michael@0 187 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
michael@0 188 * and implementations of iconv(3). On Tru64, it also depends on the environment
michael@0 189 * variable. To avoid the trouble arising from byte-swapping
michael@0 190 * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
michael@0 191 * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
michael@0 192 * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
michael@0 193 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
michael@0 194 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
michael@0 195 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
michael@0 196 * can be done other than adding a note in the release notes. (bug 206811)
michael@0 197 */
// Candidate iconv encoding names for UTF-16, in the order xp_iconv_open
// tries them.  The entries matching the build's endianness come first
// (IS_LITTLE_ENDIAN selects the LE set, otherwise the BE set), followed by
// the endian-neutral spellings.  The list is nullptr-terminated.
static const char *UTF_16_NAMES[] = {
#if defined(IS_LITTLE_ENDIAN)
    "UTF-16LE",
#if defined(__GLIBC__)
    "UNICODELITTLE",
#endif
    "UCS-2LE",
#else
    "UTF-16BE",
#if defined(__GLIBC__)
    "UNICODEBIG",
#endif
    "UCS-2BE",
#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
    "UCS_2",
    "ucs-2",
    "ucs2",
    "ucs_2",
    nullptr
};
michael@0 221
// Candidate iconv encoding names for UTF-8 (nullptr-terminated).  Only
// compiled when ENABLE_UTF8_FALLBACK_SUPPORT is defined, where it is used
// to open the two-step native <-> UTF-8 <-> UTF-16 converters.
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
static const char *UTF_8_NAMES[] = {
    "UTF-8",
    "UTF8",
    "UTF_8",
    "utf-8",
    "utf8",
    "utf_8",
    nullptr
};
#endif
michael@0 233
// Candidate iconv encoding names for ISO-8859-1 (nullptr-terminated).  Used
// as the native charset list when nl_langinfo(CODESET) returns null.  On
// glibc only the first spelling is listed; other platforms get the extra
// variants as well.
static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
#if !defined(__GLIBC__)
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
    "ISO8859_1",
    "iso-8859-1",
    "iso8859-1",
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
#endif
    nullptr
};
michael@0 249
michael@0 250 class nsNativeCharsetConverter
michael@0 251 {
michael@0 252 public:
michael@0 253 nsNativeCharsetConverter();
michael@0 254 ~nsNativeCharsetConverter();
michael@0 255
michael@0 256 nsresult NativeToUnicode(const char **input , uint32_t *inputLeft,
michael@0 257 char16_t **output, uint32_t *outputLeft);
michael@0 258 nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
michael@0 259 char **output, uint32_t *outputLeft);
michael@0 260
michael@0 261 static void GlobalInit();
michael@0 262 static void GlobalShutdown();
michael@0 263 static bool IsNativeUTF8();
michael@0 264
michael@0 265 private:
michael@0 266 static iconv_t gNativeToUnicode;
michael@0 267 static iconv_t gUnicodeToNative;
michael@0 268 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0 269 static iconv_t gNativeToUTF8;
michael@0 270 static iconv_t gUTF8ToNative;
michael@0 271 static iconv_t gUnicodeToUTF8;
michael@0 272 static iconv_t gUTF8ToUnicode;
michael@0 273 #endif
michael@0 274 static Mutex *gLock;
michael@0 275 static bool gInitialized;
michael@0 276 static bool gIsNativeUTF8;
michael@0 277
michael@0 278 static void LazyInit();
michael@0 279
michael@0 280 static void Lock() { if (gLock) gLock->Lock(); }
michael@0 281 static void Unlock() { if (gLock) gLock->Unlock(); }
michael@0 282 };
michael@0 283
michael@0 284 iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
michael@0 285 iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
michael@0 286 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0 287 iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
michael@0 288 iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
michael@0 289 iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
michael@0 290 iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
michael@0 291 #endif
michael@0 292 Mutex *nsNativeCharsetConverter::gLock = nullptr;
michael@0 293 bool nsNativeCharsetConverter::gInitialized = false;
michael@0 294 bool nsNativeCharsetConverter::gIsNativeUTF8 = false;
michael@0 295
michael@0 296 void
michael@0 297 nsNativeCharsetConverter::LazyInit()
michael@0 298 {
michael@0 299 // LazyInit may be called before NS_StartupNativeCharsetUtils, but
michael@0 300 // the setlocale it does has to be called before nl_langinfo. Like in
michael@0 301 // NS_StartupNativeCharsetUtils, assume we are called early enough that
michael@0 302 // we are the first to care about the locale's charset.
michael@0 303 if (!gLock)
michael@0 304 setlocale(LC_CTYPE, "");
michael@0 305 const char *blank_list[] = { "", nullptr };
michael@0 306 const char **native_charset_list = blank_list;
michael@0 307 const char *native_charset = nl_langinfo(CODESET);
michael@0 308 if (native_charset == nullptr) {
michael@0 309 NS_ERROR("native charset is unknown");
michael@0 310 // fallback to ISO-8859-1
michael@0 311 native_charset_list = ISO_8859_1_NAMES;
michael@0 312 }
michael@0 313 else
michael@0 314 native_charset_list[0] = native_charset;
michael@0 315
michael@0 316 // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET)
michael@0 317 // return 'UTF-8' (or 'utf-8')
michael@0 318 if (!PL_strcasecmp(native_charset, "UTF-8"))
michael@0 319 gIsNativeUTF8 = true;
michael@0 320
michael@0 321 gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
michael@0 322 gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
michael@0 323
michael@0 324 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0 325 if (gNativeToUnicode == INVALID_ICONV_T) {
michael@0 326 gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
michael@0 327 gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
michael@0 328 NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
michael@0 329 NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
michael@0 330 }
michael@0 331 if (gUnicodeToNative == INVALID_ICONV_T) {
michael@0 332 gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
michael@0 333 gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
michael@0 334 NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
michael@0 335 NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
michael@0 336 }
michael@0 337 #else
michael@0 338 NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
michael@0 339 NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
michael@0 340 #endif
michael@0 341
michael@0 342 /*
michael@0 343 * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
michael@0 344 * prepend a byte order mark unicode character (BOM, u+FEFF) during
michael@0 345 * the first use of the iconv converter. The same is the case of
michael@0 346 * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
michael@0 347 * However, we use 'UTF-16LE/BE' in both cases, instead so that we
michael@0 348 * should be safe. But just in case...
michael@0 349 *
michael@0 350 * This dummy conversion gets rid of the BOMs and fixes bug 153562.
michael@0 351 */
michael@0 352 char dummy_input[1] = { ' ' };
michael@0 353 char dummy_output[4];
michael@0 354
michael@0 355 if (gNativeToUnicode != INVALID_ICONV_T) {
michael@0 356 const char *input = dummy_input;
michael@0 357 size_t input_left = sizeof(dummy_input);
michael@0 358 char *output = dummy_output;
michael@0 359 size_t output_left = sizeof(dummy_output);
michael@0 360
michael@0 361 xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
michael@0 362 }
michael@0 363 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0 364 if (gUTF8ToUnicode != INVALID_ICONV_T) {
michael@0 365 const char *input = dummy_input;
michael@0 366 size_t input_left = sizeof(dummy_input);
michael@0 367 char *output = dummy_output;
michael@0 368 size_t output_left = sizeof(dummy_output);
michael@0 369
michael@0 370 xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
michael@0 371 }
michael@0 372 #endif
michael@0 373
michael@0 374 gInitialized = true;
michael@0 375 }
michael@0 376
michael@0 377 void
michael@0 378 nsNativeCharsetConverter::GlobalInit()
michael@0 379 {
michael@0 380 gLock = new Mutex("nsNativeCharsetConverter.gLock");
michael@0 381 }
michael@0 382
michael@0 383 void
michael@0 384 nsNativeCharsetConverter::GlobalShutdown()
michael@0 385 {
michael@0 386 if (gLock) {
michael@0 387 delete gLock;
michael@0 388 gLock = nullptr;
michael@0 389 }
michael@0 390
michael@0 391 if (gNativeToUnicode != INVALID_ICONV_T) {
michael@0 392 iconv_close(gNativeToUnicode);
michael@0 393 gNativeToUnicode = INVALID_ICONV_T;
michael@0 394 }
michael@0 395
michael@0 396 if (gUnicodeToNative != INVALID_ICONV_T) {
michael@0 397 iconv_close(gUnicodeToNative);
michael@0 398 gUnicodeToNative = INVALID_ICONV_T;
michael@0 399 }
michael@0 400
michael@0 401 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0 402 if (gNativeToUTF8 != INVALID_ICONV_T) {
michael@0 403 iconv_close(gNativeToUTF8);
michael@0 404 gNativeToUTF8 = INVALID_ICONV_T;
michael@0 405 }
michael@0 406 if (gUTF8ToNative != INVALID_ICONV_T) {
michael@0 407 iconv_close(gUTF8ToNative);
michael@0 408 gUTF8ToNative = INVALID_ICONV_T;
michael@0 409 }
michael@0 410 if (gUnicodeToUTF8 != INVALID_ICONV_T) {
michael@0 411 iconv_close(gUnicodeToUTF8);
michael@0 412 gUnicodeToUTF8 = INVALID_ICONV_T;
michael@0 413 }
michael@0 414 if (gUTF8ToUnicode != INVALID_ICONV_T) {
michael@0 415 iconv_close(gUTF8ToUnicode);
michael@0 416 gUTF8ToUnicode = INVALID_ICONV_T;
michael@0 417 }
michael@0 418 #endif
michael@0 419
michael@0 420 gInitialized = false;
michael@0 421 }
michael@0 422
michael@0 423 nsNativeCharsetConverter::nsNativeCharsetConverter()
michael@0 424 {
michael@0 425 Lock();
michael@0 426 if (!gInitialized)
michael@0 427 LazyInit();
michael@0 428 }
michael@0 429
michael@0 430 nsNativeCharsetConverter::~nsNativeCharsetConverter()
michael@0 431 {
michael@0 432 // reset converters for next time
michael@0 433 if (gNativeToUnicode != INVALID_ICONV_T)
michael@0 434 xp_iconv_reset(gNativeToUnicode);
michael@0 435 if (gUnicodeToNative != INVALID_ICONV_T)
michael@0 436 xp_iconv_reset(gUnicodeToNative);
michael@0 437 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0 438 if (gNativeToUTF8 != INVALID_ICONV_T)
michael@0 439 xp_iconv_reset(gNativeToUTF8);
michael@0 440 if (gUTF8ToNative != INVALID_ICONV_T)
michael@0 441 xp_iconv_reset(gUTF8ToNative);
michael@0 442 if (gUnicodeToUTF8 != INVALID_ICONV_T)
michael@0 443 xp_iconv_reset(gUnicodeToUTF8);
michael@0 444 if (gUTF8ToUnicode != INVALID_ICONV_T)
michael@0 445 xp_iconv_reset(gUTF8ToUnicode);
michael@0 446 #endif
michael@0 447 Unlock();
michael@0 448 }
michael@0 449
/**
 * Convert native-charset bytes to UTF-16.
 *
 * @param input       in/out: read position in the native buffer
 * @param inputLeft   in/out: bytes remaining at *input
 * @param output      in/out: write position in the UTF-16 buffer
 * @param outputLeft  in/out: char16_t units remaining at *output
 * @return always NS_OK; when the iconv route fails the remaining input is
 *         converted byte-for-byte by isolatin1_to_utf16 instead.
 *
 * xp_iconv is given byte counts, so the char16_t count is doubled on the way
 * in and halved on the way out.
 */
nsresult
nsNativeCharsetConverter::NativeToUnicode(const char **input,
                                          uint32_t *inputLeft,
                                          char16_t **output,
                                          uint32_t *outputLeft)
{
    size_t res = 0;
    size_t inLeft = (size_t) *inputLeft;
    size_t outLeft = (size_t) *outputLeft * 2;

    // preferred route: a direct native -> UTF-16 converter
    if (gNativeToUnicode != INVALID_ICONV_T) {

        res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);

        // publish progress even on failure so the fallback below resumes
        // from wherever iconv stopped
        *inputLeft = inLeft;
        *outputLeft = outLeft / 2;
        if (res != (size_t) -1)
            return NS_OK;

        NS_WARNING("conversion from native to utf-16 failed");

        // reset converter
        xp_iconv_reset(gNativeToUnicode);
    }
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
    else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
             (gUTF8ToUnicode != INVALID_ICONV_T)) {
        // convert first to UTF8, then from UTF8 to UCS2
        const char *in = *input;

        // staging buffer for the intermediate UTF-8
        char ubuf[1024];

        // we assume we're always called with enough space in |output|,
        // so convert many chars at a time...
        while (inLeft) {
            char *p = ubuf;
            size_t n = sizeof(ubuf);
            res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
            if (res == (size_t) -1) {
                NS_ERROR("conversion from native to utf-8 failed");
                break;
            }
            NS_ASSERTION(outLeft > 0, "bad assumption");
            p = ubuf;
            // n was "space left in ubuf"; turn it into "bytes written"
            n = sizeof(ubuf) - n;
            res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
            if (res == (size_t) -1) {
                NS_ERROR("conversion from utf-8 to utf-16 failed");
                break;
            }
        }

        // the loop advanced the local |in|, not *input; apply the same
        // number of consumed bytes to the caller's pointer
        (*input) += (*inputLeft - inLeft);
        *inputLeft = inLeft;
        *outputLeft = outLeft / 2;

        if (res != (size_t) -1)
            return NS_OK;

        // reset converters
        xp_iconv_reset(gNativeToUTF8);
        xp_iconv_reset(gUTF8ToUnicode);
    }
#endif

    // fallback: zero-pad and hope for the best
    // XXX This is lame and we have to do better.
    isolatin1_to_utf16(input, inputLeft, output, outputLeft);

    return NS_OK;
}
michael@0 521
/**
 * Convert UTF-16 to native-charset bytes.
 *
 * @param input       in/out: read position in the UTF-16 buffer
 * @param inputLeft   in/out: char16_t units remaining at *input
 * @param output      in/out: write position in the native buffer
 * @param outputLeft  in/out: bytes remaining at *output
 * @return always NS_OK; when the iconv route fails the remaining input is
 *         truncated unit-by-unit by utf16_to_isolatin1 instead.
 *
 * xp_iconv is given byte counts, so the char16_t count is doubled on the way
 * in and halved on the way out.
 */
nsresult
nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
                                          uint32_t *inputLeft,
                                          char **output,
                                          uint32_t *outputLeft)
{
    size_t res = 0;
    size_t inLeft = (size_t) *inputLeft * 2;
    size_t outLeft = (size_t) *outputLeft;

    // preferred route: a direct UTF-16 -> native converter
    if (gUnicodeToNative != INVALID_ICONV_T) {
        res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);

        // publish progress even on failure so the fallback below resumes
        // from wherever iconv stopped
        *inputLeft = inLeft / 2;
        *outputLeft = outLeft;
        if (res != (size_t) -1) {
            return NS_OK;
        }

        NS_ERROR("iconv failed");

        // reset converter
        xp_iconv_reset(gUnicodeToNative);
    }
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
    else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
             (gUTF8ToNative != INVALID_ICONV_T)) {
        // two-step route: UTF-16 -> UTF-8 -> native
        const char *in = (const char *) *input;

        char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)

        // convert one uchar at a time...
        while (inLeft && outLeft) {
            char *p = ubuf;
            size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t);
            res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
            if (res == (size_t) -1) {
                NS_ERROR("conversion from utf-16 to utf-8 failed");
                break;
            }
            p = ubuf;
            // n was "space left in ubuf"; turn it into "bytes written"
            n = sizeof(ubuf) - n;
            res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
            if (res == (size_t) -1) {
                if (errno == E2BIG) {
                    // not enough room for last uchar... back up and return.
                    in -= sizeof(char16_t);
                    res = 0;
                }
                else
                    NS_ERROR("conversion from utf-8 to native failed");
                break;
            }
            // only count the unit as consumed once it reached the output
            inLeft -= sizeof(char16_t);
        }

        // inLeft is in bytes, *inputLeft in char16_t units; advance the
        // caller's pointer by the number of units consumed
        (*input) += (*inputLeft - inLeft / 2);
        *inputLeft = inLeft / 2;
        *outputLeft = outLeft;
        if (res != (size_t) -1) {
            return NS_OK;
        }

        // reset converters
        xp_iconv_reset(gUnicodeToUTF8);
        xp_iconv_reset(gUTF8ToNative);
    }
#endif

    // fallback: truncate and hope for the best
    // XXX This is lame and we have to do better.
    utf16_to_isolatin1(input, inputLeft, output, outputLeft);

    return NS_OK;
}
michael@0 597
michael@0 598 bool
michael@0 599 nsNativeCharsetConverter::IsNativeUTF8()
michael@0 600 {
michael@0 601 if (!gInitialized) {
michael@0 602 Lock();
michael@0 603 if (!gInitialized)
michael@0 604 LazyInit();
michael@0 605 Unlock();
michael@0 606 }
michael@0 607 return gIsNativeUTF8;
michael@0 608 }
michael@0 609
michael@0 610 #endif // USE_ICONV
michael@0 611
michael@0 612 //-----------------------------------------------------------------------------
michael@0 613 // conversion using mb[r]towc/wc[r]tomb
michael@0 614 //-----------------------------------------------------------------------------
michael@0 615 #if defined(USE_STDCONV)
michael@0 616 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
michael@0 617 #include <wchar.h> // mbrtowc, wcrtomb
michael@0 618 #endif
michael@0 619
michael@0 620 class nsNativeCharsetConverter
michael@0 621 {
michael@0 622 public:
michael@0 623 nsNativeCharsetConverter();
michael@0 624
michael@0 625 nsresult NativeToUnicode(const char **input , uint32_t *inputLeft,
michael@0 626 char16_t **output, uint32_t *outputLeft);
michael@0 627 nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
michael@0 628 char **output, uint32_t *outputLeft);
michael@0 629
michael@0 630 static void GlobalInit();
michael@0 631 static void GlobalShutdown() { }
michael@0 632 static bool IsNativeUTF8();
michael@0 633
michael@0 634 private:
michael@0 635 static bool gWCharIsUnicode;
michael@0 636
michael@0 637 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
michael@0 638 mbstate_t ps;
michael@0 639 #endif
michael@0 640 };
michael@0 641
michael@0 642 bool nsNativeCharsetConverter::gWCharIsUnicode = false;
michael@0 643
michael@0 644 nsNativeCharsetConverter::nsNativeCharsetConverter()
michael@0 645 {
michael@0 646 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
michael@0 647 memset(&ps, 0, sizeof(ps));
michael@0 648 #endif
michael@0 649 }
michael@0 650
michael@0 651 void
michael@0 652 nsNativeCharsetConverter::GlobalInit()
michael@0 653 {
michael@0 654 // verify that wchar_t for the current locale is actually unicode.
michael@0 655 // if it is not, then we should avoid calling mbtowc/wctomb and
michael@0 656 // just fallback on zero-pad/truncation conversion.
michael@0 657 //
michael@0 658 // this test cannot be done at build time because the encoding of
michael@0 659 // wchar_t may depend on the runtime locale. sad, but true!!
michael@0 660 //
michael@0 661 // so, if wchar_t is unicode then converting an ASCII character
michael@0 662 // to wchar_t should not change its numeric value. we'll just
michael@0 663 // check what happens with the ASCII 'a' character.
michael@0 664 //
michael@0 665 // this test is not perfect... obviously, it could yield false
michael@0 666 // positives, but then at least ASCII text would be converted
michael@0 667 // properly (or maybe just the 'a' character) -- oh well :(
michael@0 668
michael@0 669 char a = 'a';
michael@0 670 unsigned int w = 0;
michael@0 671
michael@0 672 int res = mbtowc((wchar_t *) &w, &a, 1);
michael@0 673
michael@0 674 gWCharIsUnicode = (res != -1 && w == 'a');
michael@0 675
michael@0 676 #ifdef DEBUG
michael@0 677 if (!gWCharIsUnicode)
michael@0 678 NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
michael@0 679 #endif
michael@0 680 }
michael@0 681
michael@0 682 nsresult
michael@0 683 nsNativeCharsetConverter::NativeToUnicode(const char **input,
michael@0 684 uint32_t *inputLeft,
michael@0 685 char16_t **output,
michael@0 686 uint32_t *outputLeft)
michael@0 687 {
michael@0 688 if (gWCharIsUnicode) {
michael@0 689 int incr;
michael@0 690
michael@0 691 // cannot use wchar_t here since it may have been redefined (e.g.,
michael@0 692 // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
michael@0 693 unsigned int tmp = 0;
michael@0 694 while (*inputLeft && *outputLeft) {
michael@0 695 #ifdef HAVE_MBRTOWC
michael@0 696 incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
michael@0 697 #else
michael@0 698 // XXX is this thread-safe?
michael@0 699 incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
michael@0 700 #endif
michael@0 701 if (incr < 0) {
michael@0 702 NS_WARNING("mbtowc failed: possible charset mismatch");
michael@0 703 // zero-pad and hope for the best
michael@0 704 tmp = (unsigned char) **input;
michael@0 705 incr = 1;
michael@0 706 }
michael@0 707 **output = (char16_t) tmp;
michael@0 708 (*input) += incr;
michael@0 709 (*inputLeft) -= incr;
michael@0 710 (*output)++;
michael@0 711 (*outputLeft)--;
michael@0 712 }
michael@0 713 }
michael@0 714 else {
michael@0 715 // wchar_t isn't unicode, so the best we can do is treat the
michael@0 716 // input as if it is isolatin1 :(
michael@0 717 isolatin1_to_utf16(input, inputLeft, output, outputLeft);
michael@0 718 }
michael@0 719
michael@0 720 return NS_OK;
michael@0 721 }
michael@0 722
michael@0 723 nsresult
michael@0 724 nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
michael@0 725 uint32_t *inputLeft,
michael@0 726 char **output,
michael@0 727 uint32_t *outputLeft)
michael@0 728 {
michael@0 729 if (gWCharIsUnicode) {
michael@0 730 int incr;
michael@0 731
michael@0 732 while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
michael@0 733 #ifdef HAVE_WCRTOMB
michael@0 734 incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
michael@0 735 #else
michael@0 736 // XXX is this thread-safe?
michael@0 737 incr = (int) wctomb(*output, (wchar_t) **input);
michael@0 738 #endif
michael@0 739 if (incr < 0) {
michael@0 740 NS_WARNING("mbtowc failed: possible charset mismatch");
michael@0 741 **output = (unsigned char) **input; // truncate
michael@0 742 incr = 1;
michael@0 743 }
michael@0 744 // most likely we're dead anyways if this assertion should fire
michael@0 745 NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string");
michael@0 746 (*output) += incr;
michael@0 747 (*outputLeft) -= incr;
michael@0 748 (*input)++;
michael@0 749 (*inputLeft)--;
michael@0 750 }
michael@0 751 }
michael@0 752 else {
michael@0 753 // wchar_t isn't unicode, so the best we can do is treat the
michael@0 754 // input as if it is isolatin1 :(
michael@0 755 utf16_to_isolatin1(input, inputLeft, output, outputLeft);
michael@0 756 }
michael@0 757
michael@0 758 return NS_OK;
michael@0 759 }
michael@0 760
michael@0 761 // XXX : for now, return false
michael@0 762 bool
michael@0 763 nsNativeCharsetConverter::IsNativeUTF8()
michael@0 764 {
michael@0 765 return false;
michael@0 766 }
michael@0 767
michael@0 768 #endif // USE_STDCONV
michael@0 769
michael@0 770 //-----------------------------------------------------------------------------
michael@0 771 // API implementation
michael@0 772 //-----------------------------------------------------------------------------
michael@0 773
michael@0 774 nsresult
michael@0 775 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
michael@0 776 {
michael@0 777 output.Truncate();
michael@0 778
michael@0 779 uint32_t inputLen = input.Length();
michael@0 780
michael@0 781 nsACString::const_iterator iter;
michael@0 782 input.BeginReading(iter);
michael@0 783
michael@0 784 //
michael@0 785 // OPTIMIZATION: preallocate space for largest possible result; convert
michael@0 786 // directly into the result buffer to avoid intermediate buffer copy.
michael@0 787 //
michael@0 788 // this will generally result in a larger allocation, but that seems
michael@0 789 // better than an extra buffer copy.
michael@0 790 //
michael@0 791 if (!output.SetLength(inputLen, fallible_t()))
michael@0 792 return NS_ERROR_OUT_OF_MEMORY;
michael@0 793 nsAString::iterator out_iter;
michael@0 794 output.BeginWriting(out_iter);
michael@0 795
michael@0 796 char16_t *result = out_iter.get();
michael@0 797 uint32_t resultLeft = inputLen;
michael@0 798
michael@0 799 const char *buf = iter.get();
michael@0 800 uint32_t bufLeft = inputLen;
michael@0 801
michael@0 802 nsNativeCharsetConverter conv;
michael@0 803 nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
michael@0 804 if (NS_SUCCEEDED(rv)) {
michael@0 805 NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
michael@0 806 output.SetLength(inputLen - resultLeft);
michael@0 807 }
michael@0 808 return rv;
michael@0 809 }
michael@0 810
michael@0 811 nsresult
michael@0 812 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
michael@0 813 {
michael@0 814 output.Truncate();
michael@0 815
michael@0 816 nsAString::const_iterator iter, end;
michael@0 817 input.BeginReading(iter);
michael@0 818 input.EndReading(end);
michael@0 819
michael@0 820 // cannot easily avoid intermediate buffer copy.
michael@0 821 char temp[4096];
michael@0 822
michael@0 823 nsNativeCharsetConverter conv;
michael@0 824
michael@0 825 const char16_t *buf = iter.get();
michael@0 826 uint32_t bufLeft = Distance(iter, end);
michael@0 827 while (bufLeft) {
michael@0 828 char *p = temp;
michael@0 829 uint32_t tempLeft = sizeof(temp);
michael@0 830
michael@0 831 nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
michael@0 832 if (NS_FAILED(rv)) return rv;
michael@0 833
michael@0 834 if (tempLeft < sizeof(temp))
michael@0 835 output.Append(temp, sizeof(temp) - tempLeft);
michael@0 836 }
michael@0 837 return NS_OK;
michael@0 838 }
michael@0 839
michael@0 840 bool
michael@0 841 NS_IsNativeUTF8()
michael@0 842 {
michael@0 843 return nsNativeCharsetConverter::IsNativeUTF8();
michael@0 844 }
michael@0 845
michael@0 846 void
michael@0 847 NS_StartupNativeCharsetUtils()
michael@0 848 {
michael@0 849 //
michael@0 850 // need to initialize the locale or else charset conversion will fail.
michael@0 851 // better not delay this in case some other component alters the locale
michael@0 852 // settings.
michael@0 853 //
michael@0 854 // XXX we assume that we are called early enough that we should
michael@0 855 // always be the first to care about the locale's charset.
michael@0 856 //
michael@0 857 setlocale(LC_CTYPE, "");
michael@0 858
michael@0 859 nsNativeCharsetConverter::GlobalInit();
michael@0 860 }
michael@0 861
michael@0 862 void
michael@0 863 NS_ShutdownNativeCharsetUtils()
michael@0 864 {
michael@0 865 nsNativeCharsetConverter::GlobalShutdown();
michael@0 866 }
michael@0 867
michael@0 868 //-----------------------------------------------------------------------------
michael@0 869 // XP_WIN
michael@0 870 //-----------------------------------------------------------------------------
michael@0 871 #elif defined(XP_WIN)
michael@0 872
michael@0 873 #include <windows.h>
michael@0 874 #include "nsString.h"
michael@0 875 #include "nsAString.h"
michael@0 876 #include "nsReadableUtils.h"
michael@0 877
michael@0 878 using namespace mozilla;
michael@0 879
michael@0 880 nsresult
michael@0 881 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
michael@0 882 {
michael@0 883 uint32_t inputLen = input.Length();
michael@0 884
michael@0 885 nsACString::const_iterator iter;
michael@0 886 input.BeginReading(iter);
michael@0 887
michael@0 888 const char *buf = iter.get();
michael@0 889
michael@0 890 // determine length of result
michael@0 891 uint32_t resultLen = 0;
michael@0 892 int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0);
michael@0 893 if (n > 0)
michael@0 894 resultLen += n;
michael@0 895
michael@0 896 // allocate sufficient space
michael@0 897 if (!output.SetLength(resultLen, fallible_t()))
michael@0 898 return NS_ERROR_OUT_OF_MEMORY;
michael@0 899 if (resultLen > 0) {
michael@0 900 nsAString::iterator out_iter;
michael@0 901 output.BeginWriting(out_iter);
michael@0 902
michael@0 903 char16_t *result = out_iter.get();
michael@0 904
michael@0 905 ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, wwc(result), resultLen);
michael@0 906 }
michael@0 907 return NS_OK;
michael@0 908 }
michael@0 909
michael@0 910 nsresult
michael@0 911 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
michael@0 912 {
michael@0 913 uint32_t inputLen = input.Length();
michael@0 914
michael@0 915 nsAString::const_iterator iter;
michael@0 916 input.BeginReading(iter);
michael@0 917
michael@0 918 char16ptr_t buf = iter.get();
michael@0 919
michael@0 920 // determine length of result
michael@0 921 uint32_t resultLen = 0;
michael@0 922
michael@0 923 int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0,
michael@0 924 nullptr, nullptr);
michael@0 925 if (n > 0)
michael@0 926 resultLen += n;
michael@0 927
michael@0 928 // allocate sufficient space
michael@0 929 if (!output.SetLength(resultLen, fallible_t()))
michael@0 930 return NS_ERROR_OUT_OF_MEMORY;
michael@0 931 if (resultLen > 0) {
michael@0 932 nsACString::iterator out_iter;
michael@0 933 output.BeginWriting(out_iter);
michael@0 934
michael@0 935 // default "defaultChar" is '?', which is an illegal character on windows
michael@0 936 // file system. That will cause file uncreatable. Change it to '_'
michael@0 937 const char defaultChar = '_';
michael@0 938
michael@0 939 char *result = out_iter.get();
michael@0 940
michael@0 941 ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
michael@0 942 &defaultChar, nullptr);
michael@0 943 }
michael@0 944 return NS_OK;
michael@0 945 }
michael@0 946
michael@0 947 // moved from widget/windows/nsToolkit.cpp
michael@0 948 int32_t
michael@0 949 NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW)
michael@0 950 {
michael@0 951 return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, wwc(aStrOutW), aBufferSize);
michael@0 952 }
michael@0 953
michael@0 954 int32_t
michael@0 955 NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut,
michael@0 956 char *aStrOutA, const char *aDefault)
michael@0 957 {
michael@0 958 if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))
michael@0 959 return 0;
michael@0 960
michael@0 961 int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1,
michael@0 962 aStrOutA, aBufferSizeOut,
michael@0 963 aDefault, nullptr);
michael@0 964
michael@0 965 if (!numCharsConverted) {
michael@0 966 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
michael@0 967 // Overflow, add missing null termination but return 0
michael@0 968 aStrOutA[aBufferSizeOut-1] = '\0';
michael@0 969 }
michael@0 970 else {
michael@0 971 // Other error, clear string and return 0
michael@0 972 aStrOutA[0] = '\0';
michael@0 973 }
michael@0 974 }
michael@0 975 else if (numCharsConverted < aBufferSizeOut) {
michael@0 976 // Add 2nd null (really necessary?)
michael@0 977 aStrOutA[numCharsConverted] = '\0';
michael@0 978 }
michael@0 979
michael@0 980 return numCharsConverted;
michael@0 981 }
michael@0 982
michael@0 983 #else
michael@0 984
michael@0 985 #include "nsReadableUtils.h"
michael@0 986
michael@0 987 nsresult
michael@0 988 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
michael@0 989 {
michael@0 990 CopyASCIItoUTF16(input, output);
michael@0 991 return NS_OK;
michael@0 992 }
michael@0 993
michael@0 994 nsresult
michael@0 995 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
michael@0 996 {
michael@0 997 LossyCopyUTF16toASCII(input, output);
michael@0 998 return NS_OK;
michael@0 999 }
michael@0 1000
// The fallback implementation keeps no global state: nothing to set up.
void
NS_StartupNativeCharsetUtils()
{
    // intentionally empty
}
michael@0 1005
// The fallback implementation keeps no global state: nothing to tear down.
void
NS_ShutdownNativeCharsetUtils()
{
    // intentionally empty
}
michael@0 1010
michael@0 1011 #endif

mercurial