michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "xpcom-private.h" michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // XP_MACOSX or ANDROID michael@0: //----------------------------------------------------------------------------- michael@0: #if defined(XP_MACOSX) || defined(ANDROID) michael@0: michael@0: #include "nsAString.h" michael@0: #include "nsReadableUtils.h" michael@0: #include "nsString.h" michael@0: michael@0: nsresult michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) michael@0: { michael@0: CopyUTF8toUTF16(input, output); michael@0: return NS_OK; michael@0: } michael@0: michael@0: nsresult michael@0: NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) michael@0: { michael@0: CopyUTF16toUTF8(input, output); michael@0: return NS_OK; michael@0: } michael@0: michael@0: void michael@0: NS_StartupNativeCharsetUtils() michael@0: { michael@0: } michael@0: michael@0: void michael@0: NS_ShutdownNativeCharsetUtils() michael@0: { michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // XP_UNIX michael@0: //----------------------------------------------------------------------------- michael@0: #elif defined(XP_UNIX) michael@0: michael@0: #include // mbtowc, wctomb michael@0: #include // setlocale michael@0: #include "mozilla/Mutex.h" michael@0: #include "nscore.h" michael@0: #include "nsAString.h" michael@0: #include "nsReadableUtils.h" michael@0: michael@0: using namespace mozilla; michael@0: michael@0: // michael@0: // choose a conversion library. we used to use mbrtowc/wcrtomb under Linux, michael@0: // but that doesn't work for non-BMP characters whether we use '-fshort-wchar' michael@0: // or not (see bug 206811 and michael@0: // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use michael@0: // iconv for all platforms where nltypes.h and nllanginfo.h are present michael@0: // along with iconv. michael@0: // michael@0: #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET) michael@0: #define USE_ICONV 1 michael@0: #else michael@0: #define USE_STDCONV 1 michael@0: #endif michael@0: michael@0: static void michael@0: isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft) michael@0: { michael@0: while (*inputLeft && *outputLeft) { michael@0: **output = (unsigned char) **input; michael@0: (*input)++; michael@0: (*inputLeft)--; michael@0: (*output)++; michael@0: (*outputLeft)--; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft) michael@0: { michael@0: while (*inputLeft && *outputLeft) { michael@0: **output = (unsigned char) **input; michael@0: (*input)++; michael@0: (*inputLeft)--; michael@0: (*output)++; michael@0: (*outputLeft)--; michael@0: } michael@0: } michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // conversion using iconv michael@0: //----------------------------------------------------------------------------- michael@0: #if defined(USE_ICONV) michael@0: #include // CODESET michael@0: #include // nl_langinfo michael@0: #include // iconv_open, iconv, iconv_close michael@0: #include michael@0: #include "plstr.h" michael@0: michael@0: #if defined(HAVE_ICONV_WITH_CONST_INPUT) michael@0: #define ICONV_INPUT(x) (x) michael@0: #else michael@0: #define ICONV_INPUT(x) ((char **)x) michael@0: #endif michael@0: michael@0: // solaris definitely needs this, but we'll enable it by default michael@0: // just in case... but we know for sure that iconv(3) in glibc michael@0: // doesn't need this. michael@0: #if !defined(__GLIBC__) michael@0: #define ENABLE_UTF8_FALLBACK_SUPPORT michael@0: #endif michael@0: michael@0: #define INVALID_ICONV_T ((iconv_t) -1) michael@0: michael@0: static inline size_t michael@0: xp_iconv(iconv_t converter, michael@0: const char **input, michael@0: size_t *inputLeft, michael@0: char **output, michael@0: size_t *outputLeft) michael@0: { michael@0: size_t res, outputAvail = outputLeft ? *outputLeft : 0; michael@0: res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft); michael@0: if (res == (size_t) -1) { michael@0: // on some platforms (e.g., linux) iconv will fail with michael@0: // E2BIG if it cannot convert _all_ of its input. it'll michael@0: // still adjust all of the in/out params correctly, so we michael@0: // can ignore this error. the assumption is that we will michael@0: // be called again to complete the conversion. michael@0: if ((errno == E2BIG) && (*outputLeft < outputAvail)) michael@0: res = 0; michael@0: } michael@0: return res; michael@0: } michael@0: michael@0: static inline void michael@0: xp_iconv_reset(iconv_t converter) michael@0: { michael@0: // NOTE: the man pages on Solaris claim that you can pass nullptr michael@0: // for all parameter to reset the converter, but beware the michael@0: // evil Solaris crash if you go down this route >:-) michael@0: michael@0: const char *zero_char_in_ptr = nullptr; michael@0: char *zero_char_out_ptr = nullptr; michael@0: size_t zero_size_in = 0, michael@0: zero_size_out = 0; michael@0: michael@0: xp_iconv(converter, &zero_char_in_ptr, michael@0: &zero_size_in, michael@0: &zero_char_out_ptr, michael@0: &zero_size_out); michael@0: } michael@0: michael@0: static inline iconv_t michael@0: xp_iconv_open(const char **to_list, const char **from_list) michael@0: { michael@0: iconv_t res; michael@0: const char **from_name; michael@0: const char **to_name; michael@0: michael@0: // try all possible combinations to locate a converter. michael@0: to_name = to_list; michael@0: while (*to_name) { michael@0: if (**to_name) { michael@0: from_name = from_list; michael@0: while (*from_name) { michael@0: if (**from_name) { michael@0: res = iconv_open(*to_name, *from_name); michael@0: if (res != INVALID_ICONV_T) michael@0: return res; michael@0: } michael@0: from_name++; michael@0: } michael@0: } michael@0: to_name++; michael@0: } michael@0: michael@0: return INVALID_ICONV_T; michael@0: } michael@0: michael@0: /* michael@0: * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we michael@0: * have to use UTF-16 with iconv(3) on platforms where it's supported. michael@0: * However, the way UTF-16 and UCS-2 are interpreted varies across platforms michael@0: * and implementations of iconv(3). On Tru64, it also depends on the environment michael@0: * variable. To avoid the trouble arising from byte-swapping michael@0: * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling michael@0: * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 michael@0: * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness, michael@0: * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE' michael@0: * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment michael@0: * variable ICONV_BYTEORDER is set to 'big-endian', about which not much michael@0: * can be done other than adding a note in the release notes. (bug 206811) michael@0: */ michael@0: static const char *UTF_16_NAMES[] = { michael@0: #if defined(IS_LITTLE_ENDIAN) michael@0: "UTF-16LE", michael@0: #if defined(__GLIBC__) michael@0: "UNICODELITTLE", michael@0: #endif michael@0: "UCS-2LE", michael@0: #else michael@0: "UTF-16BE", michael@0: #if defined(__GLIBC__) michael@0: "UNICODEBIG", michael@0: #endif michael@0: "UCS-2BE", michael@0: #endif michael@0: "UTF-16", michael@0: "UCS-2", michael@0: "UCS2", michael@0: "UCS_2", michael@0: "ucs-2", michael@0: "ucs2", michael@0: "ucs_2", michael@0: nullptr michael@0: }; michael@0: michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: static const char *UTF_8_NAMES[] = { michael@0: "UTF-8", michael@0: "UTF8", michael@0: "UTF_8", michael@0: "utf-8", michael@0: "utf8", michael@0: "utf_8", michael@0: nullptr michael@0: }; michael@0: #endif michael@0: michael@0: static const char *ISO_8859_1_NAMES[] = { michael@0: "ISO-8859-1", michael@0: #if !defined(__GLIBC__) michael@0: "ISO8859-1", michael@0: "ISO88591", michael@0: "ISO_8859_1", michael@0: "ISO8859_1", michael@0: "iso-8859-1", michael@0: "iso8859-1", michael@0: "iso88591", michael@0: "iso_8859_1", michael@0: "iso8859_1", michael@0: #endif michael@0: nullptr michael@0: }; michael@0: michael@0: class nsNativeCharsetConverter michael@0: { michael@0: public: michael@0: nsNativeCharsetConverter(); michael@0: ~nsNativeCharsetConverter(); michael@0: michael@0: nsresult NativeToUnicode(const char **input , uint32_t *inputLeft, michael@0: char16_t **output, uint32_t *outputLeft); michael@0: nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft, michael@0: char **output, uint32_t *outputLeft); michael@0: michael@0: static void GlobalInit(); michael@0: static void GlobalShutdown(); michael@0: static bool IsNativeUTF8(); michael@0: michael@0: private: michael@0: static iconv_t gNativeToUnicode; michael@0: static iconv_t gUnicodeToNative; michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: static iconv_t gNativeToUTF8; michael@0: static iconv_t gUTF8ToNative; michael@0: static iconv_t gUnicodeToUTF8; michael@0: static iconv_t gUTF8ToUnicode; michael@0: #endif michael@0: static Mutex *gLock; michael@0: static bool gInitialized; michael@0: static bool gIsNativeUTF8; michael@0: michael@0: static void LazyInit(); michael@0: michael@0: static void Lock() { if (gLock) gLock->Lock(); } michael@0: static void Unlock() { if (gLock) gLock->Unlock(); } michael@0: }; michael@0: michael@0: iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T; michael@0: iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T; michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T; michael@0: iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T; michael@0: iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T; michael@0: iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T; michael@0: #endif michael@0: Mutex *nsNativeCharsetConverter::gLock = nullptr; michael@0: bool nsNativeCharsetConverter::gInitialized = false; michael@0: bool nsNativeCharsetConverter::gIsNativeUTF8 = false; michael@0: michael@0: void michael@0: nsNativeCharsetConverter::LazyInit() michael@0: { michael@0: // LazyInit may be called before NS_StartupNativeCharsetUtils, but michael@0: // the setlocale it does has to be called before nl_langinfo. Like in michael@0: // NS_StartupNativeCharsetUtils, assume we are called early enough that michael@0: // we are the first to care about the locale's charset. michael@0: if (!gLock) michael@0: setlocale(LC_CTYPE, ""); michael@0: const char *blank_list[] = { "", nullptr }; michael@0: const char **native_charset_list = blank_list; michael@0: const char *native_charset = nl_langinfo(CODESET); michael@0: if (native_charset == nullptr) { michael@0: NS_ERROR("native charset is unknown"); michael@0: // fallback to ISO-8859-1 michael@0: native_charset_list = ISO_8859_1_NAMES; michael@0: } michael@0: else michael@0: native_charset_list[0] = native_charset; michael@0: michael@0: // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET) michael@0: // return 'UTF-8' (or 'utf-8') michael@0: if (!PL_strcasecmp(native_charset, "UTF-8")) michael@0: gIsNativeUTF8 = true; michael@0: michael@0: gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list); michael@0: gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES); michael@0: michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: if (gNativeToUnicode == INVALID_ICONV_T) { michael@0: gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list); michael@0: gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES); michael@0: NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter"); michael@0: NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter"); michael@0: } michael@0: if (gUnicodeToNative == INVALID_ICONV_T) { michael@0: gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES); michael@0: gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES); michael@0: NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter"); michael@0: NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter"); michael@0: } michael@0: #else michael@0: NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter"); michael@0: NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter"); michael@0: #endif michael@0: michael@0: /* michael@0: * On Solaris 8 (and newer?), the iconv modules converting to UCS-2 michael@0: * prepend a byte order mark unicode character (BOM, u+FEFF) during michael@0: * the first use of the iconv converter. The same is the case of michael@0: * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used. michael@0: * However, we use 'UTF-16LE/BE' in both cases, instead so that we michael@0: * should be safe. But just in case... michael@0: * michael@0: * This dummy conversion gets rid of the BOMs and fixes bug 153562. michael@0: */ michael@0: char dummy_input[1] = { ' ' }; michael@0: char dummy_output[4]; michael@0: michael@0: if (gNativeToUnicode != INVALID_ICONV_T) { michael@0: const char *input = dummy_input; michael@0: size_t input_left = sizeof(dummy_input); michael@0: char *output = dummy_output; michael@0: size_t output_left = sizeof(dummy_output); michael@0: michael@0: xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left); michael@0: } michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: if (gUTF8ToUnicode != INVALID_ICONV_T) { michael@0: const char *input = dummy_input; michael@0: size_t input_left = sizeof(dummy_input); michael@0: char *output = dummy_output; michael@0: size_t output_left = sizeof(dummy_output); michael@0: michael@0: xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left); michael@0: } michael@0: #endif michael@0: michael@0: gInitialized = true; michael@0: } michael@0: michael@0: void michael@0: nsNativeCharsetConverter::GlobalInit() michael@0: { michael@0: gLock = new Mutex("nsNativeCharsetConverter.gLock"); michael@0: } michael@0: michael@0: void michael@0: nsNativeCharsetConverter::GlobalShutdown() michael@0: { michael@0: if (gLock) { michael@0: delete gLock; michael@0: gLock = nullptr; michael@0: } michael@0: michael@0: if (gNativeToUnicode != INVALID_ICONV_T) { michael@0: iconv_close(gNativeToUnicode); michael@0: gNativeToUnicode = INVALID_ICONV_T; michael@0: } michael@0: michael@0: if (gUnicodeToNative != INVALID_ICONV_T) { michael@0: iconv_close(gUnicodeToNative); michael@0: gUnicodeToNative = INVALID_ICONV_T; michael@0: } michael@0: michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: if (gNativeToUTF8 != INVALID_ICONV_T) { michael@0: iconv_close(gNativeToUTF8); michael@0: gNativeToUTF8 = INVALID_ICONV_T; michael@0: } michael@0: if (gUTF8ToNative != INVALID_ICONV_T) { michael@0: iconv_close(gUTF8ToNative); michael@0: gUTF8ToNative = INVALID_ICONV_T; michael@0: } michael@0: if (gUnicodeToUTF8 != INVALID_ICONV_T) { michael@0: iconv_close(gUnicodeToUTF8); michael@0: gUnicodeToUTF8 = INVALID_ICONV_T; michael@0: } michael@0: if (gUTF8ToUnicode != INVALID_ICONV_T) { michael@0: iconv_close(gUTF8ToUnicode); michael@0: gUTF8ToUnicode = INVALID_ICONV_T; michael@0: } michael@0: #endif michael@0: michael@0: gInitialized = false; michael@0: } michael@0: michael@0: nsNativeCharsetConverter::nsNativeCharsetConverter() michael@0: { michael@0: Lock(); michael@0: if (!gInitialized) michael@0: LazyInit(); michael@0: } michael@0: michael@0: nsNativeCharsetConverter::~nsNativeCharsetConverter() michael@0: { michael@0: // reset converters for next time michael@0: if (gNativeToUnicode != INVALID_ICONV_T) michael@0: xp_iconv_reset(gNativeToUnicode); michael@0: if (gUnicodeToNative != INVALID_ICONV_T) michael@0: xp_iconv_reset(gUnicodeToNative); michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: if (gNativeToUTF8 != INVALID_ICONV_T) michael@0: xp_iconv_reset(gNativeToUTF8); michael@0: if (gUTF8ToNative != INVALID_ICONV_T) michael@0: xp_iconv_reset(gUTF8ToNative); michael@0: if (gUnicodeToUTF8 != INVALID_ICONV_T) michael@0: xp_iconv_reset(gUnicodeToUTF8); michael@0: if (gUTF8ToUnicode != INVALID_ICONV_T) michael@0: xp_iconv_reset(gUTF8ToUnicode); michael@0: #endif michael@0: Unlock(); michael@0: } michael@0: michael@0: nsresult michael@0: nsNativeCharsetConverter::NativeToUnicode(const char **input, michael@0: uint32_t *inputLeft, michael@0: char16_t **output, michael@0: uint32_t *outputLeft) michael@0: { michael@0: size_t res = 0; michael@0: size_t inLeft = (size_t) *inputLeft; michael@0: size_t outLeft = (size_t) *outputLeft * 2; michael@0: michael@0: if (gNativeToUnicode != INVALID_ICONV_T) { michael@0: michael@0: res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft); michael@0: michael@0: *inputLeft = inLeft; michael@0: *outputLeft = outLeft / 2; michael@0: if (res != (size_t) -1) michael@0: return NS_OK; michael@0: michael@0: NS_WARNING("conversion from native to utf-16 failed"); michael@0: michael@0: // reset converter michael@0: xp_iconv_reset(gNativeToUnicode); michael@0: } michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: else if ((gNativeToUTF8 != INVALID_ICONV_T) && michael@0: (gUTF8ToUnicode != INVALID_ICONV_T)) { michael@0: // convert first to UTF8, then from UTF8 to UCS2 michael@0: const char *in = *input; michael@0: michael@0: char ubuf[1024]; michael@0: michael@0: // we assume we're always called with enough space in |output|, michael@0: // so convert many chars at a time... michael@0: while (inLeft) { michael@0: char *p = ubuf; michael@0: size_t n = sizeof(ubuf); michael@0: res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n); michael@0: if (res == (size_t) -1) { michael@0: NS_ERROR("conversion from native to utf-8 failed"); michael@0: break; michael@0: } michael@0: NS_ASSERTION(outLeft > 0, "bad assumption"); michael@0: p = ubuf; michael@0: n = sizeof(ubuf) - n; michael@0: res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft); michael@0: if (res == (size_t) -1) { michael@0: NS_ERROR("conversion from utf-8 to utf-16 failed"); michael@0: break; michael@0: } michael@0: } michael@0: michael@0: (*input) += (*inputLeft - inLeft); michael@0: *inputLeft = inLeft; michael@0: *outputLeft = outLeft / 2; michael@0: michael@0: if (res != (size_t) -1) michael@0: return NS_OK; michael@0: michael@0: // reset converters michael@0: xp_iconv_reset(gNativeToUTF8); michael@0: xp_iconv_reset(gUTF8ToUnicode); michael@0: } michael@0: #endif michael@0: michael@0: // fallback: zero-pad and hope for the best michael@0: // XXX This is lame and we have to do better. michael@0: isolatin1_to_utf16(input, inputLeft, output, outputLeft); michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: nsresult michael@0: nsNativeCharsetConverter::UnicodeToNative(const char16_t **input, michael@0: uint32_t *inputLeft, michael@0: char **output, michael@0: uint32_t *outputLeft) michael@0: { michael@0: size_t res = 0; michael@0: size_t inLeft = (size_t) *inputLeft * 2; michael@0: size_t outLeft = (size_t) *outputLeft; michael@0: michael@0: if (gUnicodeToNative != INVALID_ICONV_T) { michael@0: res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft); michael@0: michael@0: *inputLeft = inLeft / 2; michael@0: *outputLeft = outLeft; michael@0: if (res != (size_t) -1) { michael@0: return NS_OK; michael@0: } michael@0: michael@0: NS_ERROR("iconv failed"); michael@0: michael@0: // reset converter michael@0: xp_iconv_reset(gUnicodeToNative); michael@0: } michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT) michael@0: else if ((gUnicodeToUTF8 != INVALID_ICONV_T) && michael@0: (gUTF8ToNative != INVALID_ICONV_T)) { michael@0: const char *in = (const char *) *input; michael@0: michael@0: char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes) michael@0: michael@0: // convert one uchar at a time... michael@0: while (inLeft && outLeft) { michael@0: char *p = ubuf; michael@0: size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t); michael@0: res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n); michael@0: if (res == (size_t) -1) { michael@0: NS_ERROR("conversion from utf-16 to utf-8 failed"); michael@0: break; michael@0: } michael@0: p = ubuf; michael@0: n = sizeof(ubuf) - n; michael@0: res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft); michael@0: if (res == (size_t) -1) { michael@0: if (errno == E2BIG) { michael@0: // not enough room for last uchar... back up and return. michael@0: in -= sizeof(char16_t); michael@0: res = 0; michael@0: } michael@0: else michael@0: NS_ERROR("conversion from utf-8 to native failed"); michael@0: break; michael@0: } michael@0: inLeft -= sizeof(char16_t); michael@0: } michael@0: michael@0: (*input) += (*inputLeft - inLeft / 2); michael@0: *inputLeft = inLeft / 2; michael@0: *outputLeft = outLeft; michael@0: if (res != (size_t) -1) { michael@0: return NS_OK; michael@0: } michael@0: michael@0: // reset converters michael@0: xp_iconv_reset(gUnicodeToUTF8); michael@0: xp_iconv_reset(gUTF8ToNative); michael@0: } michael@0: #endif michael@0: michael@0: // fallback: truncate and hope for the best michael@0: // XXX This is lame and we have to do better. michael@0: utf16_to_isolatin1(input, inputLeft, output, outputLeft); michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: bool michael@0: nsNativeCharsetConverter::IsNativeUTF8() michael@0: { michael@0: if (!gInitialized) { michael@0: Lock(); michael@0: if (!gInitialized) michael@0: LazyInit(); michael@0: Unlock(); michael@0: } michael@0: return gIsNativeUTF8; michael@0: } michael@0: michael@0: #endif // USE_ICONV michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // conversion using mb[r]towc/wc[r]tomb michael@0: //----------------------------------------------------------------------------- michael@0: #if defined(USE_STDCONV) michael@0: #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) michael@0: #include // mbrtowc, wcrtomb michael@0: #endif michael@0: michael@0: class nsNativeCharsetConverter michael@0: { michael@0: public: michael@0: nsNativeCharsetConverter(); michael@0: michael@0: nsresult NativeToUnicode(const char **input , uint32_t *inputLeft, michael@0: char16_t **output, uint32_t *outputLeft); michael@0: nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft, michael@0: char **output, uint32_t *outputLeft); michael@0: michael@0: static void GlobalInit(); michael@0: static void GlobalShutdown() { } michael@0: static bool IsNativeUTF8(); michael@0: michael@0: private: michael@0: static bool gWCharIsUnicode; michael@0: michael@0: #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) michael@0: mbstate_t ps; michael@0: #endif michael@0: }; michael@0: michael@0: bool nsNativeCharsetConverter::gWCharIsUnicode = false; michael@0: michael@0: nsNativeCharsetConverter::nsNativeCharsetConverter() michael@0: { michael@0: #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC) michael@0: memset(&ps, 0, sizeof(ps)); michael@0: #endif michael@0: } michael@0: michael@0: void michael@0: nsNativeCharsetConverter::GlobalInit() michael@0: { michael@0: // verify that wchar_t for the current locale is actually unicode. michael@0: // if it is not, then we should avoid calling mbtowc/wctomb and michael@0: // just fallback on zero-pad/truncation conversion. michael@0: // michael@0: // this test cannot be done at build time because the encoding of michael@0: // wchar_t may depend on the runtime locale. sad, but true!! michael@0: // michael@0: // so, if wchar_t is unicode then converting an ASCII character michael@0: // to wchar_t should not change its numeric value. we'll just michael@0: // check what happens with the ASCII 'a' character. michael@0: // michael@0: // this test is not perfect... obviously, it could yield false michael@0: // positives, but then at least ASCII text would be converted michael@0: // properly (or maybe just the 'a' character) -- oh well :( michael@0: michael@0: char a = 'a'; michael@0: unsigned int w = 0; michael@0: michael@0: int res = mbtowc((wchar_t *) &w, &a, 1); michael@0: michael@0: gWCharIsUnicode = (res != -1 && w == 'a'); michael@0: michael@0: #ifdef DEBUG michael@0: if (!gWCharIsUnicode) michael@0: NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)"); michael@0: #endif michael@0: } michael@0: michael@0: nsresult michael@0: nsNativeCharsetConverter::NativeToUnicode(const char **input, michael@0: uint32_t *inputLeft, michael@0: char16_t **output, michael@0: uint32_t *outputLeft) michael@0: { michael@0: if (gWCharIsUnicode) { michael@0: int incr; michael@0: michael@0: // cannot use wchar_t here since it may have been redefined (e.g., michael@0: // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP. michael@0: unsigned int tmp = 0; michael@0: while (*inputLeft && *outputLeft) { michael@0: #ifdef HAVE_MBRTOWC michael@0: incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps); michael@0: #else michael@0: // XXX is this thread-safe? michael@0: incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft); michael@0: #endif michael@0: if (incr < 0) { michael@0: NS_WARNING("mbtowc failed: possible charset mismatch"); michael@0: // zero-pad and hope for the best michael@0: tmp = (unsigned char) **input; michael@0: incr = 1; michael@0: } michael@0: **output = (char16_t) tmp; michael@0: (*input) += incr; michael@0: (*inputLeft) -= incr; michael@0: (*output)++; michael@0: (*outputLeft)--; michael@0: } michael@0: } michael@0: else { michael@0: // wchar_t isn't unicode, so the best we can do is treat the michael@0: // input as if it is isolatin1 :( michael@0: isolatin1_to_utf16(input, inputLeft, output, outputLeft); michael@0: } michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: nsresult michael@0: nsNativeCharsetConverter::UnicodeToNative(const char16_t **input, michael@0: uint32_t *inputLeft, michael@0: char **output, michael@0: uint32_t *outputLeft) michael@0: { michael@0: if (gWCharIsUnicode) { michael@0: int incr; michael@0: michael@0: while (*inputLeft && *outputLeft >= MB_CUR_MAX) { michael@0: #ifdef HAVE_WCRTOMB michael@0: incr = (int) wcrtomb(*output, (wchar_t) **input, &ps); michael@0: #else michael@0: // XXX is this thread-safe? michael@0: incr = (int) wctomb(*output, (wchar_t) **input); michael@0: #endif michael@0: if (incr < 0) { michael@0: NS_WARNING("mbtowc failed: possible charset mismatch"); michael@0: **output = (unsigned char) **input; // truncate michael@0: incr = 1; michael@0: } michael@0: // most likely we're dead anyways if this assertion should fire michael@0: NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string"); michael@0: (*output) += incr; michael@0: (*outputLeft) -= incr; michael@0: (*input)++; michael@0: (*inputLeft)--; michael@0: } michael@0: } michael@0: else { michael@0: // wchar_t isn't unicode, so the best we can do is treat the michael@0: // input as if it is isolatin1 :( michael@0: utf16_to_isolatin1(input, inputLeft, output, outputLeft); michael@0: } michael@0: michael@0: return NS_OK; michael@0: } michael@0: michael@0: // XXX : for now, return false michael@0: bool michael@0: nsNativeCharsetConverter::IsNativeUTF8() michael@0: { michael@0: return false; michael@0: } michael@0: michael@0: #endif // USE_STDCONV michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // API implementation michael@0: //----------------------------------------------------------------------------- michael@0: michael@0: nsresult michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) michael@0: { michael@0: output.Truncate(); michael@0: michael@0: uint32_t inputLen = input.Length(); michael@0: michael@0: nsACString::const_iterator iter; michael@0: input.BeginReading(iter); michael@0: michael@0: // michael@0: // OPTIMIZATION: preallocate space for largest possible result; convert michael@0: // directly into the result buffer to avoid intermediate buffer copy. michael@0: // michael@0: // this will generally result in a larger allocation, but that seems michael@0: // better than an extra buffer copy. michael@0: // michael@0: if (!output.SetLength(inputLen, fallible_t())) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: nsAString::iterator out_iter; michael@0: output.BeginWriting(out_iter); michael@0: michael@0: char16_t *result = out_iter.get(); michael@0: uint32_t resultLeft = inputLen; michael@0: michael@0: const char *buf = iter.get(); michael@0: uint32_t bufLeft = inputLen; michael@0: michael@0: nsNativeCharsetConverter conv; michael@0: nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft); michael@0: if (NS_SUCCEEDED(rv)) { michael@0: NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer"); michael@0: output.SetLength(inputLen - resultLeft); michael@0: } michael@0: return rv; michael@0: } michael@0: michael@0: nsresult michael@0: NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) michael@0: { michael@0: output.Truncate(); michael@0: michael@0: nsAString::const_iterator iter, end; michael@0: input.BeginReading(iter); michael@0: input.EndReading(end); michael@0: michael@0: // cannot easily avoid intermediate buffer copy. michael@0: char temp[4096]; michael@0: michael@0: nsNativeCharsetConverter conv; michael@0: michael@0: const char16_t *buf = iter.get(); michael@0: uint32_t bufLeft = Distance(iter, end); michael@0: while (bufLeft) { michael@0: char *p = temp; michael@0: uint32_t tempLeft = sizeof(temp); michael@0: michael@0: nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft); michael@0: if (NS_FAILED(rv)) return rv; michael@0: michael@0: if (tempLeft < sizeof(temp)) michael@0: output.Append(temp, sizeof(temp) - tempLeft); michael@0: } michael@0: return NS_OK; michael@0: } michael@0: michael@0: bool michael@0: NS_IsNativeUTF8() michael@0: { michael@0: return nsNativeCharsetConverter::IsNativeUTF8(); michael@0: } michael@0: michael@0: void michael@0: NS_StartupNativeCharsetUtils() michael@0: { michael@0: // michael@0: // need to initialize the locale or else charset conversion will fail. michael@0: // better not delay this in case some other component alters the locale michael@0: // settings. michael@0: // michael@0: // XXX we assume that we are called early enough that we should michael@0: // always be the first to care about the locale's charset. michael@0: // michael@0: setlocale(LC_CTYPE, ""); michael@0: michael@0: nsNativeCharsetConverter::GlobalInit(); michael@0: } michael@0: michael@0: void michael@0: NS_ShutdownNativeCharsetUtils() michael@0: { michael@0: nsNativeCharsetConverter::GlobalShutdown(); michael@0: } michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // XP_WIN michael@0: //----------------------------------------------------------------------------- michael@0: #elif defined(XP_WIN) michael@0: michael@0: #include michael@0: #include "nsString.h" michael@0: #include "nsAString.h" michael@0: #include "nsReadableUtils.h" michael@0: michael@0: using namespace mozilla; michael@0: michael@0: nsresult michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) michael@0: { michael@0: uint32_t inputLen = input.Length(); michael@0: michael@0: nsACString::const_iterator iter; michael@0: input.BeginReading(iter); michael@0: michael@0: const char *buf = iter.get(); michael@0: michael@0: // determine length of result michael@0: uint32_t resultLen = 0; michael@0: int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0); michael@0: if (n > 0) michael@0: resultLen += n; michael@0: michael@0: // allocate sufficient space michael@0: if (!output.SetLength(resultLen, fallible_t())) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: if (resultLen > 0) { michael@0: nsAString::iterator out_iter; michael@0: output.BeginWriting(out_iter); michael@0: michael@0: char16_t *result = out_iter.get(); michael@0: michael@0: ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, wwc(result), resultLen); michael@0: } michael@0: return NS_OK; michael@0: } michael@0: michael@0: nsresult michael@0: NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) michael@0: { michael@0: uint32_t inputLen = input.Length(); michael@0: michael@0: nsAString::const_iterator iter; michael@0: input.BeginReading(iter); michael@0: michael@0: char16ptr_t buf = iter.get(); michael@0: michael@0: // determine length of result michael@0: uint32_t resultLen = 0; michael@0: michael@0: int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0, michael@0: nullptr, nullptr); michael@0: if (n > 0) michael@0: resultLen += n; michael@0: michael@0: // allocate sufficient space michael@0: if (!output.SetLength(resultLen, fallible_t())) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: if (resultLen > 0) { michael@0: nsACString::iterator out_iter; michael@0: output.BeginWriting(out_iter); michael@0: michael@0: // default "defaultChar" is '?', which is an illegal character on windows michael@0: // file system. That will cause file uncreatable. Change it to '_' michael@0: const char defaultChar = '_'; michael@0: michael@0: char *result = out_iter.get(); michael@0: michael@0: ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen, michael@0: &defaultChar, nullptr); michael@0: } michael@0: return NS_OK; michael@0: } michael@0: michael@0: // moved from widget/windows/nsToolkit.cpp michael@0: int32_t michael@0: NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW) michael@0: { michael@0: return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, wwc(aStrOutW), aBufferSize); michael@0: } michael@0: michael@0: int32_t michael@0: NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut, michael@0: char *aStrOutA, const char *aDefault) michael@0: { michael@0: if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0)) michael@0: return 0; michael@0: michael@0: int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1, michael@0: aStrOutA, aBufferSizeOut, michael@0: aDefault, nullptr); michael@0: michael@0: if (!numCharsConverted) { michael@0: if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { michael@0: // Overflow, add missing null termination but return 0 michael@0: aStrOutA[aBufferSizeOut-1] = '\0'; michael@0: } michael@0: else { michael@0: // Other error, clear string and return 0 michael@0: aStrOutA[0] = '\0'; michael@0: } michael@0: } michael@0: else if (numCharsConverted < aBufferSizeOut) { michael@0: // Add 2nd null (really necessary?) michael@0: aStrOutA[numCharsConverted] = '\0'; michael@0: } michael@0: michael@0: return numCharsConverted; michael@0: } michael@0: michael@0: #else michael@0: michael@0: #include "nsReadableUtils.h" michael@0: michael@0: nsresult michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString &output) michael@0: { michael@0: CopyASCIItoUTF16(input, output); michael@0: return NS_OK; michael@0: } michael@0: michael@0: nsresult michael@0: NS_CopyUnicodeToNative(const nsAString &input, nsACString &output) michael@0: { michael@0: LossyCopyUTF16toASCII(input, output); michael@0: return NS_OK; michael@0: } michael@0: michael@0: void michael@0: NS_StartupNativeCharsetUtils() michael@0: { michael@0: } michael@0: michael@0: void michael@0: NS_ShutdownNativeCharsetUtils() michael@0: { michael@0: } michael@0: michael@0: #endif