michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: #include "xpcom-private.h"
michael@0: 
michael@0: //-----------------------------------------------------------------------------
michael@0: // XP_MACOSX or ANDROID
michael@0: //-----------------------------------------------------------------------------
michael@0: #if defined(XP_MACOSX) || defined(ANDROID)
michael@0: 
michael@0: #include "nsAString.h"
michael@0: #include "nsReadableUtils.h"
michael@0: #include "nsString.h"
michael@0: 
michael@0: nsresult
michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
michael@0: {
michael@0:     CopyUTF8toUTF16(input, output);
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: nsresult
michael@0: NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
michael@0: {
michael@0:     CopyUTF16toUTF8(input, output);
michael@0:     return NS_OK;
michael@0: }
michael@0: 
// Intentionally empty in this platform branch: the conversion routines
// defined alongside it keep no global state that would need setting up.
void
NS_StartupNativeCharsetUtils()
{
    // nothing to initialize
}
michael@0: 
// Intentionally empty in this platform branch: the conversion routines
// defined alongside it keep no global state that would need tearing down.
void
NS_ShutdownNativeCharsetUtils()
{
    // nothing to release
}
michael@0: 
michael@0: 
michael@0: //-----------------------------------------------------------------------------
michael@0: // XP_UNIX
michael@0: //-----------------------------------------------------------------------------
michael@0: #elif defined(XP_UNIX)
michael@0: 
michael@0: #include <stdlib.h>   // mbtowc, wctomb
michael@0: #include <locale.h>   // setlocale
michael@0: #include "mozilla/Mutex.h"
michael@0: #include "nscore.h"
michael@0: #include "nsAString.h"
michael@0: #include "nsReadableUtils.h"
michael@0: 
michael@0: using namespace mozilla;
michael@0: 
michael@0: //
michael@0: // choose a conversion library.  we used to use mbrtowc/wcrtomb under Linux,
michael@0: // but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
michael@0: // or not (see bug 206811 and 
michael@0: // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
// iconv on all platforms where nl_types.h and langinfo.h are present
// along with iconv.
michael@0: //
michael@0: #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
michael@0: #define USE_ICONV 1
michael@0: #else
michael@0: #define USE_STDCONV 1
michael@0: #endif
michael@0: 
// Widens bytes to UTF-16 code units one-for-one (each byte is read as an
// unsigned value, i.e. treated as ISO-8859-1).  Stops as soon as either the
// input or the output is exhausted; both cursors and both remaining counts
// are updated in place.
static void
isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft)
{
    for (; *inputLeft != 0 && *outputLeft != 0; --*inputLeft, --*outputLeft) {
        const unsigned char byte = (unsigned char) **input;
        **output = byte;
        ++*input;
        ++*output;
    }
}
michael@0: 
// Narrows UTF-16 code units to single bytes one-for-one by keeping only the
// low 8 bits of each unit (lossy for anything above U+00FF).  Stops as soon
// as either the input or the output is exhausted; both cursors and both
// remaining counts are updated in place.
static void
utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft)
{
    for (; *inputLeft != 0 && *outputLeft != 0; --*inputLeft, --*outputLeft) {
        const unsigned char low = (unsigned char) **input;
        **output = low;
        ++*input;
        ++*output;
    }
}
michael@0: 
michael@0: //-----------------------------------------------------------------------------
michael@0: // conversion using iconv
michael@0: //-----------------------------------------------------------------------------
michael@0: #if defined(USE_ICONV)
michael@0: #include <nl_types.h> // CODESET
michael@0: #include <langinfo.h> // nl_langinfo
michael@0: #include <iconv.h>    // iconv_open, iconv, iconv_close
michael@0: #include <errno.h>
michael@0: #include "plstr.h"
michael@0: 
// Adapts a |const char **| input argument to the type expected by the local
// iconv() prototype: passed through unchanged when
// HAVE_ICONV_WITH_CONST_INPUT is defined, cast to |char **| otherwise.
// The argument is parenthesized so that the cast applies to the whole
// expression rather than to its first operand only.
#if defined(HAVE_ICONV_WITH_CONST_INPUT)
#define ICONV_INPUT(x) (x)
#else
#define ICONV_INPUT(x) ((char **)(x))
#endif
michael@0: 
michael@0: // solaris definitely needs this, but we'll enable it by default
michael@0: // just in case... but we know for sure that iconv(3) in glibc
michael@0: // doesn't need this.
michael@0: #if !defined(__GLIBC__)
michael@0: #define ENABLE_UTF8_FALLBACK_SUPPORT
michael@0: #endif
michael@0: 
michael@0: #define INVALID_ICONV_T ((iconv_t) -1)
michael@0: 
michael@0: static inline size_t
michael@0: xp_iconv(iconv_t converter,
michael@0:          const char **input,
michael@0:          size_t      *inputLeft,
michael@0:          char       **output,
michael@0:          size_t      *outputLeft)
michael@0: {
michael@0:     size_t res, outputAvail = outputLeft ? *outputLeft : 0;
michael@0:     res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
michael@0:     if (res == (size_t) -1) {
michael@0:         // on some platforms (e.g., linux) iconv will fail with
michael@0:         // E2BIG if it cannot convert _all_ of its input.  it'll
michael@0:         // still adjust all of the in/out params correctly, so we
michael@0:         // can ignore this error.  the assumption is that we will
michael@0:         // be called again to complete the conversion.
michael@0:         if ((errno == E2BIG) && (*outputLeft < outputAvail))
michael@0:             res = 0;
michael@0:     }
michael@0:     return res;
michael@0: }
michael@0: 
michael@0: static inline void
michael@0: xp_iconv_reset(iconv_t converter)
michael@0: {
michael@0:     // NOTE: the man pages on Solaris claim that you can pass nullptr
michael@0:     // for all parameter to reset the converter, but beware the 
michael@0:     // evil Solaris crash if you go down this route >:-)
michael@0:     
michael@0:     const char *zero_char_in_ptr  = nullptr;
michael@0:     char       *zero_char_out_ptr = nullptr;
michael@0:     size_t      zero_size_in      = 0,
michael@0:                 zero_size_out     = 0;
michael@0: 
michael@0:     xp_iconv(converter, &zero_char_in_ptr,
michael@0:                         &zero_size_in,
michael@0:                         &zero_char_out_ptr,
michael@0:                         &zero_size_out);
michael@0: }
michael@0: 
michael@0: static inline iconv_t
michael@0: xp_iconv_open(const char **to_list, const char **from_list)
michael@0: {
michael@0:     iconv_t res;
michael@0:     const char **from_name;
michael@0:     const char **to_name;
michael@0: 
michael@0:     // try all possible combinations to locate a converter.
michael@0:     to_name = to_list;
michael@0:     while (*to_name) {
michael@0:         if (**to_name) {
michael@0:             from_name = from_list;
michael@0:             while (*from_name) {
michael@0:                 if (**from_name) {
michael@0:                     res = iconv_open(*to_name, *from_name);
michael@0:                     if (res != INVALID_ICONV_T)
michael@0:                         return res;
michael@0:                 }
michael@0:                 from_name++;
michael@0:             }
michael@0:         }
michael@0:         to_name++;
michael@0:     }
michael@0: 
michael@0:     return INVALID_ICONV_T;
michael@0: }
michael@0: 
/*
 * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
 * have to use UTF-16 with iconv(3) on platforms where it's supported.
 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
 * and implementations of iconv(3). On Tru64, it also depends on the environment
 * variable. To avoid the trouble arising from byte-swapping
 * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
 * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
 * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
 * can be done other than adding a note in the release notes. (bug 206811)
 *
 * The list is nullptr-terminated and ordered by preference: xp_iconv_open
 * walks it front to back and stops at the first name iconv_open accepts.
 */
static const char *UTF_16_NAMES[] = {
#if defined(IS_LITTLE_ENDIAN)
    "UTF-16LE",
#if defined(__GLIBC__)
    "UNICODELITTLE",
#endif
    "UCS-2LE",
#else
    "UTF-16BE",
#if defined(__GLIBC__)
    "UNICODEBIG",
#endif
    "UCS-2BE",
#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
    "UCS_2",
    "ucs-2",
    "ucs2",
    "ucs_2",
    nullptr
};
michael@0: 
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// Spellings of "UTF-8" to try with iconv_open when setting up the two-step
// (native <-> UTF-8 <-> UTF-16) fallback converters.  nullptr-terminated;
// only compiled when the UTF-8 fallback is enabled.
static const char *UTF_8_NAMES[] = {
    "UTF-8",
    "UTF8",
    "UTF_8",
    "utf-8",
    "utf8",
    "utf_8",
    nullptr
};
#endif
michael@0: 
// Spellings of "ISO-8859-1" to try with iconv_open; used as the native
// charset list when nl_langinfo(CODESET) yields no charset name.
// nullptr-terminated.  With glibc only the canonical spelling is listed.
static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
#if !defined(__GLIBC__)
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
    "ISO8859_1",
    "iso-8859-1",
    "iso8859-1",
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
#endif
    nullptr
};
michael@0: 
// iconv-based converter between the locale's native charset and UTF-16.
//
// All converter handles are class-wide statics.  Constructing an instance
// takes gLock (if it has been created) and runs LazyInit() the first time;
// destroying it resets every open converter and releases the lock, so an
// instance is meant to live only for the duration of one conversion.
class nsNativeCharsetConverter
{
public:
    nsNativeCharsetConverter();
   ~nsNativeCharsetConverter();

    // Both routines advance *input / *output past what was converted and
    // store the remaining counts in *inputLeft / *outputLeft.  Native
    // lengths are in bytes, UTF-16 lengths in char16_t units.
    nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,
                             char16_t       **output, uint32_t *outputLeft);
    nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
                             char            **output, uint32_t *outputLeft);

    // GlobalInit creates the lock; GlobalShutdown destroys it and closes
    // every open converter.
    static void GlobalInit();
    static void GlobalShutdown();
    // True when nl_langinfo(CODESET) reported "UTF-8" (case-insensitive).
    static bool IsNativeUTF8();

private:
    // direct native <-> UTF-16 converters (INVALID_ICONV_T if unavailable)
    static iconv_t gNativeToUnicode;
    static iconv_t gUnicodeToNative;
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
    // two-step converters, opened only when the matching direct converter
    // could not be opened
    static iconv_t gNativeToUTF8;
    static iconv_t gUTF8ToNative;
    static iconv_t gUnicodeToUTF8;
    static iconv_t gUTF8ToUnicode;
#endif
    static Mutex  *gLock;          // null until GlobalInit / after GlobalShutdown
    static bool    gInitialized;   // set at the end of LazyInit
    static bool    gIsNativeUTF8;  // set by LazyInit

    static void LazyInit();

    // locking is a no-op while gLock does not exist
    static void Lock()   { if (gLock) gLock->Lock();   }
    static void Unlock() { if (gLock) gLock->Unlock(); }
};

// static member definitions: every converter starts out closed
iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
iconv_t nsNativeCharsetConverter::gNativeToUTF8    = INVALID_ICONV_T;
iconv_t nsNativeCharsetConverter::gUTF8ToNative    = INVALID_ICONV_T;
iconv_t nsNativeCharsetConverter::gUnicodeToUTF8   = INVALID_ICONV_T;
iconv_t nsNativeCharsetConverter::gUTF8ToUnicode   = INVALID_ICONV_T;
#endif
Mutex  *nsNativeCharsetConverter::gLock            = nullptr;
bool    nsNativeCharsetConverter::gInitialized     = false;
bool    nsNativeCharsetConverter::gIsNativeUTF8    = false;
michael@0: 
// One-time setup of the static converters.
//
// Determines the native charset via nl_langinfo(CODESET), records whether
// it is UTF-8, opens the direct native <-> UTF-16 converters and, when
// ENABLE_UTF8_FALLBACK_SUPPORT is defined and a direct converter could not
// be opened, the two-step converters through UTF-8.  Finally runs a dummy
// conversion through the to-UTF-16 converters and sets gInitialized.
void
nsNativeCharsetConverter::LazyInit()
{
    // LazyInit may be called before NS_StartupNativeCharsetUtils, but
    // the setlocale it does has to be called before nl_langinfo. Like in
    // NS_StartupNativeCharsetUtils, assume we are called early enough that
    // we are the first to care about the locale's charset.
    if (!gLock)
      setlocale(LC_CTYPE, "");
    // single-entry list whose first slot is overwritten with the charset
    // name reported by nl_langinfo
    const char  *blank_list[] = { "", nullptr };
    const char **native_charset_list = blank_list;
    const char  *native_charset = nl_langinfo(CODESET);
    if (native_charset == nullptr) {
        NS_ERROR("native charset is unknown");
        // fallback to ISO-8859-1
        native_charset_list = ISO_8859_1_NAMES;
    }
    else
        native_charset_list[0] = native_charset;

    // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET)
    // return 'UTF-8' (or 'utf-8')
    // NOTE(review): native_charset may be null here; presumably
    // PL_strcasecmp tolerates a null argument -- confirm.
    if (!PL_strcasecmp(native_charset, "UTF-8"))
        gIsNativeUTF8 = true;

    gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
    gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);

#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
    // no direct converter: go through UTF-8 in two steps instead
    if (gNativeToUnicode == INVALID_ICONV_T) {
        gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
        gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
        NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
        NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
    }
    if (gUnicodeToNative == INVALID_ICONV_T) {
        gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
        gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
        NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
        NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
    }
#else
    NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
    NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
#endif

    /*
     * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
     * prepend a byte order mark unicode character (BOM, u+FEFF) during
     * the first use of the iconv converter. The same is the case of
     * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
     * However, we use 'UTF-16LE/BE' in both cases, instead so that we
     * should be safe. But just in case...
     *
     * This dummy conversion gets rid of the BOMs and fixes bug 153562.
     */
    char dummy_input[1] = { ' ' };
    // 4 bytes: two for a possible BOM plus two for the converted space
    char dummy_output[4];

    if (gNativeToUnicode != INVALID_ICONV_T) {
	const char *input = dummy_input;
	size_t input_left = sizeof(dummy_input);
	char *output = dummy_output;
	size_t output_left = sizeof(dummy_output);

	xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
    }
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
    if (gUTF8ToUnicode != INVALID_ICONV_T) {
	const char *input = dummy_input;
	size_t input_left = sizeof(dummy_input);
	char *output = dummy_output;
	size_t output_left = sizeof(dummy_output);

	xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
    }
#endif

    gInitialized = true;
}
michael@0: 
michael@0: void
michael@0: nsNativeCharsetConverter::GlobalInit()
michael@0: {
michael@0:     gLock = new Mutex("nsNativeCharsetConverter.gLock");
michael@0: }
michael@0: 
michael@0: void
michael@0: nsNativeCharsetConverter::GlobalShutdown()
michael@0: {
michael@0:     if (gLock) {
michael@0:         delete gLock;
michael@0:         gLock = nullptr;
michael@0:     }
michael@0: 
michael@0:     if (gNativeToUnicode != INVALID_ICONV_T) {
michael@0:         iconv_close(gNativeToUnicode);
michael@0:         gNativeToUnicode = INVALID_ICONV_T;
michael@0:     }
michael@0: 
michael@0:     if (gUnicodeToNative != INVALID_ICONV_T) {
michael@0:         iconv_close(gUnicodeToNative);
michael@0:         gUnicodeToNative = INVALID_ICONV_T;
michael@0:     }
michael@0: 
michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0:     if (gNativeToUTF8 != INVALID_ICONV_T) {
michael@0:         iconv_close(gNativeToUTF8);
michael@0:         gNativeToUTF8 = INVALID_ICONV_T;
michael@0:     }
michael@0:     if (gUTF8ToNative != INVALID_ICONV_T) {
michael@0:         iconv_close(gUTF8ToNative);
michael@0:         gUTF8ToNative = INVALID_ICONV_T;
michael@0:     }
michael@0:     if (gUnicodeToUTF8 != INVALID_ICONV_T) {
michael@0:         iconv_close(gUnicodeToUTF8);
michael@0:         gUnicodeToUTF8 = INVALID_ICONV_T;
michael@0:     }
michael@0:     if (gUTF8ToUnicode != INVALID_ICONV_T) {
michael@0:         iconv_close(gUTF8ToUnicode);
michael@0:         gUTF8ToUnicode = INVALID_ICONV_T;
michael@0:     }
michael@0: #endif
michael@0: 
michael@0:     gInitialized = false;
michael@0: }
michael@0: 
michael@0: nsNativeCharsetConverter::nsNativeCharsetConverter()
michael@0: {
michael@0:     Lock();
michael@0:     if (!gInitialized)
michael@0:         LazyInit();
michael@0: }
michael@0: 
michael@0: nsNativeCharsetConverter::~nsNativeCharsetConverter()
michael@0: {
michael@0:     // reset converters for next time
michael@0:     if (gNativeToUnicode != INVALID_ICONV_T)
michael@0:         xp_iconv_reset(gNativeToUnicode);
michael@0:     if (gUnicodeToNative != INVALID_ICONV_T)
michael@0:         xp_iconv_reset(gUnicodeToNative);
michael@0: #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
michael@0:     if (gNativeToUTF8 != INVALID_ICONV_T)
michael@0:         xp_iconv_reset(gNativeToUTF8);
michael@0:     if (gUTF8ToNative != INVALID_ICONV_T)
michael@0:         xp_iconv_reset(gUTF8ToNative);
michael@0:     if (gUnicodeToUTF8 != INVALID_ICONV_T)
michael@0:         xp_iconv_reset(gUnicodeToUTF8);
michael@0:     if (gUTF8ToUnicode != INVALID_ICONV_T)
michael@0:         xp_iconv_reset(gUTF8ToUnicode);
michael@0: #endif
michael@0:     Unlock();
michael@0: }
michael@0: 
// Converts native-charset bytes to UTF-16.
//
// |*inputLeft| counts bytes, |*outputLeft| counts char16_t units (it is
// doubled to get the byte count iconv works with).  On return the cursors
// and counts reflect what was consumed and produced.
//
// Strategy, in order:
//   1. the direct native -> UTF-16 converter, if it is open;
//   2. otherwise (with ENABLE_UTF8_FALLBACK_SUPPORT) native -> UTF-8 ->
//      UTF-16 through a 1024-byte scratch buffer;
//   3. if neither is available, or the chosen one failed, whatever input
//      remains is widened byte-for-byte by isolatin1_to_utf16.
// Always returns NS_OK.
nsresult
nsNativeCharsetConverter::NativeToUnicode(const char **input,
                                          uint32_t    *inputLeft,
                                          char16_t  **output,
                                          uint32_t    *outputLeft)
{
    size_t res = 0;
    size_t inLeft = (size_t) *inputLeft;
    // iconv counts bytes; each char16_t is two of them
    size_t outLeft = (size_t) *outputLeft * 2;

    if (gNativeToUnicode != INVALID_ICONV_T) {

        // advances *input and *output directly
        res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);

        *inputLeft = inLeft;
        *outputLeft = outLeft / 2;
        if (res != (size_t) -1) 
            return NS_OK;

        NS_WARNING("conversion from native to utf-16 failed");

        // reset converter
        xp_iconv_reset(gNativeToUnicode);
    }
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
    else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
             (gUTF8ToUnicode != INVALID_ICONV_T)) {
        // convert first to UTF8, then from UTF8 to UCS2
        // works on a local copy of the input cursor; *input is caught up
        // after the loop from the change in inLeft
        const char *in = *input;

        char ubuf[1024];

        // we assume we're always called with enough space in |output|,
        // so convert many chars at a time...
        // NOTE(review): if |output| fills up mid-chunk, the UTF-8 bytes
        // still pending in ubuf appear to be dropped on the next
        // iteration -- confirm callers always size |output| generously.
        while (inLeft) {
            char *p = ubuf;
            size_t n = sizeof(ubuf);
            res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
            if (res == (size_t) -1) {
                NS_ERROR("conversion from native to utf-8 failed");
                break;
            }
            NS_ASSERTION(outLeft > 0, "bad assumption");
            // rewind to the start of the scratch buffer; n becomes the
            // number of UTF-8 bytes just produced
            p = ubuf;
            n = sizeof(ubuf) - n;
            res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
            if (res == (size_t) -1) {
                NS_ERROR("conversion from utf-8 to utf-16 failed");
                break;
            }
        }

        (*input) += (*inputLeft - inLeft);
        *inputLeft = inLeft;
        *outputLeft = outLeft / 2;

        if (res != (size_t) -1) 
            return NS_OK;

        // reset converters
        xp_iconv_reset(gNativeToUTF8);
        xp_iconv_reset(gUTF8ToUnicode);
    }
#endif

    // fallback: zero-pad and hope for the best
    // XXX This is lame and we have to do better.
    isolatin1_to_utf16(input, inputLeft, output, outputLeft);

    return NS_OK;
}
michael@0: 
// Converts UTF-16 to native-charset bytes.
//
// |*inputLeft| counts char16_t units (doubled to get the byte count iconv
// works with), |*outputLeft| counts bytes.  On return the cursors and
// counts reflect what was consumed and produced.
//
// Strategy, in order:
//   1. the direct UTF-16 -> native converter, if it is open;
//   2. otherwise (with ENABLE_UTF8_FALLBACK_SUPPORT) UTF-16 -> UTF-8 ->
//      native, one char16_t at a time;
//   3. if neither is available, or the chosen one failed, whatever input
//      remains is truncated unit-by-unit by utf16_to_isolatin1.
// Always returns NS_OK.
nsresult
nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
                                          uint32_t         *inputLeft,
                                          char            **output,
                                          uint32_t         *outputLeft)
{
    size_t res = 0;
    // iconv counts bytes; each char16_t is two of them
    size_t inLeft = (size_t) *inputLeft * 2;
    size_t outLeft = (size_t) *outputLeft;

    if (gUnicodeToNative != INVALID_ICONV_T) {
        // advances *input and *output directly
        res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);

        *inputLeft = inLeft / 2;
        *outputLeft = outLeft;
        if (res != (size_t) -1) {
            return NS_OK;
        }

        NS_ERROR("iconv failed");

        // reset converter
        xp_iconv_reset(gUnicodeToNative);
    }
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
    else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
             (gUTF8ToNative != INVALID_ICONV_T)) {
        // works on a local copy of the input cursor; *input is caught up
        // after the loop from the change in inLeft
        const char *in = (const char *) *input;

        char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)

        // convert one uchar at a time...
        // NOTE(review): a surrogate pair is therefore fed to iconv one
        // half at a time; presumably that makes non-BMP characters fail
        // in this path -- confirm.
        while (inLeft && outLeft) {
            char *p = ubuf;
            size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t);
            res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
            if (res == (size_t) -1) {
                NS_ERROR("conversion from utf-16 to utf-8 failed");
                break;
            }
            // rewind to the start of the scratch buffer; n becomes the
            // number of UTF-8 bytes just produced
            p = ubuf;
            n = sizeof(ubuf) - n;
            res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
            if (res == (size_t) -1) {
                if (errno == E2BIG) {
                    // not enough room for last uchar... back up and return.
                    // (inLeft is not decremented on this path, which is
                    // what actually keeps the unit unconsumed for callers)
                    in -= sizeof(char16_t);
                    res = 0;
                }
                else
                    NS_ERROR("conversion from utf-8 to native failed");
                break;
            }
            inLeft -= sizeof(char16_t);
        }

        (*input) += (*inputLeft - inLeft / 2);
        *inputLeft = inLeft / 2;
        *outputLeft = outLeft;
        if (res != (size_t) -1) {
            return NS_OK;
        }

        // reset converters
        xp_iconv_reset(gUnicodeToUTF8);
        xp_iconv_reset(gUTF8ToNative);
    }
#endif

    // fallback: truncate and hope for the best
    // XXX This is lame and we have to do better.
    utf16_to_isolatin1(input, inputLeft, output, outputLeft);

    return NS_OK;
}
michael@0: 
michael@0: bool
michael@0: nsNativeCharsetConverter::IsNativeUTF8()
michael@0: {
michael@0:     if (!gInitialized) {
michael@0:         Lock();
michael@0:         if (!gInitialized)
michael@0:            LazyInit();
michael@0:         Unlock();
michael@0:     }
michael@0:     return gIsNativeUTF8; 
michael@0: }
michael@0: 
michael@0: #endif // USE_ICONV
michael@0: 
michael@0: //-----------------------------------------------------------------------------
michael@0: // conversion using mb[r]towc/wc[r]tomb
michael@0: //-----------------------------------------------------------------------------
michael@0: #if defined(USE_STDCONV)
michael@0: #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
michael@0: #include <wchar.h>    // mbrtowc, wcrtomb
michael@0: #endif
michael@0: 
// Converter between the locale's native charset and UTF-16 built on the C
// library's mb[r]towc / wc[r]tomb.  Used when iconv is not selected.
class nsNativeCharsetConverter
{
public:
    nsNativeCharsetConverter();

    // Both routines advance *input / *output past what was converted and
    // decrement *inputLeft / *outputLeft accordingly.
    nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,
                             char16_t       **output, uint32_t *outputLeft);
    nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
                             char            **output, uint32_t *outputLeft);

    // GlobalInit probes whether wchar_t looks like Unicode; there is
    // nothing to release at shutdown.
    static void GlobalInit();
    static void GlobalShutdown() { }
    static bool IsNativeUTF8();

private:
    // result of the GlobalInit probe; when false, conversions fall back to
    // plain zero-pad / truncate
    static bool gWCharIsUnicode;

#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
    // per-instance conversion state for mbrtowc / wcrtomb
    mbstate_t ps;
#endif
};

bool nsNativeCharsetConverter::gWCharIsUnicode = false;
michael@0: 
michael@0: nsNativeCharsetConverter::nsNativeCharsetConverter()
michael@0: {
michael@0: #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
michael@0:     memset(&ps, 0, sizeof(ps));
michael@0: #endif
michael@0: }
michael@0: 
michael@0: void
michael@0: nsNativeCharsetConverter::GlobalInit()
michael@0: {
michael@0:     // verify that wchar_t for the current locale is actually unicode.
michael@0:     // if it is not, then we should avoid calling mbtowc/wctomb and
michael@0:     // just fallback on zero-pad/truncation conversion.
michael@0:     //
michael@0:     // this test cannot be done at build time because the encoding of
michael@0:     // wchar_t may depend on the runtime locale.  sad, but true!!
michael@0:     //
michael@0:     // so, if wchar_t is unicode then converting an ASCII character
michael@0:     // to wchar_t should not change its numeric value.  we'll just
michael@0:     // check what happens with the ASCII 'a' character.
michael@0:     //
michael@0:     // this test is not perfect... obviously, it could yield false
michael@0:     // positives, but then at least ASCII text would be converted
michael@0:     // properly (or maybe just the 'a' character) -- oh well :(
michael@0: 
michael@0:     char a = 'a';
michael@0:     unsigned int w = 0;
michael@0: 
michael@0:     int res = mbtowc((wchar_t *) &w, &a, 1);
michael@0: 
michael@0:     gWCharIsUnicode = (res != -1 && w == 'a');
michael@0: 
michael@0: #ifdef DEBUG
michael@0:     if (!gWCharIsUnicode)
michael@0:         NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
michael@0: #endif
michael@0: }
michael@0: 
michael@0: nsresult
michael@0: nsNativeCharsetConverter::NativeToUnicode(const char **input,
michael@0:                                           uint32_t    *inputLeft,
michael@0:                                           char16_t  **output,
michael@0:                                           uint32_t    *outputLeft)
michael@0: {
michael@0:     if (gWCharIsUnicode) {
michael@0:         int incr;
michael@0: 
michael@0:         // cannot use wchar_t here since it may have been redefined (e.g.,
michael@0:         // via -fshort-wchar).  hopefully, sizeof(tmp) is sufficient XP.
michael@0:         unsigned int tmp = 0;
michael@0:         while (*inputLeft && *outputLeft) {
michael@0: #ifdef HAVE_MBRTOWC
michael@0:             incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
michael@0: #else
michael@0:             // XXX is this thread-safe?
michael@0:             incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
michael@0: #endif
michael@0:             if (incr < 0) {
michael@0:                 NS_WARNING("mbtowc failed: possible charset mismatch");
michael@0:                 // zero-pad and hope for the best
michael@0:                 tmp = (unsigned char) **input;
michael@0:                 incr = 1;
michael@0:             }
michael@0:             **output = (char16_t) tmp;
michael@0:             (*input) += incr;
michael@0:             (*inputLeft) -= incr;
michael@0:             (*output)++;
michael@0:             (*outputLeft)--;
michael@0:         }
michael@0:     }
michael@0:     else {
michael@0:         // wchar_t isn't unicode, so the best we can do is treat the
michael@0:         // input as if it is isolatin1 :(
michael@0:         isolatin1_to_utf16(input, inputLeft, output, outputLeft);
michael@0:     }
michael@0: 
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: nsresult
michael@0: nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
michael@0:                                           uint32_t         *inputLeft,
michael@0:                                           char            **output,
michael@0:                                           uint32_t         *outputLeft)
michael@0: {
michael@0:     if (gWCharIsUnicode) {
michael@0:         int incr;
michael@0: 
michael@0:         while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
michael@0: #ifdef HAVE_WCRTOMB
michael@0:             incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
michael@0: #else
michael@0:             // XXX is this thread-safe?
michael@0:             incr = (int) wctomb(*output, (wchar_t) **input);
michael@0: #endif
michael@0:             if (incr < 0) {
michael@0:                 NS_WARNING("mbtowc failed: possible charset mismatch");
michael@0:                 **output = (unsigned char) **input; // truncate
michael@0:                 incr = 1;
michael@0:             }
michael@0:             // most likely we're dead anyways if this assertion should fire
michael@0:             NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string");
michael@0:             (*output) += incr;
michael@0:             (*outputLeft) -= incr;
michael@0:             (*input)++;
michael@0:             (*inputLeft)--;
michael@0:         }
michael@0:     }
michael@0:     else {
michael@0:         // wchar_t isn't unicode, so the best we can do is treat the
michael@0:         // input as if it is isolatin1 :(
michael@0:         utf16_to_isolatin1(input, inputLeft, output, outputLeft);
michael@0:     }
michael@0: 
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: // XXX: detecting a UTF-8 native charset is not implemented for this
michael@0: // converter yet, so it unconditionally answers "not UTF-8".
michael@0: bool
michael@0: nsNativeCharsetConverter::IsNativeUTF8()
michael@0: {
michael@0:     const bool nativeIsUTF8 = false;
michael@0:     return nativeIsUTF8;
michael@0: }
michael@0: 
michael@0: #endif // USE_STDCONV
michael@0: 
michael@0: //-----------------------------------------------------------------------------
michael@0: // API implementation
michael@0: //-----------------------------------------------------------------------------
michael@0: 
michael@0: // Converts |input| from the native charset to UTF-16 and stores the result
michael@0: // in |output|, replacing its previous contents.
michael@0: //
michael@0: // Returns NS_ERROR_OUT_OF_MEMORY if the result buffer cannot be allocated,
michael@0: // otherwise whatever nsNativeCharsetConverter::NativeToUnicode returns.  On
michael@0: // any failure |output| is left empty.
michael@0: nsresult
michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
michael@0: {
michael@0:     output.Truncate();
michael@0: 
michael@0:     uint32_t inputLen = input.Length();
michael@0: 
michael@0:     nsACString::const_iterator iter;
michael@0:     input.BeginReading(iter);
michael@0: 
michael@0:     //
michael@0:     // OPTIMIZATION: preallocate space for largest possible result; convert
michael@0:     // directly into the result buffer to avoid intermediate buffer copy.
michael@0:     //
michael@0:     // this will generally result in a larger allocation, but that seems
michael@0:     // better than an extra buffer copy.
michael@0:     //
michael@0:     if (!output.SetLength(inputLen, fallible_t()))
michael@0:         return NS_ERROR_OUT_OF_MEMORY;
michael@0:     nsAString::iterator out_iter;
michael@0:     output.BeginWriting(out_iter);
michael@0: 
michael@0:     char16_t *result = out_iter.get();
michael@0:     uint32_t resultLeft = inputLen;
michael@0: 
michael@0:     const char *buf = iter.get();
michael@0:     uint32_t bufLeft = inputLen;
michael@0: 
michael@0:     nsNativeCharsetConverter conv;
michael@0:     nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
michael@0:     if (NS_FAILED(rv)) {
michael@0:         // do not hand the preallocated, not (fully) converted buffer back
michael@0:         // to the caller as if it were a result.
michael@0:         output.Truncate();
michael@0:         return rv;
michael@0:     }
michael@0: 
michael@0:     NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
michael@0:     // shrink to the number of code units actually written
michael@0:     output.SetLength(inputLen - resultLeft);
michael@0:     return rv;
michael@0: }
michael@0: 
michael@0: nsresult
michael@0: NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
michael@0: {
michael@0:     output.Truncate();
michael@0: 
michael@0:     nsAString::const_iterator iter, end;
michael@0:     input.BeginReading(iter);
michael@0:     input.EndReading(end);
michael@0: 
michael@0:     // cannot easily avoid intermediate buffer copy.
michael@0:     char temp[4096];
michael@0: 
michael@0:     nsNativeCharsetConverter conv;
michael@0: 
michael@0:     const char16_t *buf = iter.get();
michael@0:     uint32_t bufLeft = Distance(iter, end);
michael@0:     while (bufLeft) {
michael@0:         char *p = temp;
michael@0:         uint32_t tempLeft = sizeof(temp);
michael@0: 
michael@0:         nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
michael@0:         if (NS_FAILED(rv)) return rv;
michael@0: 
michael@0:         if (tempLeft < sizeof(temp))
michael@0:             output.Append(temp, sizeof(temp) - tempLeft);
michael@0:     }
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: // Reports whether the native charset is UTF-8, as answered by
michael@0: // nsNativeCharsetConverter::IsNativeUTF8().
michael@0: bool
michael@0: NS_IsNativeUTF8()
michael@0: {
michael@0:     const bool nativeIsUTF8 = nsNativeCharsetConverter::IsNativeUTF8();
michael@0:     return nativeIsUTF8;
michael@0: }
michael@0: 
michael@0: // Initializes native charset conversion: selects the LC_CTYPE locale from
michael@0: // the environment and then runs the converter's global initialization.
michael@0: void
michael@0: NS_StartupNativeCharsetUtils()
michael@0: {
michael@0:     //
michael@0:     // need to initialize the locale or else charset conversion will fail.
michael@0:     // better not delay this in case some other component alters the locale
michael@0:     // settings.
michael@0:     //
michael@0:     // XXX we assume that we are called early enough that we should
michael@0:     // always be the first to care about the locale's charset.
michael@0:     //
michael@0:     if (!setlocale(LC_CTYPE, "")) {
michael@0:         // setlocale returns null and leaves the locale unchanged when the
michael@0:         // environment names a locale it cannot honor; keep going, but do
michael@0:         // not let that pass unnoticed.
michael@0:         NS_WARNING("setlocale(LC_CTYPE, \"\") failed: native charset conversion may be wrong");
michael@0:     }
michael@0: 
michael@0:     nsNativeCharsetConverter::GlobalInit();
michael@0: }
michael@0: 
michael@0: // Shutdown counterpart of NS_StartupNativeCharsetUtils(): forwards to the
michael@0: // converter's global shutdown hook.
michael@0: void
michael@0: NS_ShutdownNativeCharsetUtils()
michael@0: {
michael@0:     nsNativeCharsetConverter::GlobalShutdown();
michael@0: }
michael@0: 
michael@0: //-----------------------------------------------------------------------------
michael@0: // XP_WIN
michael@0: //-----------------------------------------------------------------------------
michael@0: #elif defined(XP_WIN)
michael@0: 
michael@0: #include <windows.h>
michael@0: #include "nsString.h"
michael@0: #include "nsAString.h"
michael@0: #include "nsReadableUtils.h"
michael@0: 
michael@0: using namespace mozilla;
michael@0: 
michael@0: // Converts |input| from the ANSI code page (CP_ACP) to UTF-16 and stores
michael@0: // the result in |output|.  If the sizing call reports no characters,
michael@0: // |output| ends up empty and NS_OK is still returned.  Returns
michael@0: // NS_ERROR_OUT_OF_MEMORY if |output| cannot be resized.
michael@0: nsresult
michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
michael@0: {
michael@0:     nsACString::const_iterator begin;
michael@0:     input.BeginReading(begin);
michael@0: 
michael@0:     const char *src = begin.get();
michael@0:     const uint32_t srcLen = input.Length();
michael@0: 
michael@0:     // first pass: ask how many UTF-16 code units the result needs
michael@0:     const int needed = ::MultiByteToWideChar(CP_ACP, 0, src, srcLen, nullptr, 0);
michael@0:     const uint32_t resultLen = needed > 0 ? uint32_t(needed) : 0;
michael@0: 
michael@0:     if (!output.SetLength(resultLen, fallible_t()))
michael@0:         return NS_ERROR_OUT_OF_MEMORY;
michael@0: 
michael@0:     if (resultLen == 0)
michael@0:         return NS_OK;
michael@0: 
michael@0:     // second pass: convert straight into the string's buffer
michael@0:     nsAString::iterator writer;
michael@0:     output.BeginWriting(writer);
michael@0:     char16_t *dst = writer.get();
michael@0: 
michael@0:     ::MultiByteToWideChar(CP_ACP, 0, src, srcLen, wwc(dst), resultLen);
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: // Converts the UTF-16 string |input| to the ANSI code page (CP_ACP) and
michael@0: // stores the result in |output|.  If the sizing call reports no bytes,
michael@0: // |output| ends up empty and NS_OK is still returned.  Returns
michael@0: // NS_ERROR_OUT_OF_MEMORY if |output| cannot be resized.
michael@0: nsresult
michael@0: NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
michael@0: {
michael@0:     nsAString::const_iterator begin;
michael@0:     input.BeginReading(begin);
michael@0: 
michael@0:     char16ptr_t src = begin.get();
michael@0:     const uint32_t srcLen = input.Length();
michael@0: 
michael@0:     // first pass: ask how many bytes the result needs
michael@0:     const int needed = ::WideCharToMultiByte(CP_ACP, 0, src, srcLen, nullptr, 0,
michael@0:                                              nullptr, nullptr);
michael@0:     const uint32_t resultLen = needed > 0 ? uint32_t(needed) : 0;
michael@0: 
michael@0:     if (!output.SetLength(resultLen, fallible_t()))
michael@0:         return NS_ERROR_OUT_OF_MEMORY;
michael@0: 
michael@0:     if (resultLen == 0)
michael@0:         return NS_OK;
michael@0: 
michael@0:     // second pass: convert straight into the string's buffer
michael@0:     nsACString::iterator writer;
michael@0:     output.BeginWriting(writer);
michael@0:     char *dst = writer.get();
michael@0: 
michael@0:     // the default replacement character is '?', which is illegal in
michael@0:     // windows file names and would make the file uncreatable; substitute
michael@0:     // '_' for unconvertible characters instead.
michael@0:     const char fallbackChar = '_';
michael@0: 
michael@0:     ::WideCharToMultiByte(CP_ACP, 0, src, srcLen, dst, resultLen,
michael@0:                           &fallbackChar, nullptr);
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: // moved from widget/windows/nsToolkit.cpp
michael@0: // Thin wrapper around MultiByteToWideChar for the ANSI code page (CP_ACP).
michael@0: // The input length is passed as -1 and |aBufferSize| as the capacity of
michael@0: // |aStrOutW|; the API's return value is handed back unchanged.
michael@0: int32_t 
michael@0: NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW)
michael@0: {
michael@0:     const int converted = MultiByteToWideChar(CP_ACP, 0, aStrInA, -1,
michael@0:                                               wwc(aStrOutW), aBufferSize);
michael@0:     return converted;
michael@0: }
michael@0: 
michael@0: int32_t 
michael@0: NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut,
michael@0:                char *aStrOutA, const char *aDefault)
michael@0: {
michael@0:     if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))
michael@0:         return 0;
michael@0: 
michael@0:     int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1,
michael@0:                                                 aStrOutA, aBufferSizeOut,
michael@0:                                                 aDefault, nullptr);
michael@0: 
michael@0:     if (!numCharsConverted) {
michael@0:         if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
michael@0:             // Overflow, add missing null termination but return 0
michael@0:             aStrOutA[aBufferSizeOut-1] = '\0';
michael@0:         }
michael@0:         else {
michael@0:             // Other error, clear string and return 0
michael@0:             aStrOutA[0] = '\0';
michael@0:         }
michael@0:     }
michael@0:     else if (numCharsConverted < aBufferSizeOut) {
michael@0:         // Add 2nd null (really necessary?)
michael@0:         aStrOutA[numCharsConverted] = '\0';
michael@0:     }
michael@0: 
michael@0:     return numCharsConverted;
michael@0: }
michael@0: 
michael@0: #else
michael@0: 
michael@0: #include "nsReadableUtils.h"
michael@0: 
michael@0: // Fallback for platforms not matched by any branch above: widens |input|
michael@0: // into |output| with CopyASCIItoUTF16.  Always returns NS_OK.
michael@0: nsresult
michael@0: NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
michael@0: {
michael@0:     CopyASCIItoUTF16(input, output);
michael@0: 
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: // Fallback for platforms not matched by any branch above: narrows |input|
michael@0: // into |output| with LossyCopyUTF16toASCII.  Always returns NS_OK.
michael@0: nsresult
michael@0: NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
michael@0: {
michael@0:     LossyCopyUTF16toASCII(input, output);
michael@0: 
michael@0:     return NS_OK;
michael@0: }
michael@0: 
michael@0: // Startup hook for the fallback branch.
michael@0: void
michael@0: NS_StartupNativeCharsetUtils()
michael@0: {
michael@0:     // intentionally empty: this branch has nothing to initialize
michael@0: }
michael@0: 
michael@0: // Shutdown hook for the fallback branch.
michael@0: void
michael@0: NS_ShutdownNativeCharsetUtils()
michael@0: {
michael@0:     // intentionally empty: this branch has nothing to tear down
michael@0: }
michael@0: 
michael@0: #endif