The Tor Browser: diff xpcom/io/nsNativeCharsetUtils.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/xpcom/io/nsNativeCharsetUtils.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1011 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +#include "xpcom-private.h"
     1.9 +
    1.10 +//-----------------------------------------------------------------------------
    1.11 +// XP_MACOSX or ANDROID
    1.12 +//-----------------------------------------------------------------------------
    1.13 +#if defined(XP_MACOSX) || defined(ANDROID)
    1.14 +
    1.15 +#include "nsAString.h"
    1.16 +#include "nsReadableUtils.h"
    1.17 +#include "nsString.h"
    1.18 +
    1.19 +nsresult
    1.20 +NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
    1.21 +{
    1.22 +    CopyUTF8toUTF16(input, output);
    1.23 +    return NS_OK;
    1.24 +}
    1.25 +
    1.26 +nsresult
    1.27 +NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
    1.28 +{
    1.29 +    CopyUTF16toUTF8(input, output);
    1.30 +    return NS_OK;
    1.31 +}
    1.32 +
    1.33 +void
    1.34 +NS_StartupNativeCharsetUtils()
    1.35 +{
    1.36 +}
    1.37 +
    1.38 +void
    1.39 +NS_ShutdownNativeCharsetUtils()
    1.40 +{
    1.41 +}
    1.42 +
    1.43 +
    1.44 +//-----------------------------------------------------------------------------
    1.45 +// XP_UNIX
    1.46 +//-----------------------------------------------------------------------------
    1.47 +#elif defined(XP_UNIX)
    1.48 +
    1.49 +#include <stdlib.h>   // mbtowc, wctomb
    1.50 +#include <locale.h>   // setlocale
    1.51 +#include "mozilla/Mutex.h"
    1.52 +#include "nscore.h"
    1.53 +#include "nsAString.h"
    1.54 +#include "nsReadableUtils.h"
    1.55 +
    1.56 +using namespace mozilla;
    1.57 +
    1.58 +//
    1.59 +// choose a conversion library.  we used to use mbrtowc/wcrtomb under Linux,
    1.60 +// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
    1.61 +// or not (see bug 206811 and 
    1.62 +// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
     1.63 +// iconv for all platforms where nl_types.h and langinfo.h are present
    1.64 +// along with iconv.
    1.65 +//
    1.66 +#if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
    1.67 +#define USE_ICONV 1
    1.68 +#else
    1.69 +#define USE_STDCONV 1
    1.70 +#endif
    1.71 +
    1.72 +static void
    1.73 +isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft)
    1.74 +{
    1.75 +    while (*inputLeft && *outputLeft) {
    1.76 +        **output = (unsigned char) **input;
    1.77 +        (*input)++;
    1.78 +        (*inputLeft)--;
    1.79 +        (*output)++;
    1.80 +        (*outputLeft)--;
    1.81 +    }
    1.82 +}
    1.83 +
    1.84 +static void
    1.85 +utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft)
    1.86 +{
    1.87 +    while (*inputLeft && *outputLeft) {
    1.88 +        **output = (unsigned char) **input;
    1.89 +        (*input)++;
    1.90 +        (*inputLeft)--;
    1.91 +        (*output)++;
    1.92 +        (*outputLeft)--;
    1.93 +    }
    1.94 +}
    1.95 +
    1.96 +//-----------------------------------------------------------------------------
    1.97 +// conversion using iconv
    1.98 +//-----------------------------------------------------------------------------
    1.99 +#if defined(USE_ICONV)
   1.100 +#include <nl_types.h> // CODESET
   1.101 +#include <langinfo.h> // nl_langinfo
   1.102 +#include <iconv.h>    // iconv_open, iconv, iconv_close
   1.103 +#include <errno.h>
   1.104 +#include "plstr.h"
   1.105 +
   1.106 +#if defined(HAVE_ICONV_WITH_CONST_INPUT)
   1.107 +#define ICONV_INPUT(x) (x)
   1.108 +#else
   1.109 +#define ICONV_INPUT(x) ((char **)x)
   1.110 +#endif
   1.111 +
   1.112 +// solaris definitely needs this, but we'll enable it by default
   1.113 +// just in case... but we know for sure that iconv(3) in glibc
   1.114 +// doesn't need this.
   1.115 +#if !defined(__GLIBC__)
   1.116 +#define ENABLE_UTF8_FALLBACK_SUPPORT
   1.117 +#endif
   1.118 +
   1.119 +#define INVALID_ICONV_T ((iconv_t) -1)
   1.120 +
   1.121 +static inline size_t
   1.122 +xp_iconv(iconv_t converter,
   1.123 +         const char **input,
   1.124 +         size_t      *inputLeft,
   1.125 +         char       **output,
   1.126 +         size_t      *outputLeft)
   1.127 +{
   1.128 +    size_t res, outputAvail = outputLeft ? *outputLeft : 0;
   1.129 +    res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
   1.130 +    if (res == (size_t) -1) {
   1.131 +        // on some platforms (e.g., linux) iconv will fail with
   1.132 +        // E2BIG if it cannot convert _all_ of its input.  it'll
   1.133 +        // still adjust all of the in/out params correctly, so we
   1.134 +        // can ignore this error.  the assumption is that we will
   1.135 +        // be called again to complete the conversion.
   1.136 +        if ((errno == E2BIG) && (*outputLeft < outputAvail))
   1.137 +            res = 0;
   1.138 +    }
   1.139 +    return res;
   1.140 +}
   1.141 +
   1.142 +static inline void
   1.143 +xp_iconv_reset(iconv_t converter)
   1.144 +{
   1.145 +    // NOTE: the man pages on Solaris claim that you can pass nullptr
   1.146 +    // for all parameter to reset the converter, but beware the 
   1.147 +    // evil Solaris crash if you go down this route >:-)
   1.148 +    
   1.149 +    const char *zero_char_in_ptr  = nullptr;
   1.150 +    char       *zero_char_out_ptr = nullptr;
   1.151 +    size_t      zero_size_in      = 0,
   1.152 +                zero_size_out     = 0;
   1.153 +
   1.154 +    xp_iconv(converter, &zero_char_in_ptr,
   1.155 +                        &zero_size_in,
   1.156 +                        &zero_char_out_ptr,
   1.157 +                        &zero_size_out);
   1.158 +}
   1.159 +
   1.160 +static inline iconv_t
   1.161 +xp_iconv_open(const char **to_list, const char **from_list)
   1.162 +{
   1.163 +    iconv_t res;
   1.164 +    const char **from_name;
   1.165 +    const char **to_name;
   1.166 +
   1.167 +    // try all possible combinations to locate a converter.
   1.168 +    to_name = to_list;
   1.169 +    while (*to_name) {
   1.170 +        if (**to_name) {
   1.171 +            from_name = from_list;
   1.172 +            while (*from_name) {
   1.173 +                if (**from_name) {
   1.174 +                    res = iconv_open(*to_name, *from_name);
   1.175 +                    if (res != INVALID_ICONV_T)
   1.176 +                        return res;
   1.177 +                }
   1.178 +                from_name++;
   1.179 +            }
   1.180 +        }
   1.181 +        to_name++;
   1.182 +    }
   1.183 +
   1.184 +    return INVALID_ICONV_T;
   1.185 +}
   1.186 +
   1.187 +/* 
   1.188 + * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
   1.189 + * have to use UTF-16 with iconv(3) on platforms where it's supported.
   1.190 + * However, the way UTF-16 and UCS-2 are interpreted varies across platforms 
   1.191 + * and implementations of iconv(3). On Tru64, it also depends on the environment
   1.192 + * variable. To avoid the trouble arising from byte-swapping 
   1.193 + * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling 
   1.194 + * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 
   1.195 + * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
   1.196 + * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
   1.197 + * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
   1.198 + * variable ICONV_BYTEORDER is set to 'big-endian', about which not much 
   1.199 + * can be done other than adding a note in the release notes. (bug 206811)
   1.200 + */
   1.201 +static const char *UTF_16_NAMES[] = {
   1.202 +#if defined(IS_LITTLE_ENDIAN)
   1.203 +    "UTF-16LE",
   1.204 +#if defined(__GLIBC__)
   1.205 +    "UNICODELITTLE",
   1.206 +#endif
   1.207 +    "UCS-2LE",
   1.208 +#else
   1.209 +    "UTF-16BE",
   1.210 +#if defined(__GLIBC__)
   1.211 +    "UNICODEBIG",
   1.212 +#endif
   1.213 +    "UCS-2BE",
   1.214 +#endif
   1.215 +    "UTF-16",
   1.216 +    "UCS-2",
   1.217 +    "UCS2",
   1.218 +    "UCS_2",
   1.219 +    "ucs-2",
   1.220 +    "ucs2",
   1.221 +    "ucs_2",
   1.222 +    nullptr
   1.223 +};
   1.224 +
   1.225 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.226 +static const char *UTF_8_NAMES[] = {
   1.227 +    "UTF-8",
   1.228 +    "UTF8",
   1.229 +    "UTF_8",
   1.230 +    "utf-8",
   1.231 +    "utf8",
   1.232 +    "utf_8",
   1.233 +    nullptr
   1.234 +};
   1.235 +#endif
   1.236 +
   1.237 +static const char *ISO_8859_1_NAMES[] = {
   1.238 +    "ISO-8859-1",
   1.239 +#if !defined(__GLIBC__)
   1.240 +    "ISO8859-1",
   1.241 +    "ISO88591",
   1.242 +    "ISO_8859_1",
   1.243 +    "ISO8859_1",
   1.244 +    "iso-8859-1",
   1.245 +    "iso8859-1",
   1.246 +    "iso88591",
   1.247 +    "iso_8859_1",
   1.248 +    "iso8859_1",
   1.249 +#endif
   1.250 +    nullptr
   1.251 +};
   1.252 +
   1.253 +class nsNativeCharsetConverter
   1.254 +{
   1.255 +public:
   1.256 +    nsNativeCharsetConverter();
   1.257 +   ~nsNativeCharsetConverter();
   1.258 +
   1.259 +    nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,
   1.260 +                             char16_t       **output, uint32_t *outputLeft);
   1.261 +    nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
   1.262 +                             char            **output, uint32_t *outputLeft);
   1.263 +
   1.264 +    static void GlobalInit();
   1.265 +    static void GlobalShutdown();
   1.266 +    static bool IsNativeUTF8();
   1.267 +
   1.268 +private:
   1.269 +    static iconv_t gNativeToUnicode;
   1.270 +    static iconv_t gUnicodeToNative;
   1.271 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.272 +    static iconv_t gNativeToUTF8;
   1.273 +    static iconv_t gUTF8ToNative;
   1.274 +    static iconv_t gUnicodeToUTF8;
   1.275 +    static iconv_t gUTF8ToUnicode;
   1.276 +#endif
   1.277 +    static Mutex  *gLock;
   1.278 +    static bool    gInitialized;
   1.279 +    static bool    gIsNativeUTF8;
   1.280 +
   1.281 +    static void LazyInit();
   1.282 +
   1.283 +    static void Lock()   { if (gLock) gLock->Lock();   }
   1.284 +    static void Unlock() { if (gLock) gLock->Unlock(); }
   1.285 +};
   1.286 +
   1.287 +iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
   1.288 +iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
   1.289 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.290 +iconv_t nsNativeCharsetConverter::gNativeToUTF8    = INVALID_ICONV_T;
   1.291 +iconv_t nsNativeCharsetConverter::gUTF8ToNative    = INVALID_ICONV_T;
   1.292 +iconv_t nsNativeCharsetConverter::gUnicodeToUTF8   = INVALID_ICONV_T;
   1.293 +iconv_t nsNativeCharsetConverter::gUTF8ToUnicode   = INVALID_ICONV_T;
   1.294 +#endif
   1.295 +Mutex  *nsNativeCharsetConverter::gLock            = nullptr;
   1.296 +bool    nsNativeCharsetConverter::gInitialized     = false;
   1.297 +bool    nsNativeCharsetConverter::gIsNativeUTF8    = false;
   1.298 +
   1.299 +void
   1.300 +nsNativeCharsetConverter::LazyInit()
   1.301 +{
   1.302 +    // LazyInit may be called before NS_StartupNativeCharsetUtils, but
   1.303 +    // the setlocale it does has to be called before nl_langinfo. Like in
   1.304 +    // NS_StartupNativeCharsetUtils, assume we are called early enough that
   1.305 +    // we are the first to care about the locale's charset.
   1.306 +    if (!gLock)
   1.307 +      setlocale(LC_CTYPE, "");
   1.308 +    const char  *blank_list[] = { "", nullptr };
   1.309 +    const char **native_charset_list = blank_list;
   1.310 +    const char  *native_charset = nl_langinfo(CODESET);
   1.311 +    if (native_charset == nullptr) {
   1.312 +        NS_ERROR("native charset is unknown");
   1.313 +        // fallback to ISO-8859-1
   1.314 +        native_charset_list = ISO_8859_1_NAMES;
   1.315 +    }
   1.316 +    else
   1.317 +        native_charset_list[0] = native_charset;
   1.318 +
   1.319 +    // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET) 
   1.320 +    // return 'UTF-8' (or 'utf-8')
   1.321 +    if (!PL_strcasecmp(native_charset, "UTF-8"))
   1.322 +        gIsNativeUTF8 = true;
   1.323 +
   1.324 +    gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
   1.325 +    gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
   1.326 +
   1.327 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.328 +    if (gNativeToUnicode == INVALID_ICONV_T) {
   1.329 +        gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
   1.330 +        gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
   1.331 +        NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
   1.332 +        NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
   1.333 +    }
   1.334 +    if (gUnicodeToNative == INVALID_ICONV_T) {
   1.335 +        gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
   1.336 +        gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
   1.337 +        NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
   1.338 +        NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
   1.339 +    }
   1.340 +#else
   1.341 +    NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
   1.342 +    NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
   1.343 +#endif
   1.344 +
   1.345 +    /*
   1.346 +     * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
   1.347 +     * prepend a byte order mark unicode character (BOM, u+FEFF) during
   1.348 +     * the first use of the iconv converter. The same is the case of 
   1.349 +     * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used. 
   1.350 +     * However, we use 'UTF-16LE/BE' in both cases, instead so that we 
   1.351 +     * should be safe. But just in case...
   1.352 +     *
   1.353 +     * This dummy conversion gets rid of the BOMs and fixes bug 153562.
   1.354 +     */
   1.355 +    char dummy_input[1] = { ' ' };
   1.356 +    char dummy_output[4];
   1.357 +
   1.358 +    if (gNativeToUnicode != INVALID_ICONV_T) {
   1.359 +	const char *input = dummy_input;
   1.360 +	size_t input_left = sizeof(dummy_input);
   1.361 +	char *output = dummy_output;
   1.362 +	size_t output_left = sizeof(dummy_output);
   1.363 +
   1.364 +	xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
   1.365 +    }
   1.366 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.367 +    if (gUTF8ToUnicode != INVALID_ICONV_T) {
   1.368 +	const char *input = dummy_input;
   1.369 +	size_t input_left = sizeof(dummy_input);
   1.370 +	char *output = dummy_output;
   1.371 +	size_t output_left = sizeof(dummy_output);
   1.372 +
   1.373 +	xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
   1.374 +    }
   1.375 +#endif
   1.376 +
   1.377 +    gInitialized = true;
   1.378 +}
   1.379 +
   1.380 +void
   1.381 +nsNativeCharsetConverter::GlobalInit()
   1.382 +{
   1.383 +    gLock = new Mutex("nsNativeCharsetConverter.gLock");
   1.384 +}
   1.385 +
   1.386 +void
   1.387 +nsNativeCharsetConverter::GlobalShutdown()
   1.388 +{
   1.389 +    if (gLock) {
   1.390 +        delete gLock;
   1.391 +        gLock = nullptr;
   1.392 +    }
   1.393 +
   1.394 +    if (gNativeToUnicode != INVALID_ICONV_T) {
   1.395 +        iconv_close(gNativeToUnicode);
   1.396 +        gNativeToUnicode = INVALID_ICONV_T;
   1.397 +    }
   1.398 +
   1.399 +    if (gUnicodeToNative != INVALID_ICONV_T) {
   1.400 +        iconv_close(gUnicodeToNative);
   1.401 +        gUnicodeToNative = INVALID_ICONV_T;
   1.402 +    }
   1.403 +
   1.404 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.405 +    if (gNativeToUTF8 != INVALID_ICONV_T) {
   1.406 +        iconv_close(gNativeToUTF8);
   1.407 +        gNativeToUTF8 = INVALID_ICONV_T;
   1.408 +    }
   1.409 +    if (gUTF8ToNative != INVALID_ICONV_T) {
   1.410 +        iconv_close(gUTF8ToNative);
   1.411 +        gUTF8ToNative = INVALID_ICONV_T;
   1.412 +    }
   1.413 +    if (gUnicodeToUTF8 != INVALID_ICONV_T) {
   1.414 +        iconv_close(gUnicodeToUTF8);
   1.415 +        gUnicodeToUTF8 = INVALID_ICONV_T;
   1.416 +    }
   1.417 +    if (gUTF8ToUnicode != INVALID_ICONV_T) {
   1.418 +        iconv_close(gUTF8ToUnicode);
   1.419 +        gUTF8ToUnicode = INVALID_ICONV_T;
   1.420 +    }
   1.421 +#endif
   1.422 +
   1.423 +    gInitialized = false;
   1.424 +}
   1.425 +
   1.426 +nsNativeCharsetConverter::nsNativeCharsetConverter()
   1.427 +{
   1.428 +    Lock();
   1.429 +    if (!gInitialized)
   1.430 +        LazyInit();
   1.431 +}
   1.432 +
   1.433 +nsNativeCharsetConverter::~nsNativeCharsetConverter()
   1.434 +{
   1.435 +    // reset converters for next time
   1.436 +    if (gNativeToUnicode != INVALID_ICONV_T)
   1.437 +        xp_iconv_reset(gNativeToUnicode);
   1.438 +    if (gUnicodeToNative != INVALID_ICONV_T)
   1.439 +        xp_iconv_reset(gUnicodeToNative);
   1.440 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.441 +    if (gNativeToUTF8 != INVALID_ICONV_T)
   1.442 +        xp_iconv_reset(gNativeToUTF8);
   1.443 +    if (gUTF8ToNative != INVALID_ICONV_T)
   1.444 +        xp_iconv_reset(gUTF8ToNative);
   1.445 +    if (gUnicodeToUTF8 != INVALID_ICONV_T)
   1.446 +        xp_iconv_reset(gUnicodeToUTF8);
   1.447 +    if (gUTF8ToUnicode != INVALID_ICONV_T)
   1.448 +        xp_iconv_reset(gUTF8ToUnicode);
   1.449 +#endif
   1.450 +    Unlock();
   1.451 +}
   1.452 +
   1.453 +nsresult
   1.454 +nsNativeCharsetConverter::NativeToUnicode(const char **input,
   1.455 +                                          uint32_t    *inputLeft,
   1.456 +                                          char16_t  **output,
   1.457 +                                          uint32_t    *outputLeft)
   1.458 +{
   1.459 +    size_t res = 0;
   1.460 +    size_t inLeft = (size_t) *inputLeft;
   1.461 +    size_t outLeft = (size_t) *outputLeft * 2;
   1.462 +
   1.463 +    if (gNativeToUnicode != INVALID_ICONV_T) {
   1.464 +
   1.465 +        res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
   1.466 +
   1.467 +        *inputLeft = inLeft;
   1.468 +        *outputLeft = outLeft / 2;
   1.469 +        if (res != (size_t) -1) 
   1.470 +            return NS_OK;
   1.471 +
   1.472 +        NS_WARNING("conversion from native to utf-16 failed");
   1.473 +
   1.474 +        // reset converter
   1.475 +        xp_iconv_reset(gNativeToUnicode);
   1.476 +    }
   1.477 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.478 +    else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
   1.479 +             (gUTF8ToUnicode != INVALID_ICONV_T)) {
   1.480 +        // convert first to UTF8, then from UTF8 to UCS2
   1.481 +        const char *in = *input;
   1.482 +
   1.483 +        char ubuf[1024];
   1.484 +
   1.485 +        // we assume we're always called with enough space in |output|,
   1.486 +        // so convert many chars at a time...
   1.487 +        while (inLeft) {
   1.488 +            char *p = ubuf;
   1.489 +            size_t n = sizeof(ubuf);
   1.490 +            res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
   1.491 +            if (res == (size_t) -1) {
   1.492 +                NS_ERROR("conversion from native to utf-8 failed");
   1.493 +                break;
   1.494 +            }
   1.495 +            NS_ASSERTION(outLeft > 0, "bad assumption");
   1.496 +            p = ubuf;
   1.497 +            n = sizeof(ubuf) - n;
   1.498 +            res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
   1.499 +            if (res == (size_t) -1) {
   1.500 +                NS_ERROR("conversion from utf-8 to utf-16 failed");
   1.501 +                break;
   1.502 +            }
   1.503 +        }
   1.504 +
   1.505 +        (*input) += (*inputLeft - inLeft);
   1.506 +        *inputLeft = inLeft;
   1.507 +        *outputLeft = outLeft / 2;
   1.508 +
   1.509 +        if (res != (size_t) -1) 
   1.510 +            return NS_OK;
   1.511 +
   1.512 +        // reset converters
   1.513 +        xp_iconv_reset(gNativeToUTF8);
   1.514 +        xp_iconv_reset(gUTF8ToUnicode);
   1.515 +    }
   1.516 +#endif
   1.517 +
   1.518 +    // fallback: zero-pad and hope for the best
   1.519 +    // XXX This is lame and we have to do better.
   1.520 +    isolatin1_to_utf16(input, inputLeft, output, outputLeft);
   1.521 +
   1.522 +    return NS_OK;
   1.523 +}
   1.524 +
   1.525 +nsresult
   1.526 +nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
   1.527 +                                          uint32_t         *inputLeft,
   1.528 +                                          char            **output,
   1.529 +                                          uint32_t         *outputLeft)
   1.530 +{
   1.531 +    size_t res = 0;
   1.532 +    size_t inLeft = (size_t) *inputLeft * 2;
   1.533 +    size_t outLeft = (size_t) *outputLeft;
   1.534 +
   1.535 +    if (gUnicodeToNative != INVALID_ICONV_T) {
   1.536 +        res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
   1.537 +
   1.538 +        *inputLeft = inLeft / 2;
   1.539 +        *outputLeft = outLeft;
   1.540 +        if (res != (size_t) -1) {
   1.541 +            return NS_OK;
   1.542 +        }
   1.543 +
   1.544 +        NS_ERROR("iconv failed");
   1.545 +
   1.546 +        // reset converter
   1.547 +        xp_iconv_reset(gUnicodeToNative);
   1.548 +    }
   1.549 +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   1.550 +    else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
   1.551 +             (gUTF8ToNative != INVALID_ICONV_T)) {
   1.552 +        const char *in = (const char *) *input;
   1.553 +
   1.554 +        char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
   1.555 +
   1.556 +        // convert one uchar at a time...
   1.557 +        while (inLeft && outLeft) {
   1.558 +            char *p = ubuf;
   1.559 +            size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t);
   1.560 +            res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
   1.561 +            if (res == (size_t) -1) {
   1.562 +                NS_ERROR("conversion from utf-16 to utf-8 failed");
   1.563 +                break;
   1.564 +            }
   1.565 +            p = ubuf;
   1.566 +            n = sizeof(ubuf) - n;
   1.567 +            res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
   1.568 +            if (res == (size_t) -1) {
   1.569 +                if (errno == E2BIG) {
   1.570 +                    // not enough room for last uchar... back up and return.
   1.571 +                    in -= sizeof(char16_t);
   1.572 +                    res = 0;
   1.573 +                }
   1.574 +                else
   1.575 +                    NS_ERROR("conversion from utf-8 to native failed");
   1.576 +                break;
   1.577 +            }
   1.578 +            inLeft -= sizeof(char16_t);
   1.579 +        }
   1.580 +
   1.581 +        (*input) += (*inputLeft - inLeft / 2);
   1.582 +        *inputLeft = inLeft / 2;
   1.583 +        *outputLeft = outLeft;
   1.584 +        if (res != (size_t) -1) {
   1.585 +            return NS_OK;
   1.586 +        }
   1.587 +
   1.588 +        // reset converters
   1.589 +        xp_iconv_reset(gUnicodeToUTF8);
   1.590 +        xp_iconv_reset(gUTF8ToNative);
   1.591 +    }
   1.592 +#endif
   1.593 +
   1.594 +    // fallback: truncate and hope for the best
   1.595 +    // XXX This is lame and we have to do better.
   1.596 +    utf16_to_isolatin1(input, inputLeft, output, outputLeft);
   1.597 +
   1.598 +    return NS_OK;
   1.599 +}
   1.600 +
   1.601 +bool
   1.602 +nsNativeCharsetConverter::IsNativeUTF8()
   1.603 +{
   1.604 +    if (!gInitialized) {
   1.605 +        Lock();
   1.606 +        if (!gInitialized)
   1.607 +           LazyInit();
   1.608 +        Unlock();
   1.609 +    }
   1.610 +    return gIsNativeUTF8; 
   1.611 +}
   1.612 +
   1.613 +#endif // USE_ICONV
   1.614 +
   1.615 +//-----------------------------------------------------------------------------
   1.616 +// conversion using mb[r]towc/wc[r]tomb
   1.617 +//-----------------------------------------------------------------------------
   1.618 +#if defined(USE_STDCONV)
   1.619 +#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
   1.620 +#include <wchar.h>    // mbrtowc, wcrtomb
   1.621 +#endif
   1.622 +
   1.623 +class nsNativeCharsetConverter
   1.624 +{
   1.625 +public:
   1.626 +    nsNativeCharsetConverter();
   1.627 +
   1.628 +    nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,
   1.629 +                             char16_t       **output, uint32_t *outputLeft);
   1.630 +    nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
   1.631 +                             char            **output, uint32_t *outputLeft);
   1.632 +
   1.633 +    static void GlobalInit();
   1.634 +    static void GlobalShutdown() { }
   1.635 +    static bool IsNativeUTF8();
   1.636 +
   1.637 +private:
   1.638 +    static bool gWCharIsUnicode;
   1.639 +
   1.640 +#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
   1.641 +    mbstate_t ps;
   1.642 +#endif
   1.643 +};
   1.644 +
   1.645 +bool nsNativeCharsetConverter::gWCharIsUnicode = false;
   1.646 +
   1.647 +nsNativeCharsetConverter::nsNativeCharsetConverter()
   1.648 +{
   1.649 +#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
   1.650 +    memset(&ps, 0, sizeof(ps));
   1.651 +#endif
   1.652 +}
   1.653 +
   1.654 +void
   1.655 +nsNativeCharsetConverter::GlobalInit()
   1.656 +{
   1.657 +    // verify that wchar_t for the current locale is actually unicode.
   1.658 +    // if it is not, then we should avoid calling mbtowc/wctomb and
   1.659 +    // just fallback on zero-pad/truncation conversion.
   1.660 +    //
   1.661 +    // this test cannot be done at build time because the encoding of
   1.662 +    // wchar_t may depend on the runtime locale.  sad, but true!!
   1.663 +    //
   1.664 +    // so, if wchar_t is unicode then converting an ASCII character
   1.665 +    // to wchar_t should not change its numeric value.  we'll just
   1.666 +    // check what happens with the ASCII 'a' character.
   1.667 +    //
   1.668 +    // this test is not perfect... obviously, it could yield false
   1.669 +    // positives, but then at least ASCII text would be converted
   1.670 +    // properly (or maybe just the 'a' character) -- oh well :(
   1.671 +
   1.672 +    char a = 'a';
   1.673 +    unsigned int w = 0;
   1.674 +
   1.675 +    int res = mbtowc((wchar_t *) &w, &a, 1);
   1.676 +
   1.677 +    gWCharIsUnicode = (res != -1 && w == 'a');
   1.678 +
   1.679 +#ifdef DEBUG
   1.680 +    if (!gWCharIsUnicode)
   1.681 +        NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
   1.682 +#endif
   1.683 +}
   1.684 +
   1.685 +nsresult
   1.686 +nsNativeCharsetConverter::NativeToUnicode(const char **input,
   1.687 +                                          uint32_t    *inputLeft,
   1.688 +                                          char16_t  **output,
   1.689 +                                          uint32_t    *outputLeft)
   1.690 +{
   1.691 +    if (gWCharIsUnicode) {
   1.692 +        int incr;
   1.693 +
   1.694 +        // cannot use wchar_t here since it may have been redefined (e.g.,
   1.695 +        // via -fshort-wchar).  hopefully, sizeof(tmp) is sufficient XP.
   1.696 +        unsigned int tmp = 0;
   1.697 +        while (*inputLeft && *outputLeft) {
   1.698 +#ifdef HAVE_MBRTOWC
   1.699 +            incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
   1.700 +#else
   1.701 +            // XXX is this thread-safe?
   1.702 +            incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
   1.703 +#endif
   1.704 +            if (incr < 0) {
   1.705 +                NS_WARNING("mbtowc failed: possible charset mismatch");
   1.706 +                // zero-pad and hope for the best
   1.707 +                tmp = (unsigned char) **input;
   1.708 +                incr = 1;
   1.709 +            }
   1.710 +            **output = (char16_t) tmp;
   1.711 +            (*input) += incr;
   1.712 +            (*inputLeft) -= incr;
   1.713 +            (*output)++;
   1.714 +            (*outputLeft)--;
   1.715 +        }
   1.716 +    }
   1.717 +    else {
   1.718 +        // wchar_t isn't unicode, so the best we can do is treat the
   1.719 +        // input as if it is isolatin1 :(
   1.720 +        isolatin1_to_utf16(input, inputLeft, output, outputLeft);
   1.721 +    }
   1.722 +
   1.723 +    return NS_OK;
   1.724 +}
   1.725 +
   1.726 +nsresult
   1.727 +nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
   1.728 +                                          uint32_t         *inputLeft,
   1.729 +                                          char            **output,
   1.730 +                                          uint32_t         *outputLeft)
   1.731 +{
   1.732 +    if (gWCharIsUnicode) {
   1.733 +        int incr;
   1.734 +
   1.735 +        while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
   1.736 +#ifdef HAVE_WCRTOMB
   1.737 +            incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
   1.738 +#else
   1.739 +            // XXX is this thread-safe?
   1.740 +            incr = (int) wctomb(*output, (wchar_t) **input);
   1.741 +#endif
   1.742 +            if (incr < 0) {
   1.743 +                NS_WARNING("mbtowc failed: possible charset mismatch");
   1.744 +                **output = (unsigned char) **input; // truncate
   1.745 +                incr = 1;
   1.746 +            }
   1.747 +            // most likely we're dead anyways if this assertion should fire
   1.748 +            NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string");
   1.749 +            (*output) += incr;
   1.750 +            (*outputLeft) -= incr;
   1.751 +            (*input)++;
   1.752 +            (*inputLeft)--;
   1.753 +        }
   1.754 +    }
   1.755 +    else {
   1.756 +        // wchar_t isn't unicode, so the best we can do is treat the
   1.757 +        // input as if it is isolatin1 :(
   1.758 +        utf16_to_isolatin1(input, inputLeft, output, outputLeft);
   1.759 +    }
   1.760 +
   1.761 +    return NS_OK;
   1.762 +}
   1.763 +
   1.764 +// XXX : for now, return false
   1.765 +bool
   1.766 +nsNativeCharsetConverter::IsNativeUTF8()
   1.767 +{
   1.768 +    return false;
   1.769 +}
   1.770 +
   1.771 +#endif // USE_STDCONV
   1.772 +
   1.773 +//-----------------------------------------------------------------------------
   1.774 +// API implementation
   1.775 +//-----------------------------------------------------------------------------
   1.776 +
   1.777 +nsresult
   1.778 +NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
   1.779 +{
   1.780 +    output.Truncate();
   1.781 +
   1.782 +    uint32_t inputLen = input.Length();
   1.783 +
   1.784 +    nsACString::const_iterator iter;
   1.785 +    input.BeginReading(iter);
   1.786 +
   1.787 +    //
   1.788 +    // OPTIMIZATION: preallocate space for largest possible result; convert
   1.789 +    // directly into the result buffer to avoid intermediate buffer copy.
   1.790 +    //
   1.791 +    // this will generally result in a larger allocation, but that seems
   1.792 +    // better than an extra buffer copy.
   1.793 +    //
   1.794 +    if (!output.SetLength(inputLen, fallible_t()))
   1.795 +        return NS_ERROR_OUT_OF_MEMORY;
   1.796 +    nsAString::iterator out_iter;
   1.797 +    output.BeginWriting(out_iter);
   1.798 +
   1.799 +    char16_t *result = out_iter.get();
   1.800 +    uint32_t resultLeft = inputLen;
   1.801 +
   1.802 +    const char *buf = iter.get();
   1.803 +    uint32_t bufLeft = inputLen;
   1.804 +
   1.805 +    nsNativeCharsetConverter conv;
   1.806 +    nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
   1.807 +    if (NS_SUCCEEDED(rv)) {
   1.808 +        NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
   1.809 +        output.SetLength(inputLen - resultLeft);
   1.810 +    }
   1.811 +    return rv;
   1.812 +}
   1.813 +
   1.814 +nsresult
   1.815 +NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
   1.816 +{
   1.817 +    output.Truncate();
   1.818 +
   1.819 +    nsAString::const_iterator iter, end;
   1.820 +    input.BeginReading(iter);
   1.821 +    input.EndReading(end);
   1.822 +
   1.823 +    // cannot easily avoid intermediate buffer copy.
   1.824 +    char temp[4096];
   1.825 +
   1.826 +    nsNativeCharsetConverter conv;
   1.827 +
   1.828 +    const char16_t *buf = iter.get();
   1.829 +    uint32_t bufLeft = Distance(iter, end);
   1.830 +    while (bufLeft) {
   1.831 +        char *p = temp;
   1.832 +        uint32_t tempLeft = sizeof(temp);
   1.833 +
   1.834 +        nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
   1.835 +        if (NS_FAILED(rv)) return rv;
   1.836 +
   1.837 +        if (tempLeft < sizeof(temp))
   1.838 +            output.Append(temp, sizeof(temp) - tempLeft);
   1.839 +    }
   1.840 +    return NS_OK;
   1.841 +}
   1.842 +
// Free-function entry point; simply forwards to
// nsNativeCharsetConverter::IsNativeUTF8() and returns its answer.
bool
NS_IsNativeUTF8()
{
    return nsNativeCharsetConverter::IsNativeUTF8();
}
   1.848 +
// Startup hook for native charset conversion on XP_UNIX: sets the LC_CTYPE
// locale first, then runs the converter's global initialization.
void
NS_StartupNativeCharsetUtils()
{
    //
    // need to initialize the locale or else charset conversion will fail.
    // better not delay this in case some other component alters the locale
    // settings.
    //
    // XXX we assume that we are called early enough that we should
    // always be the first to care about the locale's charset.
    //
    // (the empty locale name asks the C library to pick the locale from the
    // environment)
    //
    setlocale(LC_CTYPE, "");

    nsNativeCharsetConverter::GlobalInit();
}
   1.864 +
// Shutdown hook for native charset conversion on XP_UNIX; forwards to
// nsNativeCharsetConverter::GlobalShutdown().
void
NS_ShutdownNativeCharsetUtils()
{
    nsNativeCharsetConverter::GlobalShutdown();
}
   1.870 +
   1.871 +//-----------------------------------------------------------------------------
   1.872 +// XP_WIN
   1.873 +//-----------------------------------------------------------------------------
   1.874 +#elif defined(XP_WIN)
   1.875 +
   1.876 +#include <windows.h>
   1.877 +#include "nsString.h"
   1.878 +#include "nsAString.h"
   1.879 +#include "nsReadableUtils.h"
   1.880 +
   1.881 +using namespace mozilla;
   1.882 +
   1.883 +nsresult
   1.884 +NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
   1.885 +{
   1.886 +    uint32_t inputLen = input.Length();
   1.887 +
   1.888 +    nsACString::const_iterator iter;
   1.889 +    input.BeginReading(iter);
   1.890 +
   1.891 +    const char *buf = iter.get();
   1.892 +
   1.893 +    // determine length of result
   1.894 +    uint32_t resultLen = 0;
   1.895 +    int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0);
   1.896 +    if (n > 0)
   1.897 +        resultLen += n;
   1.898 +
   1.899 +    // allocate sufficient space
   1.900 +    if (!output.SetLength(resultLen, fallible_t()))
   1.901 +        return NS_ERROR_OUT_OF_MEMORY;
   1.902 +    if (resultLen > 0) {
   1.903 +        nsAString::iterator out_iter;
   1.904 +        output.BeginWriting(out_iter);
   1.905 +
   1.906 +        char16_t *result = out_iter.get();
   1.907 +
   1.908 +        ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, wwc(result), resultLen);
   1.909 +    }
   1.910 +    return NS_OK;
   1.911 +}
   1.912 +
   1.913 +nsresult
   1.914 +NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
   1.915 +{
   1.916 +    uint32_t inputLen = input.Length();
   1.917 +
   1.918 +    nsAString::const_iterator iter;
   1.919 +    input.BeginReading(iter);
   1.920 +
   1.921 +    char16ptr_t buf = iter.get();
   1.922 +
   1.923 +    // determine length of result
   1.924 +    uint32_t resultLen = 0;
   1.925 +
   1.926 +    int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0,
   1.927 +                                  nullptr, nullptr);
   1.928 +    if (n > 0)
   1.929 +        resultLen += n;
   1.930 +
   1.931 +    // allocate sufficient space
   1.932 +    if (!output.SetLength(resultLen, fallible_t()))
   1.933 +        return NS_ERROR_OUT_OF_MEMORY;
   1.934 +    if (resultLen > 0) {
   1.935 +        nsACString::iterator out_iter;
   1.936 +        output.BeginWriting(out_iter);
   1.937 +
   1.938 +        // default "defaultChar" is '?', which is an illegal character on windows
   1.939 +        // file system.  That will cause file uncreatable. Change it to '_'
   1.940 +        const char defaultChar = '_';
   1.941 +
   1.942 +        char *result = out_iter.get();
   1.943 +
   1.944 +        ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
   1.945 +                              &defaultChar, nullptr);
   1.946 +    }
   1.947 +    return NS_OK;
   1.948 +}
   1.949 +
// moved from widget/windows/nsToolkit.cpp
//
// Converts the ANSI code page (CP_ACP) string aStrInA into aStrOutW via
// MultiByteToWideChar and returns that call's result unchanged.  The source
// length is passed as -1, so aStrInA has to be NUL-terminated; aBufferSize
// is handed to the API as the capacity of aStrOutW.  No argument checking
// is done here.
int32_t 
NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW)
{
    return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, wwc(aStrOutW), aBufferSize);
}
   1.956 +
   1.957 +int32_t 
   1.958 +NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut,
   1.959 +               char *aStrOutA, const char *aDefault)
   1.960 +{
   1.961 +    if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))
   1.962 +        return 0;
   1.963 +
   1.964 +    int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1,
   1.965 +                                                aStrOutA, aBufferSizeOut,
   1.966 +                                                aDefault, nullptr);
   1.967 +
   1.968 +    if (!numCharsConverted) {
   1.969 +        if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
   1.970 +            // Overflow, add missing null termination but return 0
   1.971 +            aStrOutA[aBufferSizeOut-1] = '\0';
   1.972 +        }
   1.973 +        else {
   1.974 +            // Other error, clear string and return 0
   1.975 +            aStrOutA[0] = '\0';
   1.976 +        }
   1.977 +    }
   1.978 +    else if (numCharsConverted < aBufferSizeOut) {
   1.979 +        // Add 2nd null (really necessary?)
   1.980 +        aStrOutA[numCharsConverted] = '\0';
   1.981 +    }
   1.982 +
   1.983 +    return numCharsConverted;
   1.984 +}
   1.985 +
   1.986 +#else
   1.987 +
   1.988 +#include "nsReadableUtils.h"
   1.989 +
// Fallback for platforms not matched by any of the earlier #if branches:
// hands the conversion to CopyASCIItoUTF16 and always reports NS_OK.
nsresult
NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
{
    CopyASCIItoUTF16(input, output);
    return NS_OK;
}
   1.996 +
// Fallback for platforms not matched by any of the earlier #if branches:
// hands the conversion to LossyCopyUTF16toASCII and always reports NS_OK.
nsresult
NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
{
    LossyCopyUTF16toASCII(input, output);
    return NS_OK;
}
  1.1003 +
// Fallback build: the conversions above need no setup, so startup is a no-op.
void
NS_StartupNativeCharsetUtils()
{
}
  1.1008 +
// Fallback build: nothing was set up at startup, so shutdown is a no-op.
void
NS_ShutdownNativeCharsetUtils()
{
}
  1.1013 +
  1.1014 +#endif
The Tor Browser / file diff

diff: xpcom/io/nsNativeCharsetUtils.cpp

xpcom/io/nsNativeCharsetUtils.cpp