The Tor Browser: xpcom/io/nsNativeCharsetUtils.cpp@97036ab72558

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701 and complies with the disk-avoidance requirement documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* This Source Code Form is subject to the terms of the Mozilla Public

     2  * License, v. 2.0. If a copy of the MPL was not distributed with this

     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     5 #include "xpcom-private.h"

     7 //-----------------------------------------------------------------------------

     8 // XP_MACOSX or ANDROID

     9 //-----------------------------------------------------------------------------

    10 #if defined(XP_MACOSX) || defined(ANDROID)

    12 #include "nsAString.h"

    13 #include "nsReadableUtils.h"

    14 #include "nsString.h"

    16 nsresult

    17 NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)

    18 {

    19     CopyUTF8toUTF16(input, output);

    20     return NS_OK;

    21 }

    23 nsresult

    24 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)

    25 {

    26     CopyUTF16toUTF8(input, output);

    27     return NS_OK;

    28 }

    30 void

    31 NS_StartupNativeCharsetUtils()

    32 {

    33 }

    35 void

    36 NS_ShutdownNativeCharsetUtils()

    37 {

    38 }

    41 //-----------------------------------------------------------------------------

    42 // XP_UNIX

    43 //-----------------------------------------------------------------------------

    44 #elif defined(XP_UNIX)

    46 #include <stdlib.h>   // mbtowc, wctomb

    47 #include <locale.h>   // setlocale

    48 #include "mozilla/Mutex.h"

    49 #include "nscore.h"

    50 #include "nsAString.h"

    51 #include "nsReadableUtils.h"

    53 using namespace mozilla;

    55 //

    56 // choose a conversion library.  we used to use mbrtowc/wcrtomb under Linux,

    57 // but that doesn't work for non-BMP characters whether we use '-fshort-wchar'

    58 // or not (see bug 206811 and

    59 // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use

    60 // iconv for all platforms where nltypes.h and nllanginfo.h are present

    61 // along with iconv.

    62 //

    63 #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)

    64 #define USE_ICONV 1

    65 #else

    66 #define USE_STDCONV 1

    67 #endif

    69 static void

    70 isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft)

    71 {

    72     while (*inputLeft && *outputLeft) {

    73         **output = (unsigned char) **input;

    74         (*input)++;

    75         (*inputLeft)--;

    76         (*output)++;

    77         (*outputLeft)--;

    78     }

    79 }

    81 static void

    82 utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft)

    83 {

    84     while (*inputLeft && *outputLeft) {

    85         **output = (unsigned char) **input;

    86         (*input)++;

    87         (*inputLeft)--;

    88         (*output)++;

    89         (*outputLeft)--;

    90     }

    91 }

    93 //-----------------------------------------------------------------------------

    94 // conversion using iconv

    95 //-----------------------------------------------------------------------------

    96 #if defined(USE_ICONV)

    97 #include <nl_types.h> // CODESET

    98 #include <langinfo.h> // nl_langinfo

    99 #include <iconv.h>    // iconv_open, iconv, iconv_close

   100 #include <errno.h>

   101 #include "plstr.h"

   103 #if defined(HAVE_ICONV_WITH_CONST_INPUT)

   104 #define ICONV_INPUT(x) (x)

   105 #else

   106 #define ICONV_INPUT(x) ((char **)x)

   107 #endif

   109 // solaris definitely needs this, but we'll enable it by default

   110 // just in case... but we know for sure that iconv(3) in glibc

   111 // doesn't need this.

   112 #if !defined(__GLIBC__)

   113 #define ENABLE_UTF8_FALLBACK_SUPPORT

   114 #endif

   116 #define INVALID_ICONV_T ((iconv_t) -1)

   118 static inline size_t

   119 xp_iconv(iconv_t converter,

   120          const char **input,

   121          size_t      *inputLeft,

   122          char       **output,

   123          size_t      *outputLeft)

   124 {

   125     size_t res, outputAvail = outputLeft ? *outputLeft : 0;

   126     res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);

   127     if (res == (size_t) -1) {

   128         // on some platforms (e.g., linux) iconv will fail with

   129         // E2BIG if it cannot convert _all_ of its input.  it'll

   130         // still adjust all of the in/out params correctly, so we

   131         // can ignore this error.  the assumption is that we will

   132         // be called again to complete the conversion.

   133         if ((errno == E2BIG) && (*outputLeft < outputAvail))

   134             res = 0;

   135     }

   136     return res;

   137 }

   139 static inline void

   140 xp_iconv_reset(iconv_t converter)

   141 {

   142     // NOTE: the man pages on Solaris claim that you can pass nullptr

   143     // for all parameter to reset the converter, but beware the

   144     // evil Solaris crash if you go down this route >:-)

   146     const char *zero_char_in_ptr  = nullptr;

   147     char       *zero_char_out_ptr = nullptr;

   148     size_t      zero_size_in      = 0,

   149                 zero_size_out     = 0;

   151     xp_iconv(converter, &zero_char_in_ptr,

   152                         &zero_size_in,

   153                         &zero_char_out_ptr,

   154                         &zero_size_out);

   155 }

   157 static inline iconv_t

   158 xp_iconv_open(const char **to_list, const char **from_list)

   159 {

   160     iconv_t res;

   161     const char **from_name;

   162     const char **to_name;

   164     // try all possible combinations to locate a converter.

   165     to_name = to_list;

   166     while (*to_name) {

   167         if (**to_name) {

   168             from_name = from_list;

   169             while (*from_name) {

   170                 if (**from_name) {

   171                     res = iconv_open(*to_name, *from_name);

   172                     if (res != INVALID_ICONV_T)

   173                         return res;

   174                 }

   175                 from_name++;

   176             }

   177         }

   178         to_name++;

   179     }

   181     return INVALID_ICONV_T;

   182 }

   184 /*

   185  * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we

   186  * have to use UTF-16 with iconv(3) on platforms where it's supported.

   187  * However, the way UTF-16 and UCS-2 are interpreted varies across platforms

   188  * and implementations of iconv(3). On Tru64, it also depends on the environment

   189  * variable. To avoid the trouble arising from byte-swapping

   190  * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling

   191  * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2

   192  * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,

   193  * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'

   194  * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment

   195  * variable ICONV_BYTEORDER is set to 'big-endian', about which not much

   196  * can be done other than adding a note in the release notes. (bug 206811)

   197  */

   198 static const char *UTF_16_NAMES[] = {

   199 #if defined(IS_LITTLE_ENDIAN)

   200     "UTF-16LE",

   201 #if defined(__GLIBC__)

   202     "UNICODELITTLE",

   203 #endif

   204     "UCS-2LE",

   205 #else

   206     "UTF-16BE",

   207 #if defined(__GLIBC__)

   208     "UNICODEBIG",

   209 #endif

   210     "UCS-2BE",

   211 #endif

   212     "UTF-16",

   213     "UCS-2",

   214     "UCS2",

   215     "UCS_2",

   216     "ucs-2",

   217     "ucs2",

   218     "ucs_2",

   219     nullptr

   220 };

   222 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   223 static const char *UTF_8_NAMES[] = {

   224     "UTF-8",

   225     "UTF8",

   226     "UTF_8",

   227     "utf-8",

   228     "utf8",

   229     "utf_8",

   230     nullptr

   231 };

   232 #endif

   234 static const char *ISO_8859_1_NAMES[] = {

   235     "ISO-8859-1",

   236 #if !defined(__GLIBC__)

   237     "ISO8859-1",

   238     "ISO88591",

   239     "ISO_8859_1",

   240     "ISO8859_1",

   241     "iso-8859-1",

   242     "iso8859-1",

   243     "iso88591",

   244     "iso_8859_1",

   245     "iso8859_1",

   246 #endif

   247     nullptr

   248 };

   250 class nsNativeCharsetConverter

   251 {

   252 public:

   253     nsNativeCharsetConverter();

   254    ~nsNativeCharsetConverter();

   256     nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,

   257                              char16_t       **output, uint32_t *outputLeft);

   258     nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,

   259                              char            **output, uint32_t *outputLeft);

   261     static void GlobalInit();

   262     static void GlobalShutdown();

   263     static bool IsNativeUTF8();

   265 private:

   266     static iconv_t gNativeToUnicode;

   267     static iconv_t gUnicodeToNative;

   268 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   269     static iconv_t gNativeToUTF8;

   270     static iconv_t gUTF8ToNative;

   271     static iconv_t gUnicodeToUTF8;

   272     static iconv_t gUTF8ToUnicode;

   273 #endif

   274     static Mutex  *gLock;

   275     static bool    gInitialized;

   276     static bool    gIsNativeUTF8;

   278     static void LazyInit();

   280     static void Lock()   { if (gLock) gLock->Lock();   }

   281     static void Unlock() { if (gLock) gLock->Unlock(); }

   282 };

   284 iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;

   285 iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;

   286 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   287 iconv_t nsNativeCharsetConverter::gNativeToUTF8    = INVALID_ICONV_T;

   288 iconv_t nsNativeCharsetConverter::gUTF8ToNative    = INVALID_ICONV_T;

   289 iconv_t nsNativeCharsetConverter::gUnicodeToUTF8   = INVALID_ICONV_T;

   290 iconv_t nsNativeCharsetConverter::gUTF8ToUnicode   = INVALID_ICONV_T;

   291 #endif

   292 Mutex  *nsNativeCharsetConverter::gLock            = nullptr;

   293 bool    nsNativeCharsetConverter::gInitialized     = false;

   294 bool    nsNativeCharsetConverter::gIsNativeUTF8    = false;

   296 void

   297 nsNativeCharsetConverter::LazyInit()

   298 {

   299     // LazyInit may be called before NS_StartupNativeCharsetUtils, but

   300     // the setlocale it does has to be called before nl_langinfo. Like in

   301     // NS_StartupNativeCharsetUtils, assume we are called early enough that

   302     // we are the first to care about the locale's charset.

   303     if (!gLock)

   304       setlocale(LC_CTYPE, "");

   305     const char  *blank_list[] = { "", nullptr };

   306     const char **native_charset_list = blank_list;

   307     const char  *native_charset = nl_langinfo(CODESET);

   308     if (native_charset == nullptr) {

   309         NS_ERROR("native charset is unknown");

   310         // fallback to ISO-8859-1

   311         native_charset_list = ISO_8859_1_NAMES;

   312     }

   313     else

   314         native_charset_list[0] = native_charset;

   316     // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET)

   317     // return 'UTF-8' (or 'utf-8')

   318     if (!PL_strcasecmp(native_charset, "UTF-8"))

   319         gIsNativeUTF8 = true;

   321     gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);

   322     gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);

   324 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   325     if (gNativeToUnicode == INVALID_ICONV_T) {

   326         gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);

   327         gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);

   328         NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");

   329         NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");

   330     }

   331     if (gUnicodeToNative == INVALID_ICONV_T) {

   332         gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);

   333         gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);

   334         NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");

   335         NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");

   336     }

   337 #else

   338     NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");

   339     NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");

   340 #endif

   342     /*

   343      * On Solaris 8 (and newer?), the iconv modules converting to UCS-2

   344      * prepend a byte order mark unicode character (BOM, u+FEFF) during

   345      * the first use of the iconv converter. The same is the case of

   346      * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.

   347      * However, we use 'UTF-16LE/BE' in both cases, instead so that we

   348      * should be safe. But just in case...

   349      *

   350      * This dummy conversion gets rid of the BOMs and fixes bug 153562.

   351      */

   352     char dummy_input[1] = { ' ' };

   353     char dummy_output[4];

   355     if (gNativeToUnicode != INVALID_ICONV_T) {

   356 	const char *input = dummy_input;

   357 	size_t input_left = sizeof(dummy_input);

   358 	char *output = dummy_output;

   359 	size_t output_left = sizeof(dummy_output);

   361 	xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);

   362     }

   363 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   364     if (gUTF8ToUnicode != INVALID_ICONV_T) {

   365 	const char *input = dummy_input;

   366 	size_t input_left = sizeof(dummy_input);

   367 	char *output = dummy_output;

   368 	size_t output_left = sizeof(dummy_output);

   370 	xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);

   371     }

   372 #endif

   374     gInitialized = true;

   375 }

   377 void

   378 nsNativeCharsetConverter::GlobalInit()

   379 {

   380     gLock = new Mutex("nsNativeCharsetConverter.gLock");

   381 }

   383 void

   384 nsNativeCharsetConverter::GlobalShutdown()

   385 {

   386     if (gLock) {

   387         delete gLock;

   388         gLock = nullptr;

   389     }

   391     if (gNativeToUnicode != INVALID_ICONV_T) {

   392         iconv_close(gNativeToUnicode);

   393         gNativeToUnicode = INVALID_ICONV_T;

   394     }

   396     if (gUnicodeToNative != INVALID_ICONV_T) {

   397         iconv_close(gUnicodeToNative);

   398         gUnicodeToNative = INVALID_ICONV_T;

   399     }

   401 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   402     if (gNativeToUTF8 != INVALID_ICONV_T) {

   403         iconv_close(gNativeToUTF8);

   404         gNativeToUTF8 = INVALID_ICONV_T;

   405     }

   406     if (gUTF8ToNative != INVALID_ICONV_T) {

   407         iconv_close(gUTF8ToNative);

   408         gUTF8ToNative = INVALID_ICONV_T;

   409     }

   410     if (gUnicodeToUTF8 != INVALID_ICONV_T) {

   411         iconv_close(gUnicodeToUTF8);

   412         gUnicodeToUTF8 = INVALID_ICONV_T;

   413     }

   414     if (gUTF8ToUnicode != INVALID_ICONV_T) {

   415         iconv_close(gUTF8ToUnicode);

   416         gUTF8ToUnicode = INVALID_ICONV_T;

   417     }

   418 #endif

   420     gInitialized = false;

   421 }

   423 nsNativeCharsetConverter::nsNativeCharsetConverter()

   424 {

   425     Lock();

   426     if (!gInitialized)

   427         LazyInit();

   428 }

   430 nsNativeCharsetConverter::~nsNativeCharsetConverter()

   431 {

   432     // reset converters for next time

   433     if (gNativeToUnicode != INVALID_ICONV_T)

   434         xp_iconv_reset(gNativeToUnicode);

   435     if (gUnicodeToNative != INVALID_ICONV_T)

   436         xp_iconv_reset(gUnicodeToNative);

   437 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   438     if (gNativeToUTF8 != INVALID_ICONV_T)

   439         xp_iconv_reset(gNativeToUTF8);

   440     if (gUTF8ToNative != INVALID_ICONV_T)

   441         xp_iconv_reset(gUTF8ToNative);

   442     if (gUnicodeToUTF8 != INVALID_ICONV_T)

   443         xp_iconv_reset(gUnicodeToUTF8);

   444     if (gUTF8ToUnicode != INVALID_ICONV_T)

   445         xp_iconv_reset(gUTF8ToUnicode);

   446 #endif

   447     Unlock();

   448 }

   450 nsresult

   451 nsNativeCharsetConverter::NativeToUnicode(const char **input,

   452                                           uint32_t    *inputLeft,

   453                                           char16_t  **output,

   454                                           uint32_t    *outputLeft)

   455 {

   456     size_t res = 0;

   457     size_t inLeft = (size_t) *inputLeft;

   458     size_t outLeft = (size_t) *outputLeft * 2;

   460     if (gNativeToUnicode != INVALID_ICONV_T) {

   462         res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);

   464         *inputLeft = inLeft;

   465         *outputLeft = outLeft / 2;

   466         if (res != (size_t) -1)

   467             return NS_OK;

   469         NS_WARNING("conversion from native to utf-16 failed");

   471         // reset converter

   472         xp_iconv_reset(gNativeToUnicode);

   473     }

   474 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   475     else if ((gNativeToUTF8 != INVALID_ICONV_T) &&

   476              (gUTF8ToUnicode != INVALID_ICONV_T)) {

   477         // convert first to UTF8, then from UTF8 to UCS2

   478         const char *in = *input;

   480         char ubuf[1024];

   482         // we assume we're always called with enough space in |output|,

   483         // so convert many chars at a time...

   484         while (inLeft) {

   485             char *p = ubuf;

   486             size_t n = sizeof(ubuf);

   487             res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);

   488             if (res == (size_t) -1) {

   489                 NS_ERROR("conversion from native to utf-8 failed");

   490                 break;

   491             }

   492             NS_ASSERTION(outLeft > 0, "bad assumption");

   493             p = ubuf;

   494             n = sizeof(ubuf) - n;

   495             res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);

   496             if (res == (size_t) -1) {

   497                 NS_ERROR("conversion from utf-8 to utf-16 failed");

   498                 break;

   499             }

   500         }

   502         (*input) += (*inputLeft - inLeft);

   503         *inputLeft = inLeft;

   504         *outputLeft = outLeft / 2;

   506         if (res != (size_t) -1)

   507             return NS_OK;

   509         // reset converters

   510         xp_iconv_reset(gNativeToUTF8);

   511         xp_iconv_reset(gUTF8ToUnicode);

   512     }

   513 #endif

   515     // fallback: zero-pad and hope for the best

   516     // XXX This is lame and we have to do better.

   517     isolatin1_to_utf16(input, inputLeft, output, outputLeft);

   519     return NS_OK;

   520 }

   522 nsresult

   523 nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,

   524                                           uint32_t         *inputLeft,

   525                                           char            **output,

   526                                           uint32_t         *outputLeft)

   527 {

   528     size_t res = 0;

   529     size_t inLeft = (size_t) *inputLeft * 2;

   530     size_t outLeft = (size_t) *outputLeft;

   532     if (gUnicodeToNative != INVALID_ICONV_T) {

   533         res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);

   535         *inputLeft = inLeft / 2;

   536         *outputLeft = outLeft;

   537         if (res != (size_t) -1) {

   538             return NS_OK;

   539         }

   541         NS_ERROR("iconv failed");

   543         // reset converter

   544         xp_iconv_reset(gUnicodeToNative);

   545     }

   546 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)

   547     else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&

   548              (gUTF8ToNative != INVALID_ICONV_T)) {

   549         const char *in = (const char *) *input;

   551         char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)

   553         // convert one uchar at a time...

   554         while (inLeft && outLeft) {

   555             char *p = ubuf;

   556             size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t);

   557             res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);

   558             if (res == (size_t) -1) {

   559                 NS_ERROR("conversion from utf-16 to utf-8 failed");

   560                 break;

   561             }

   562             p = ubuf;

   563             n = sizeof(ubuf) - n;

   564             res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);

   565             if (res == (size_t) -1) {

   566                 if (errno == E2BIG) {

   567                     // not enough room for last uchar... back up and return.

   568                     in -= sizeof(char16_t);

   569                     res = 0;

   570                 }

   571                 else

   572                     NS_ERROR("conversion from utf-8 to native failed");

   573                 break;

   574             }

   575             inLeft -= sizeof(char16_t);

   576         }

   578         (*input) += (*inputLeft - inLeft / 2);

   579         *inputLeft = inLeft / 2;

   580         *outputLeft = outLeft;

   581         if (res != (size_t) -1) {

   582             return NS_OK;

   583         }

   585         // reset converters

   586         xp_iconv_reset(gUnicodeToUTF8);

   587         xp_iconv_reset(gUTF8ToNative);

   588     }

   589 #endif

   591     // fallback: truncate and hope for the best

   592     // XXX This is lame and we have to do better.

   593     utf16_to_isolatin1(input, inputLeft, output, outputLeft);

   595     return NS_OK;

   596 }

   598 bool

   599 nsNativeCharsetConverter::IsNativeUTF8()

   600 {

   601     if (!gInitialized) {

   602         Lock();

   603         if (!gInitialized)

   604            LazyInit();

   605         Unlock();

   606     }

   607     return gIsNativeUTF8;

   608 }

   610 #endif // USE_ICONV

   612 //-----------------------------------------------------------------------------

   613 // conversion using mb[r]towc/wc[r]tomb

   614 //-----------------------------------------------------------------------------

   615 #if defined(USE_STDCONV)

   616 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)

   617 #include <wchar.h>    // mbrtowc, wcrtomb

   618 #endif

   620 class nsNativeCharsetConverter

   621 {

   622 public:

   623     nsNativeCharsetConverter();

   625     nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,

   626                              char16_t       **output, uint32_t *outputLeft);

   627     nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,

   628                              char            **output, uint32_t *outputLeft);

   630     static void GlobalInit();

   631     static void GlobalShutdown() { }

   632     static bool IsNativeUTF8();

   634 private:

   635     static bool gWCharIsUnicode;

   637 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)

   638     mbstate_t ps;

   639 #endif

   640 };

   642 bool nsNativeCharsetConverter::gWCharIsUnicode = false;

   644 nsNativeCharsetConverter::nsNativeCharsetConverter()

   645 {

   646 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)

   647     memset(&ps, 0, sizeof(ps));

   648 #endif

   649 }

   651 void

   652 nsNativeCharsetConverter::GlobalInit()

   653 {

   654     // verify that wchar_t for the current locale is actually unicode.

   655     // if it is not, then we should avoid calling mbtowc/wctomb and

   656     // just fallback on zero-pad/truncation conversion.

   657     //

   658     // this test cannot be done at build time because the encoding of

   659     // wchar_t may depend on the runtime locale.  sad, but true!!

   660     //

   661     // so, if wchar_t is unicode then converting an ASCII character

   662     // to wchar_t should not change its numeric value.  we'll just

   663     // check what happens with the ASCII 'a' character.

   664     //

   665     // this test is not perfect... obviously, it could yield false

   666     // positives, but then at least ASCII text would be converted

   667     // properly (or maybe just the 'a' character) -- oh well :(

   669     char a = 'a';

   670     unsigned int w = 0;

   672     int res = mbtowc((wchar_t *) &w, &a, 1);

   674     gWCharIsUnicode = (res != -1 && w == 'a');

   676 #ifdef DEBUG

   677     if (!gWCharIsUnicode)

   678         NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");

   679 #endif

   680 }

   682 nsresult

   683 nsNativeCharsetConverter::NativeToUnicode(const char **input,

   684                                           uint32_t    *inputLeft,

   685                                           char16_t  **output,

   686                                           uint32_t    *outputLeft)

   687 {

   688     if (gWCharIsUnicode) {

   689         int incr;

   691         // cannot use wchar_t here since it may have been redefined (e.g.,

   692         // via -fshort-wchar).  hopefully, sizeof(tmp) is sufficient XP.

   693         unsigned int tmp = 0;

   694         while (*inputLeft && *outputLeft) {

   695 #ifdef HAVE_MBRTOWC

   696             incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);

   697 #else

   698             // XXX is this thread-safe?

   699             incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);

   700 #endif

   701             if (incr < 0) {

   702                 NS_WARNING("mbtowc failed: possible charset mismatch");

   703                 // zero-pad and hope for the best

   704                 tmp = (unsigned char) **input;

   705                 incr = 1;

   706             }

   707             **output = (char16_t) tmp;

   708             (*input) += incr;

   709             (*inputLeft) -= incr;

   710             (*output)++;

   711             (*outputLeft)--;

   712         }

   713     }

   714     else {

   715         // wchar_t isn't unicode, so the best we can do is treat the

   716         // input as if it is isolatin1 :(

   717         isolatin1_to_utf16(input, inputLeft, output, outputLeft);

   718     }

   720     return NS_OK;

   721 }

   723 nsresult

   724 nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,

   725                                           uint32_t         *inputLeft,

   726                                           char            **output,

   727                                           uint32_t         *outputLeft)

   728 {

   729     if (gWCharIsUnicode) {

   730         int incr;

   732         while (*inputLeft && *outputLeft >= MB_CUR_MAX) {

   733 #ifdef HAVE_WCRTOMB

   734             incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);

   735 #else

   736             // XXX is this thread-safe?

   737             incr = (int) wctomb(*output, (wchar_t) **input);

   738 #endif

   739             if (incr < 0) {

   740                 NS_WARNING("mbtowc failed: possible charset mismatch");

   741                 **output = (unsigned char) **input; // truncate

   742                 incr = 1;

   743             }

   744             // most likely we're dead anyways if this assertion should fire

   745             NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string");

   746             (*output) += incr;

   747             (*outputLeft) -= incr;

   748             (*input)++;

   749             (*inputLeft)--;

   750         }

   751     }

   752     else {

   753         // wchar_t isn't unicode, so the best we can do is treat the

   754         // input as if it is isolatin1 :(

   755         utf16_to_isolatin1(input, inputLeft, output, outputLeft);

   756     }

   758     return NS_OK;

   759 }

   761 // XXX : for now, return false

   762 bool

   763 nsNativeCharsetConverter::IsNativeUTF8()

   764 {

   765     return false;

   766 }

   768 #endif // USE_STDCONV

   770 //-----------------------------------------------------------------------------

   771 // API implementation

   772 //-----------------------------------------------------------------------------

   774 nsresult

   775 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)

   776 {

   777     output.Truncate();

   779     uint32_t inputLen = input.Length();

   781     nsACString::const_iterator iter;

   782     input.BeginReading(iter);

   784     //

   785     // OPTIMIZATION: preallocate space for largest possible result; convert

   786     // directly into the result buffer to avoid intermediate buffer copy.

   787     //

   788     // this will generally result in a larger allocation, but that seems

   789     // better than an extra buffer copy.

   790     //

   791     if (!output.SetLength(inputLen, fallible_t()))

   792         return NS_ERROR_OUT_OF_MEMORY;

   793     nsAString::iterator out_iter;

   794     output.BeginWriting(out_iter);

   796     char16_t *result = out_iter.get();

   797     uint32_t resultLeft = inputLen;

   799     const char *buf = iter.get();

   800     uint32_t bufLeft = inputLen;

   802     nsNativeCharsetConverter conv;

   803     nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);

   804     if (NS_SUCCEEDED(rv)) {

   805         NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");

   806         output.SetLength(inputLen - resultLeft);

   807     }

   808     return rv;

   809 }

   811 nsresult

   812 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)

   813 {

   814     output.Truncate();

   816     nsAString::const_iterator iter, end;

   817     input.BeginReading(iter);

   818     input.EndReading(end);

   820     // cannot easily avoid intermediate buffer copy.

   821     char temp[4096];

   823     nsNativeCharsetConverter conv;

   825     const char16_t *buf = iter.get();

   826     uint32_t bufLeft = Distance(iter, end);

   827     while (bufLeft) {

   828         char *p = temp;

   829         uint32_t tempLeft = sizeof(temp);

   831         nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);

   832         if (NS_FAILED(rv)) return rv;

   834         if (tempLeft < sizeof(temp))

   835             output.Append(temp, sizeof(temp) - tempLeft);

   836     }

   837     return NS_OK;

   838 }

   840 bool

   841 NS_IsNativeUTF8()

   842 {

   843     return nsNativeCharsetConverter::IsNativeUTF8();

   844 }

   846 void

   847 NS_StartupNativeCharsetUtils()

   848 {

   849     //

   850     // need to initialize the locale or else charset conversion will fail.

   851     // better not delay this in case some other component alters the locale

   852     // settings.

   853     //

   854     // XXX we assume that we are called early enough that we should

   855     // always be the first to care about the locale's charset.

   856     //

   857     setlocale(LC_CTYPE, "");

   859     nsNativeCharsetConverter::GlobalInit();

   860 }

   862 void

   863 NS_ShutdownNativeCharsetUtils()

   864 {

   865     nsNativeCharsetConverter::GlobalShutdown();

   866 }

   868 //-----------------------------------------------------------------------------

   869 // XP_WIN

   870 //-----------------------------------------------------------------------------

   871 #elif defined(XP_WIN)

   873 #include <windows.h>

   874 #include "nsString.h"

   875 #include "nsAString.h"

   876 #include "nsReadableUtils.h"

   878 using namespace mozilla;

   880 nsresult

   881 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)

   882 {

   883     uint32_t inputLen = input.Length();

   885     nsACString::const_iterator iter;

   886     input.BeginReading(iter);

   888     const char *buf = iter.get();

   890     // determine length of result

   891     uint32_t resultLen = 0;

   892     int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0);

   893     if (n > 0)

   894         resultLen += n;

   896     // allocate sufficient space

   897     if (!output.SetLength(resultLen, fallible_t()))

   898         return NS_ERROR_OUT_OF_MEMORY;

   899     if (resultLen > 0) {

   900         nsAString::iterator out_iter;

   901         output.BeginWriting(out_iter);

   903         char16_t *result = out_iter.get();

   905         ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, wwc(result), resultLen);

   906     }

   907     return NS_OK;

   908 }

   910 nsresult

   911 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)

   912 {

   913     uint32_t inputLen = input.Length();

   915     nsAString::const_iterator iter;

   916     input.BeginReading(iter);

   918     char16ptr_t buf = iter.get();

   920     // determine length of result

   921     uint32_t resultLen = 0;

   923     int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0,

   924                                   nullptr, nullptr);

   925     if (n > 0)

   926         resultLen += n;

   928     // allocate sufficient space

   929     if (!output.SetLength(resultLen, fallible_t()))

   930         return NS_ERROR_OUT_OF_MEMORY;

   931     if (resultLen > 0) {

   932         nsACString::iterator out_iter;

   933         output.BeginWriting(out_iter);

   935         // default "defaultChar" is '?', which is an illegal character on windows

   936         // file system.  That will cause file uncreatable. Change it to '_'

   937         const char defaultChar = '_';

   939         char *result = out_iter.get();

   941         ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,

   942                               &defaultChar, nullptr);

   943     }

   944     return NS_OK;

   945 }

   947 // moved from widget/windows/nsToolkit.cpp

   948 int32_t

   949 NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW)

   950 {

   951     return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, wwc(aStrOutW), aBufferSize);

   952 }

   954 int32_t

   955 NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut,

   956                char *aStrOutA, const char *aDefault)

   957 {

   958     if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))

   959         return 0;

   961     int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1,

   962                                                 aStrOutA, aBufferSizeOut,

   963                                                 aDefault, nullptr);

   965     if (!numCharsConverted) {

   966         if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {

   967             // Overflow, add missing null termination but return 0

   968             aStrOutA[aBufferSizeOut-1] = '\0';

   969         }

   970         else {

   971             // Other error, clear string and return 0

   972             aStrOutA[0] = '\0';

   973         }

   974     }

   975     else if (numCharsConverted < aBufferSizeOut) {

   976         // Add 2nd null (really necessary?)

   977         aStrOutA[numCharsConverted] = '\0';

   978     }

   980     return numCharsConverted;

   981 }

   983 #else

   985 #include "nsReadableUtils.h"

   987 nsresult

   988 NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)

   989 {

   990     CopyASCIItoUTF16(input, output);

   991     return NS_OK;

   992 }

   994 nsresult

   995 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)

   996 {

   997     LossyCopyUTF16toASCII(input, output);

   998     return NS_OK;

   999 }

  1001 void

  1002 NS_StartupNativeCharsetUtils()

  1003 {

  1004 }

  1006 void

  1007 NS_ShutdownNativeCharsetUtils()

  1008 {

  1009 }

  1011 #endif

The Tor Browser / file revision

xpcom/io/nsNativeCharsetUtils.cpp@97036ab72558

xpcom/io/nsNativeCharsetUtils.cpp