xpcom/io/nsNativeCharsetUtils.cpp

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with the disk-avoidance requirement
documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* This Source Code Form is subject to the terms of the Mozilla Public
     2  * License, v. 2.0. If a copy of the MPL was not distributed with this
     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 #include "xpcom-private.h"
     7 //-----------------------------------------------------------------------------
     8 // XP_MACOSX or ANDROID
     9 //-----------------------------------------------------------------------------
    10 #if defined(XP_MACOSX) || defined(ANDROID)
    12 #include "nsAString.h"
    13 #include "nsReadableUtils.h"
    14 #include "nsString.h"
    16 nsresult
    17 NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
    18 {
    19     CopyUTF8toUTF16(input, output);
    20     return NS_OK;
    21 }
    23 nsresult
    24 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
    25 {
    26     CopyUTF16toUTF8(input, output);
    27     return NS_OK;
    28 }
    // Start-up hook; the XP_MACOSX / ANDROID branch has nothing to set up.
    void
    NS_StartupNativeCharsetUtils()
    {
        // intentionally empty
    }
    // Shutdown hook; the XP_MACOSX / ANDROID branch has nothing to tear down.
    void
    NS_ShutdownNativeCharsetUtils()
    {
        // intentionally empty
    }
    41 //-----------------------------------------------------------------------------
    42 // XP_UNIX
    43 //-----------------------------------------------------------------------------
    44 #elif defined(XP_UNIX)
    46 #include <stdlib.h>   // mbtowc, wctomb
    47 #include <locale.h>   // setlocale
    48 #include "mozilla/Mutex.h"
    49 #include "nscore.h"
    50 #include "nsAString.h"
    51 #include "nsReadableUtils.h"
    53 using namespace mozilla;
    55 //
    56 // choose a conversion library.  we used to use mbrtowc/wcrtomb under Linux,
    57 // but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
    58 // or not (see bug 206811 and 
    59 // news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
    60 // iconv for all platforms where nl_types.h and langinfo.h are present
    61 // along with iconv.
    62 //
    63 #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
    64 #define USE_ICONV 1
    65 #else
    66 #define USE_STDCONV 1
    67 #endif
    // Widens bytes from *input into UTF-16 code units at *output, one code
    // unit per byte.  Each byte is read as unsigned, so 0x80-0xFF become
    // U+0080-U+00FF.  Copying stops when either counter reaches zero; both
    // cursors and both counters are advanced past what was copied.
    static void
    isolatin1_to_utf16(const char **input, uint32_t *inputLeft, char16_t **output, uint32_t *outputLeft)
    {
        // The number of units copied is bounded by the smaller counter.
        const uint32_t count = (*inputLeft < *outputLeft) ? *inputLeft : *outputLeft;

        const char *src = *input;
        char16_t   *dst = *output;
        for (uint32_t i = 0; i < count; ++i)
            dst[i] = (unsigned char) src[i];

        *input       = src + count;
        *output      = dst + count;
        *inputLeft  -= count;
        *outputLeft -= count;
    }
    // Narrows UTF-16 code units from *input into bytes at *output, keeping
    // only the low 8 bits of each unit (anything above U+00FF is truncated).
    // Copying stops when either counter reaches zero; both cursors and both
    // counters are advanced past what was copied.
    static void
    utf16_to_isolatin1(const char16_t **input, uint32_t *inputLeft, char **output, uint32_t *outputLeft)
    {
        // The number of units copied is bounded by the smaller counter.
        const uint32_t count = (*inputLeft < *outputLeft) ? *inputLeft : *outputLeft;

        const char16_t *src = *input;
        char           *dst = *output;
        for (uint32_t i = 0; i < count; ++i)
            dst[i] = (unsigned char) src[i];

        *input       = src + count;
        *output      = dst + count;
        *inputLeft  -= count;
        *outputLeft -= count;
    }
    93 //-----------------------------------------------------------------------------
    94 // conversion using iconv
    95 //-----------------------------------------------------------------------------
    96 #if defined(USE_ICONV)
    97 #include <nl_types.h> // CODESET
    98 #include <langinfo.h> // nl_langinfo
    99 #include <iconv.h>    // iconv_open, iconv, iconv_close
   100 #include <errno.h>
   101 #include "plstr.h"
   103 #if defined(HAVE_ICONV_WITH_CONST_INPUT)
   104 #define ICONV_INPUT(x) (x)
   105 #else
   106 #define ICONV_INPUT(x) ((char **)x)
   107 #endif
   109 // solaris definitely needs this, but we'll enable it by default
   110 // just in case... but we know for sure that iconv(3) in glibc
   111 // doesn't need this.
   112 #if !defined(__GLIBC__)
   113 #define ENABLE_UTF8_FALLBACK_SUPPORT
   114 #endif
   116 #define INVALID_ICONV_T ((iconv_t) -1)
   118 static inline size_t
   119 xp_iconv(iconv_t converter,
   120          const char **input,
   121          size_t      *inputLeft,
   122          char       **output,
   123          size_t      *outputLeft)
   124 {
   125     size_t res, outputAvail = outputLeft ? *outputLeft : 0;
   126     res = iconv(converter, ICONV_INPUT(input), inputLeft, output, outputLeft);
   127     if (res == (size_t) -1) {
   128         // on some platforms (e.g., linux) iconv will fail with
   129         // E2BIG if it cannot convert _all_ of its input.  it'll
   130         // still adjust all of the in/out params correctly, so we
   131         // can ignore this error.  the assumption is that we will
   132         // be called again to complete the conversion.
   133         if ((errno == E2BIG) && (*outputLeft < outputAvail))
   134             res = 0;
   135     }
   136     return res;
   137 }
   139 static inline void
   140 xp_iconv_reset(iconv_t converter)
   141 {
   142     // NOTE: the man pages on Solaris claim that you can pass nullptr
   143     // for all parameter to reset the converter, but beware the 
   144     // evil Solaris crash if you go down this route >:-)
   146     const char *zero_char_in_ptr  = nullptr;
   147     char       *zero_char_out_ptr = nullptr;
   148     size_t      zero_size_in      = 0,
   149                 zero_size_out     = 0;
   151     xp_iconv(converter, &zero_char_in_ptr,
   152                         &zero_size_in,
   153                         &zero_char_out_ptr,
   154                         &zero_size_out);
   155 }
   157 static inline iconv_t
   158 xp_iconv_open(const char **to_list, const char **from_list)
   159 {
   160     iconv_t res;
   161     const char **from_name;
   162     const char **to_name;
   164     // try all possible combinations to locate a converter.
   165     to_name = to_list;
   166     while (*to_name) {
   167         if (**to_name) {
   168             from_name = from_list;
   169             while (*from_name) {
   170                 if (**from_name) {
   171                     res = iconv_open(*to_name, *from_name);
   172                     if (res != INVALID_ICONV_T)
   173                         return res;
   174                 }
   175                 from_name++;
   176             }
   177         }
   178         to_name++;
   179     }
   181     return INVALID_ICONV_T;
   182 }
   184 /* 
   185  * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
   186  * have to use UTF-16 with iconv(3) on platforms where it's supported.
   187  * However, the way UTF-16 and UCS-2 are interpreted varies across platforms 
   188  * and implementations of iconv(3). On Tru64, it also depends on the environment
   189  * variable. To avoid the trouble arising from byte-swapping 
   190  * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling 
   191  * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 
   192  * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
   193  * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
   194  * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
   195  * variable ICONV_BYTEORDER is set to 'big-endian', about which not much 
   196  * can be done other than adding a note in the release notes. (bug 206811)
   197  */
   // Null-terminated list of iconv names for UTF-16 / UCS-2.  The
   // endianness-specific spellings come first so that they are preferred
   // over the generic ones.
   static const char *UTF_16_NAMES[] = {
   #if defined(IS_LITTLE_ENDIAN)
       "UTF-16LE",
   #  if defined(__GLIBC__)
       "UNICODELITTLE",
   #  endif
       "UCS-2LE",
   #else
       "UTF-16BE",
   #  if defined(__GLIBC__)
       "UNICODEBIG",
   #  endif
       "UCS-2BE",
   #endif
       // generic spellings, used only when none of the above is accepted
       "UTF-16",
       "UCS-2",
       "UCS2",
       "UCS_2",
       "ucs-2",
       "ucs2",
       "ucs_2",
       nullptr
   };
   #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   // Null-terminated list of spellings of "UTF-8" to try with iconv_open().
   // Only compiled when the two-step UTF-8 fallback is enabled.
   static const char *UTF_8_NAMES[] = {
       "UTF-8", "UTF8", "UTF_8",
       "utf-8", "utf8", "utf_8",
       nullptr
   };
   #endif
   // Null-terminated list of spellings of "ISO-8859-1" to try with
   // iconv_open().  With glibc only the canonical spelling is listed.
   static const char *ISO_8859_1_NAMES[] = {
       "ISO-8859-1",
   #if !defined(__GLIBC__)
       "ISO8859-1", "ISO88591", "ISO_8859_1", "ISO8859_1",
       "iso-8859-1", "iso8859-1", "iso88591", "iso_8859_1", "iso8859_1",
   #endif
       nullptr
   };
   250 class nsNativeCharsetConverter
   251 {
   252 public:
   253     nsNativeCharsetConverter();
   254    ~nsNativeCharsetConverter();
   256     nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,
   257                              char16_t       **output, uint32_t *outputLeft);
   258     nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
   259                              char            **output, uint32_t *outputLeft);
   261     static void GlobalInit();
   262     static void GlobalShutdown();
   263     static bool IsNativeUTF8();
   265 private:
   266     static iconv_t gNativeToUnicode;
   267     static iconv_t gUnicodeToNative;
   268 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   269     static iconv_t gNativeToUTF8;
   270     static iconv_t gUTF8ToNative;
   271     static iconv_t gUnicodeToUTF8;
   272     static iconv_t gUTF8ToUnicode;
   273 #endif
   274     static Mutex  *gLock;
   275     static bool    gInitialized;
   276     static bool    gIsNativeUTF8;
   278     static void LazyInit();
   280     static void Lock()   { if (gLock) gLock->Lock();   }
   281     static void Unlock() { if (gLock) gLock->Unlock(); }
   282 };
   284 iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
   285 iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
   286 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   287 iconv_t nsNativeCharsetConverter::gNativeToUTF8    = INVALID_ICONV_T;
   288 iconv_t nsNativeCharsetConverter::gUTF8ToNative    = INVALID_ICONV_T;
   289 iconv_t nsNativeCharsetConverter::gUnicodeToUTF8   = INVALID_ICONV_T;
   290 iconv_t nsNativeCharsetConverter::gUTF8ToUnicode   = INVALID_ICONV_T;
   291 #endif
   292 Mutex  *nsNativeCharsetConverter::gLock            = nullptr;
   293 bool    nsNativeCharsetConverter::gInitialized     = false;
   294 bool    nsNativeCharsetConverter::gIsNativeUTF8    = false;
   296 void
   297 nsNativeCharsetConverter::LazyInit()
   298 {
   299     // LazyInit may be called before NS_StartupNativeCharsetUtils, but
   300     // the setlocale it does has to be called before nl_langinfo. Like in
   301     // NS_StartupNativeCharsetUtils, assume we are called early enough that
   302     // we are the first to care about the locale's charset.
   303     if (!gLock)
   304       setlocale(LC_CTYPE, "");
   305     const char  *blank_list[] = { "", nullptr };
   306     const char **native_charset_list = blank_list;
   307     const char  *native_charset = nl_langinfo(CODESET);
   308     if (native_charset == nullptr) {
   309         NS_ERROR("native charset is unknown");
   310         // fallback to ISO-8859-1
   311         native_charset_list = ISO_8859_1_NAMES;
   312     }
   313     else
   314         native_charset_list[0] = native_charset;
   316     // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET) 
   317     // return 'UTF-8' (or 'utf-8')
   318     if (!PL_strcasecmp(native_charset, "UTF-8"))
   319         gIsNativeUTF8 = true;
   321     gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
   322     gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
   324 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   325     if (gNativeToUnicode == INVALID_ICONV_T) {
   326         gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
   327         gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
   328         NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
   329         NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
   330     }
   331     if (gUnicodeToNative == INVALID_ICONV_T) {
   332         gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
   333         gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
   334         NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
   335         NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
   336     }
   337 #else
   338     NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
   339     NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
   340 #endif
   342     /*
   343      * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
   344      * prepend a byte order mark unicode character (BOM, u+FEFF) during
   345      * the first use of the iconv converter. The same is the case of 
   346      * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used. 
   347      * However, we use 'UTF-16LE/BE' in both cases, instead so that we 
   348      * should be safe. But just in case...
   349      *
   350      * This dummy conversion gets rid of the BOMs and fixes bug 153562.
   351      */
   352     char dummy_input[1] = { ' ' };
   353     char dummy_output[4];
   355     if (gNativeToUnicode != INVALID_ICONV_T) {
   356 	const char *input = dummy_input;
   357 	size_t input_left = sizeof(dummy_input);
   358 	char *output = dummy_output;
   359 	size_t output_left = sizeof(dummy_output);
   361 	xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
   362     }
   363 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   364     if (gUTF8ToUnicode != INVALID_ICONV_T) {
   365 	const char *input = dummy_input;
   366 	size_t input_left = sizeof(dummy_input);
   367 	char *output = dummy_output;
   368 	size_t output_left = sizeof(dummy_output);
   370 	xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
   371     }
   372 #endif
   374     gInitialized = true;
   375 }
   377 void
   378 nsNativeCharsetConverter::GlobalInit()
   379 {
   380     gLock = new Mutex("nsNativeCharsetConverter.gLock");
   381 }
   383 void
   384 nsNativeCharsetConverter::GlobalShutdown()
   385 {
   386     if (gLock) {
   387         delete gLock;
   388         gLock = nullptr;
   389     }
   391     if (gNativeToUnicode != INVALID_ICONV_T) {
   392         iconv_close(gNativeToUnicode);
   393         gNativeToUnicode = INVALID_ICONV_T;
   394     }
   396     if (gUnicodeToNative != INVALID_ICONV_T) {
   397         iconv_close(gUnicodeToNative);
   398         gUnicodeToNative = INVALID_ICONV_T;
   399     }
   401 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   402     if (gNativeToUTF8 != INVALID_ICONV_T) {
   403         iconv_close(gNativeToUTF8);
   404         gNativeToUTF8 = INVALID_ICONV_T;
   405     }
   406     if (gUTF8ToNative != INVALID_ICONV_T) {
   407         iconv_close(gUTF8ToNative);
   408         gUTF8ToNative = INVALID_ICONV_T;
   409     }
   410     if (gUnicodeToUTF8 != INVALID_ICONV_T) {
   411         iconv_close(gUnicodeToUTF8);
   412         gUnicodeToUTF8 = INVALID_ICONV_T;
   413     }
   414     if (gUTF8ToUnicode != INVALID_ICONV_T) {
   415         iconv_close(gUTF8ToUnicode);
   416         gUTF8ToUnicode = INVALID_ICONV_T;
   417     }
   418 #endif
   420     gInitialized = false;
   421 }
   423 nsNativeCharsetConverter::nsNativeCharsetConverter()
   424 {
   425     Lock();
   426     if (!gInitialized)
   427         LazyInit();
   428 }
   430 nsNativeCharsetConverter::~nsNativeCharsetConverter()
   431 {
   432     // reset converters for next time
   433     if (gNativeToUnicode != INVALID_ICONV_T)
   434         xp_iconv_reset(gNativeToUnicode);
   435     if (gUnicodeToNative != INVALID_ICONV_T)
   436         xp_iconv_reset(gUnicodeToNative);
   437 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   438     if (gNativeToUTF8 != INVALID_ICONV_T)
   439         xp_iconv_reset(gNativeToUTF8);
   440     if (gUTF8ToNative != INVALID_ICONV_T)
   441         xp_iconv_reset(gUTF8ToNative);
   442     if (gUnicodeToUTF8 != INVALID_ICONV_T)
   443         xp_iconv_reset(gUnicodeToUTF8);
   444     if (gUTF8ToUnicode != INVALID_ICONV_T)
   445         xp_iconv_reset(gUTF8ToUnicode);
   446 #endif
   447     Unlock();
   448 }
   450 nsresult
   451 nsNativeCharsetConverter::NativeToUnicode(const char **input,
   452                                           uint32_t    *inputLeft,
   453                                           char16_t  **output,
   454                                           uint32_t    *outputLeft)
   455 {
   456     size_t res = 0;
   457     size_t inLeft = (size_t) *inputLeft;
   458     size_t outLeft = (size_t) *outputLeft * 2;
   460     if (gNativeToUnicode != INVALID_ICONV_T) {
   462         res = xp_iconv(gNativeToUnicode, input, &inLeft, (char **) output, &outLeft);
   464         *inputLeft = inLeft;
   465         *outputLeft = outLeft / 2;
   466         if (res != (size_t) -1) 
   467             return NS_OK;
   469         NS_WARNING("conversion from native to utf-16 failed");
   471         // reset converter
   472         xp_iconv_reset(gNativeToUnicode);
   473     }
   474 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   475     else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
   476              (gUTF8ToUnicode != INVALID_ICONV_T)) {
   477         // convert first to UTF8, then from UTF8 to UCS2
   478         const char *in = *input;
   480         char ubuf[1024];
   482         // we assume we're always called with enough space in |output|,
   483         // so convert many chars at a time...
   484         while (inLeft) {
   485             char *p = ubuf;
   486             size_t n = sizeof(ubuf);
   487             res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
   488             if (res == (size_t) -1) {
   489                 NS_ERROR("conversion from native to utf-8 failed");
   490                 break;
   491             }
   492             NS_ASSERTION(outLeft > 0, "bad assumption");
   493             p = ubuf;
   494             n = sizeof(ubuf) - n;
   495             res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
   496             if (res == (size_t) -1) {
   497                 NS_ERROR("conversion from utf-8 to utf-16 failed");
   498                 break;
   499             }
   500         }
   502         (*input) += (*inputLeft - inLeft);
   503         *inputLeft = inLeft;
   504         *outputLeft = outLeft / 2;
   506         if (res != (size_t) -1) 
   507             return NS_OK;
   509         // reset converters
   510         xp_iconv_reset(gNativeToUTF8);
   511         xp_iconv_reset(gUTF8ToUnicode);
   512     }
   513 #endif
   515     // fallback: zero-pad and hope for the best
   516     // XXX This is lame and we have to do better.
   517     isolatin1_to_utf16(input, inputLeft, output, outputLeft);
   519     return NS_OK;
   520 }
   522 nsresult
   523 nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
   524                                           uint32_t         *inputLeft,
   525                                           char            **output,
   526                                           uint32_t         *outputLeft)
   527 {
   528     size_t res = 0;
   529     size_t inLeft = (size_t) *inputLeft * 2;
   530     size_t outLeft = (size_t) *outputLeft;
   532     if (gUnicodeToNative != INVALID_ICONV_T) {
   533         res = xp_iconv(gUnicodeToNative, (const char **) input, &inLeft, output, &outLeft);
   535         *inputLeft = inLeft / 2;
   536         *outputLeft = outLeft;
   537         if (res != (size_t) -1) {
   538             return NS_OK;
   539         }
   541         NS_ERROR("iconv failed");
   543         // reset converter
   544         xp_iconv_reset(gUnicodeToNative);
   545     }
   546 #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
   547     else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
   548              (gUTF8ToNative != INVALID_ICONV_T)) {
   549         const char *in = (const char *) *input;
   551         char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
   553         // convert one uchar at a time...
   554         while (inLeft && outLeft) {
   555             char *p = ubuf;
   556             size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t);
   557             res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
   558             if (res == (size_t) -1) {
   559                 NS_ERROR("conversion from utf-16 to utf-8 failed");
   560                 break;
   561             }
   562             p = ubuf;
   563             n = sizeof(ubuf) - n;
   564             res = xp_iconv(gUTF8ToNative, (const char **) &p, &n, output, &outLeft);
   565             if (res == (size_t) -1) {
   566                 if (errno == E2BIG) {
   567                     // not enough room for last uchar... back up and return.
   568                     in -= sizeof(char16_t);
   569                     res = 0;
   570                 }
   571                 else
   572                     NS_ERROR("conversion from utf-8 to native failed");
   573                 break;
   574             }
   575             inLeft -= sizeof(char16_t);
   576         }
   578         (*input) += (*inputLeft - inLeft / 2);
   579         *inputLeft = inLeft / 2;
   580         *outputLeft = outLeft;
   581         if (res != (size_t) -1) {
   582             return NS_OK;
   583         }
   585         // reset converters
   586         xp_iconv_reset(gUnicodeToUTF8);
   587         xp_iconv_reset(gUTF8ToNative);
   588     }
   589 #endif
   591     // fallback: truncate and hope for the best
   592     // XXX This is lame and we have to do better.
   593     utf16_to_isolatin1(input, inputLeft, output, outputLeft);
   595     return NS_OK;
   596 }
   598 bool
   599 nsNativeCharsetConverter::IsNativeUTF8()
   600 {
   601     if (!gInitialized) {
   602         Lock();
   603         if (!gInitialized)
   604            LazyInit();
   605         Unlock();
   606     }
   607     return gIsNativeUTF8; 
   608 }
   610 #endif // USE_ICONV
   612 //-----------------------------------------------------------------------------
   613 // conversion using mb[r]towc/wc[r]tomb
   614 //-----------------------------------------------------------------------------
   615 #if defined(USE_STDCONV)
   616 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
   617 #include <wchar.h>    // mbrtowc, wcrtomb
   618 #endif
   620 class nsNativeCharsetConverter
   621 {
   622 public:
   623     nsNativeCharsetConverter();
   625     nsresult NativeToUnicode(const char      **input , uint32_t *inputLeft,
   626                              char16_t       **output, uint32_t *outputLeft);
   627     nsresult UnicodeToNative(const char16_t **input , uint32_t *inputLeft,
   628                              char            **output, uint32_t *outputLeft);
   630     static void GlobalInit();
   631     static void GlobalShutdown() { }
   632     static bool IsNativeUTF8();
   634 private:
   635     static bool gWCharIsUnicode;
   637 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
   638     mbstate_t ps;
   639 #endif
   640 };
   642 bool nsNativeCharsetConverter::gWCharIsUnicode = false;
   644 nsNativeCharsetConverter::nsNativeCharsetConverter()
   645 {
   646 #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
   647     memset(&ps, 0, sizeof(ps));
   648 #endif
   649 }
   651 void
   652 nsNativeCharsetConverter::GlobalInit()
   653 {
   654     // verify that wchar_t for the current locale is actually unicode.
   655     // if it is not, then we should avoid calling mbtowc/wctomb and
   656     // just fallback on zero-pad/truncation conversion.
   657     //
   658     // this test cannot be done at build time because the encoding of
   659     // wchar_t may depend on the runtime locale.  sad, but true!!
   660     //
   661     // so, if wchar_t is unicode then converting an ASCII character
   662     // to wchar_t should not change its numeric value.  we'll just
   663     // check what happens with the ASCII 'a' character.
   664     //
   665     // this test is not perfect... obviously, it could yield false
   666     // positives, but then at least ASCII text would be converted
   667     // properly (or maybe just the 'a' character) -- oh well :(
   669     char a = 'a';
   670     unsigned int w = 0;
   672     int res = mbtowc((wchar_t *) &w, &a, 1);
   674     gWCharIsUnicode = (res != -1 && w == 'a');
   676 #ifdef DEBUG
   677     if (!gWCharIsUnicode)
   678         NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
   679 #endif
   680 }
   682 nsresult
   683 nsNativeCharsetConverter::NativeToUnicode(const char **input,
   684                                           uint32_t    *inputLeft,
   685                                           char16_t  **output,
   686                                           uint32_t    *outputLeft)
   687 {
   688     if (gWCharIsUnicode) {
   689         int incr;
   691         // cannot use wchar_t here since it may have been redefined (e.g.,
   692         // via -fshort-wchar).  hopefully, sizeof(tmp) is sufficient XP.
   693         unsigned int tmp = 0;
   694         while (*inputLeft && *outputLeft) {
   695 #ifdef HAVE_MBRTOWC
   696             incr = (int) mbrtowc((wchar_t *) &tmp, *input, *inputLeft, &ps);
   697 #else
   698             // XXX is this thread-safe?
   699             incr = (int) mbtowc((wchar_t *) &tmp, *input, *inputLeft);
   700 #endif
   701             if (incr < 0) {
   702                 NS_WARNING("mbtowc failed: possible charset mismatch");
   703                 // zero-pad and hope for the best
   704                 tmp = (unsigned char) **input;
   705                 incr = 1;
   706             }
   707             **output = (char16_t) tmp;
   708             (*input) += incr;
   709             (*inputLeft) -= incr;
   710             (*output)++;
   711             (*outputLeft)--;
   712         }
   713     }
   714     else {
   715         // wchar_t isn't unicode, so the best we can do is treat the
   716         // input as if it is isolatin1 :(
   717         isolatin1_to_utf16(input, inputLeft, output, outputLeft);
   718     }
   720     return NS_OK;
   721 }
   723 nsresult
   724 nsNativeCharsetConverter::UnicodeToNative(const char16_t **input,
   725                                           uint32_t         *inputLeft,
   726                                           char            **output,
   727                                           uint32_t         *outputLeft)
   728 {
   729     if (gWCharIsUnicode) {
   730         int incr;
   732         while (*inputLeft && *outputLeft >= MB_CUR_MAX) {
   733 #ifdef HAVE_WCRTOMB
   734             incr = (int) wcrtomb(*output, (wchar_t) **input, &ps);
   735 #else
   736             // XXX is this thread-safe?
   737             incr = (int) wctomb(*output, (wchar_t) **input);
   738 #endif
   739             if (incr < 0) {
   740                 NS_WARNING("mbtowc failed: possible charset mismatch");
   741                 **output = (unsigned char) **input; // truncate
   742                 incr = 1;
   743             }
   744             // most likely we're dead anyways if this assertion should fire
   745             NS_ASSERTION(uint32_t(incr) <= *outputLeft, "wrote beyond end of string");
   746             (*output) += incr;
   747             (*outputLeft) -= incr;
   748             (*input)++;
   749             (*inputLeft)--;
   750         }
   751     }
   752     else {
   753         // wchar_t isn't unicode, so the best we can do is treat the
   754         // input as if it is isolatin1 :(
   755         utf16_to_isolatin1(input, inputLeft, output, outputLeft);
   756     }
   758     return NS_OK;
   759 }
   761 // XXX : for now, return false
   762 bool
   763 nsNativeCharsetConverter::IsNativeUTF8()
   764 {
   765     return false;
   766 }
   768 #endif // USE_STDCONV
   770 //-----------------------------------------------------------------------------
   771 // API implementation
   772 //-----------------------------------------------------------------------------
   774 nsresult
   775 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
   776 {
   777     output.Truncate();
   779     uint32_t inputLen = input.Length();
   781     nsACString::const_iterator iter;
   782     input.BeginReading(iter);
   784     //
   785     // OPTIMIZATION: preallocate space for largest possible result; convert
   786     // directly into the result buffer to avoid intermediate buffer copy.
   787     //
   788     // this will generally result in a larger allocation, but that seems
   789     // better than an extra buffer copy.
   790     //
   791     if (!output.SetLength(inputLen, fallible_t()))
   792         return NS_ERROR_OUT_OF_MEMORY;
   793     nsAString::iterator out_iter;
   794     output.BeginWriting(out_iter);
   796     char16_t *result = out_iter.get();
   797     uint32_t resultLeft = inputLen;
   799     const char *buf = iter.get();
   800     uint32_t bufLeft = inputLen;
   802     nsNativeCharsetConverter conv;
   803     nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
   804     if (NS_SUCCEEDED(rv)) {
   805         NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
   806         output.SetLength(inputLen - resultLeft);
   807     }
   808     return rv;
   809 }
   811 nsresult
   812 NS_CopyUnicodeToNative(const nsAString &input, nsACString &output)
   813 {
   814     output.Truncate();
   816     nsAString::const_iterator iter, end;
   817     input.BeginReading(iter);
   818     input.EndReading(end);
   820     // cannot easily avoid intermediate buffer copy.
   821     char temp[4096];
   823     nsNativeCharsetConverter conv;
   825     const char16_t *buf = iter.get();
   826     uint32_t bufLeft = Distance(iter, end);
   827     while (bufLeft) {
   828         char *p = temp;
   829         uint32_t tempLeft = sizeof(temp);
   831         nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
   832         if (NS_FAILED(rv)) return rv;
   834         if (tempLeft < sizeof(temp))
   835             output.Append(temp, sizeof(temp) - tempLeft);
   836     }
   837     return NS_OK;
   838 }
   840 bool
   841 NS_IsNativeUTF8()
   842 {
   843     return nsNativeCharsetConverter::IsNativeUTF8();
   844 }
   846 void
   847 NS_StartupNativeCharsetUtils()
   848 {
   849     //
   850     // need to initialize the locale or else charset conversion will fail.
   851     // better not delay this in case some other component alters the locale
   852     // settings.
   853     //
   854     // XXX we assume that we are called early enough that we should
   855     // always be the first to care about the locale's charset.
   856     //
   857     setlocale(LC_CTYPE, "");
   859     nsNativeCharsetConverter::GlobalInit();
   860 }
   862 void
   863 NS_ShutdownNativeCharsetUtils()
   864 {
   865     nsNativeCharsetConverter::GlobalShutdown();
   866 }
   868 //-----------------------------------------------------------------------------
   869 // XP_WIN
   870 //-----------------------------------------------------------------------------
   871 #elif defined(XP_WIN)
   873 #include <windows.h>
   874 #include "nsString.h"
   875 #include "nsAString.h"
   876 #include "nsReadableUtils.h"
   878 using namespace mozilla;
   880 nsresult
   881 NS_CopyNativeToUnicode(const nsACString &input, nsAString &output)
   882 {
   883     uint32_t inputLen = input.Length();
   885     nsACString::const_iterator iter;
   886     input.BeginReading(iter);
   888     const char *buf = iter.get();
   890     // determine length of result
   891     uint32_t resultLen = 0;
   892     int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0);
   893     if (n > 0)
   894         resultLen += n;
   896     // allocate sufficient space
   897     if (!output.SetLength(resultLen, fallible_t()))
   898         return NS_ERROR_OUT_OF_MEMORY;
   899     if (resultLen > 0) {
   900         nsAString::iterator out_iter;
   901         output.BeginWriting(out_iter);
   903         char16_t *result = out_iter.get();
   905         ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, wwc(result), resultLen);
   906     }
   907     return NS_OK;
   908 }
   910 nsresult
   911 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
   912 {
   913     uint32_t inputLen = input.Length();
   915     nsAString::const_iterator iter;
   916     input.BeginReading(iter);
   918     char16ptr_t buf = iter.get();
   920     // determine length of result
   921     uint32_t resultLen = 0;
   923     int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0,
   924                                   nullptr, nullptr);
   925     if (n > 0)
   926         resultLen += n;
   928     // allocate sufficient space
   929     if (!output.SetLength(resultLen, fallible_t()))
   930         return NS_ERROR_OUT_OF_MEMORY;
   931     if (resultLen > 0) {
   932         nsACString::iterator out_iter;
   933         output.BeginWriting(out_iter);
   935         // default "defaultChar" is '?', which is an illegal character on windows
   936         // file system.  That will cause file uncreatable. Change it to '_'
   937         const char defaultChar = '_';
   939         char *result = out_iter.get();
   941         ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
   942                               &defaultChar, nullptr);
   943     }
   944     return NS_OK;
   945 }
   947 // moved from widget/windows/nsToolkit.cpp
   948 int32_t 
   949 NS_ConvertAtoW(const char *aStrInA, int aBufferSize, char16_t *aStrOutW)
   950 {
   951     return MultiByteToWideChar(CP_ACP, 0, aStrInA, -1, wwc(aStrOutW), aBufferSize);
   952 }
   954 int32_t 
   955 NS_ConvertWtoA(const char16_t *aStrInW, int aBufferSizeOut,
   956                char *aStrOutA, const char *aDefault)
   957 {
   958     if ((!aStrInW) || (!aStrOutA) || (aBufferSizeOut <= 0))
   959         return 0;
   961     int numCharsConverted = WideCharToMultiByte(CP_ACP, 0, char16ptr_t(aStrInW), -1,
   962                                                 aStrOutA, aBufferSizeOut,
   963                                                 aDefault, nullptr);
   965     if (!numCharsConverted) {
   966         if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
   967             // Overflow, add missing null termination but return 0
   968             aStrOutA[aBufferSizeOut-1] = '\0';
   969         }
   970         else {
   971             // Other error, clear string and return 0
   972             aStrOutA[0] = '\0';
   973         }
   974     }
   975     else if (numCharsConverted < aBufferSizeOut) {
   976         // Add 2nd null (really necessary?)
   977         aStrOutA[numCharsConverted] = '\0';
   978     }
   980     return numCharsConverted;
   981 }
   983 #else
   985 #include "nsReadableUtils.h"
   987 nsresult
   988 NS_CopyNativeToUnicode(const nsACString &input, nsAString  &output)
   989 {
   990     CopyASCIItoUTF16(input, output);
   991     return NS_OK;
   992 }
   994 nsresult
   995 NS_CopyUnicodeToNative(const nsAString  &input, nsACString &output)
   996 {
   997     LossyCopyUTF16toASCII(input, output);
   998     return NS_OK;
   999 }
  // Generic fallback: no startup work is performed.  The empty body mirrors
  // the XP_MACOSX/ANDROID variant of this function earlier in the file.
  void
  NS_StartupNativeCharsetUtils()
  {
  }
  // Generic fallback: no shutdown work is performed.  The empty body mirrors
  // the XP_MACOSX/ANDROID variant of this function earlier in the file.
  void
  NS_ShutdownNativeCharsetUtils()
  {
  }
  1011 #endif

mercurial