The Tor Browser: js/public/CharacterEncoding.h@925c144e1f1f

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-

     2  * vim: set ts=8 sts=4 et sw=4 tw=99:

     3  * This Source Code Form is subject to the terms of the Mozilla Public

     4  * License, v. 2.0. If a copy of the MPL was not distributed with this

     5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     7 #ifndef js_CharacterEncoding_h

     8 #define js_CharacterEncoding_h

    10 #include "mozilla/NullPtr.h"

    11 #include "mozilla/Range.h"

    13 #include "js/TypeDecls.h"

    14 #include "js/Utility.h"

    16 namespace js {

    17 struct ThreadSafeContext;

    18 }

    20 namespace JS {

    22 /*

    23  * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI

    24  * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each

    25  * byte is treated as a 2-byte character, and there is no way to pass in a

    26  * string containing characters beyond U+00FF.

    27  */

    28 class Latin1Chars : public mozilla::Range<unsigned char>

    29 {

    30     typedef mozilla::Range<unsigned char> Base;

    32   public:

    33     Latin1Chars() : Base() {}

    34     Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) {}

    35     Latin1Chars(const char *aBytes, size_t aLength)

    36       : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)

    37     {}

    38 };

    40 /*

    41  * A Latin1Chars, but with \0 termination for C compatibility.

    42  */

    43 class Latin1CharsZ : public mozilla::RangedPtr<unsigned char>

    44 {

    45     typedef mozilla::RangedPtr<unsigned char> Base;

    47   public:

    48     Latin1CharsZ() : Base(nullptr, 0) {}

    50     Latin1CharsZ(char *aBytes, size_t aLength)

    51       : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)

    52     {

    53         MOZ_ASSERT(aBytes[aLength] == '\0');

    54     }

    56     Latin1CharsZ(unsigned char *aBytes, size_t aLength)

    57       : Base(aBytes, aLength)

    58     {

    59         MOZ_ASSERT(aBytes[aLength] == '\0');

    60     }

    62     using Base::operator=;

    64     char *c_str() { return reinterpret_cast<char *>(get()); }

    65 };

    67 class UTF8Chars : public mozilla::Range<unsigned char>

    68 {

    69     typedef mozilla::Range<unsigned char> Base;

    71   public:

    72     UTF8Chars() : Base() {}

    73     UTF8Chars(char *aBytes, size_t aLength)

    74       : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)

    75     {}

    76     UTF8Chars(const char *aBytes, size_t aLength)

    77       : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)

    78     {}

    79 };

    81 /*

    82  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.

    83  */

    84 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>

    85 {

    86     typedef mozilla::RangedPtr<unsigned char> Base;

    88   public:

    89     UTF8CharsZ() : Base(nullptr, 0) {}

    91     UTF8CharsZ(char *aBytes, size_t aLength)

    92       : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)

    93     {

    94         MOZ_ASSERT(aBytes[aLength] == '\0');

    95     }

    97     UTF8CharsZ(unsigned char *aBytes, size_t aLength)

    98       : Base(aBytes, aLength)

    99     {

   100         MOZ_ASSERT(aBytes[aLength] == '\0');

   101     }

   103     using Base::operator=;

   105     char *c_str() { return reinterpret_cast<char *>(get()); }

   106 };

   108 /*

   109  * SpiderMonkey uses a 2-byte character representation: it is a

   110  * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,

   111  * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a

   112  * sufficiently dedicated JavaScript program to be fully unicode-aware by

   113  * manually interpreting UTF-16 extension characters embedded in the JS

   114  * string.

   115  */

   116 class TwoByteChars : public mozilla::Range<jschar>

   117 {

   118     typedef mozilla::Range<jschar> Base;

   120   public:

   121     TwoByteChars() : Base() {}

   122     TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {}

   123     TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast<jschar *>(aChars), aLength) {}

   124 };

   126 /*

   127  * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.

   128  */

   129 class TwoByteCharsZ : public mozilla::RangedPtr<jschar>

   130 {

   131     typedef mozilla::RangedPtr<jschar> Base;

   133   public:

   134     TwoByteCharsZ() : Base(nullptr, 0) {}

   136     TwoByteCharsZ(jschar *chars, size_t length)

   137       : Base(chars, length)

   138     {

   139         MOZ_ASSERT(chars[length] == '\0');

   140     }

   142     using Base::operator=;

   143 };

   145 typedef mozilla::RangedPtr<const jschar> ConstCharPtr;

   147 /*

   148  * Like TwoByteChars, but the chars are const.

   149  */

   150 class ConstTwoByteChars : public mozilla::RangedPtr<const jschar>

   151 {

   152   public:

   153     ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {}

   154     ConstTwoByteChars(const mozilla::RangedPtr<const jschar> &s) : ConstCharPtr(s) {}

   155     ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {}

   156     ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len)

   157       : ConstCharPtr(pos, start, len)

   158     {}

   160     using ConstCharPtr::operator=;

   161 };

   164 /*

   165  * Convert a 2-byte character sequence to "ISO-Latin-1". This works by

   166  * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source

   167  * contains any UTF-16 extension characters, then this may give invalid Latin1

   168  * output. The returned string is zero terminated. The returned string or the

   169  * returned string's |start()| must be freed with JS_free or js_free,

   170  * respectively. If allocation fails, an OOM error will be set and the method

   171  * will return a nullptr chars (which can be tested for with the ! operator).

   172  * This method cannot trigger GC.

   173  */

   174 extern Latin1CharsZ

   175 LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars);

   177 extern UTF8CharsZ

   178 TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars);

   180 uint32_t

   181 Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length);

   183 /*

   184  * Inflate bytes in UTF-8 encoding to jschars.

   185  * - On error, returns an empty TwoByteCharsZ.

   186  * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold

   187  *   its length;  the length value excludes the trailing null.

   188  */

   189 extern TwoByteCharsZ

   190 UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);

   192 /*

   193  * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters

   194  * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8

   195  * input.

   196  */

   197 extern TwoByteCharsZ

   198 LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);

   200 } // namespace JS

   202 inline void JS_free(JS::Latin1CharsZ &ptr) { js_free((void*)ptr.get()); }

   203 inline void JS_free(JS::UTF8CharsZ &ptr) { js_free((void*)ptr.get()); }

   205 #endif /* js_CharacterEncoding_h */

The Tor Browser / file revision