js/public/CharacterEncoding.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/public/CharacterEncoding.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,205 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99:
     1.6 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.9 +
    1.10 +#ifndef js_CharacterEncoding_h
    1.11 +#define js_CharacterEncoding_h
    1.12 +
    1.13 +#include "mozilla/NullPtr.h"
    1.14 +#include "mozilla/Range.h"
    1.15 +
    1.16 +#include "js/TypeDecls.h"
    1.17 +#include "js/Utility.h"
    1.18 +
    1.19 +namespace js {
    1.20 +struct ThreadSafeContext; // Forward declaration; this header only uses it through pointers.
    1.21 +} // namespace js
    1.22 +
    1.23 +namespace JS {
    1.24 +
    1.25 +/*
    1.26 + * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
    1.27 + * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
    1.28 + * byte is treated as a 2-byte character, and there is no way to pass in a
    1.29 + * string containing characters beyond U+00FF.
    1.30 + */
    1.31 +class Latin1Chars : public mozilla::Range<unsigned char> // Range over Latin-1 bytes, viewed as unsigned char.
    1.32 +{
    1.33 +    typedef mozilla::Range<unsigned char> Base;
    1.34 +
    1.35 +  public:
    1.36 +    Latin1Chars() : Base() {} // Default-constructed base Range.
    1.37 +    Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) {} // Same bytes, reinterpreted as unsigned char.
    1.38 +    Latin1Chars(const char *aBytes, size_t aLength) // NOTE: the const_cast below drops constness; Base is a Range over non-const bytes.
    1.39 +      : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)
    1.40 +    {}
    1.41 +};
    1.42 +
    1.43 +/*
    1.44 + * A Latin1Chars, but with \0 termination for C compatibility.
    1.45 + */
    1.46 +class Latin1CharsZ : public mozilla::RangedPtr<unsigned char>
    1.47 +{
    1.48 +    typedef mozilla::RangedPtr<unsigned char> Base;
    1.49 +
    1.50 +  public:
    1.51 +    Latin1CharsZ() : Base(nullptr, 0) {} // Null pointer, zero length.
    1.52 +
    1.53 +    Latin1CharsZ(char *aBytes, size_t aLength) // aLength does not count the terminator (see the assertion).
    1.54 +      : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
    1.55 +    {
    1.56 +        MOZ_ASSERT(aBytes[aLength] == '\0'); // The caller must already have written the '\0' at index aLength.
    1.57 +    }
    1.58 +
    1.59 +    Latin1CharsZ(unsigned char *aBytes, size_t aLength) // Same contract as the char * overload, without the cast.
    1.60 +      : Base(aBytes, aLength)
    1.61 +    {
    1.62 +        MOZ_ASSERT(aBytes[aLength] == '\0');
    1.63 +    }
    1.64 +
    1.65 +    using Base::operator=; // Make the RangedPtr assignment operators usable on this type.
    1.66 +
    1.67 +    char *c_str() { return reinterpret_cast<char *>(get()); } // get() reinterpreted as a plain char pointer.
    1.68 +};
    1.69 +
    1.70 +class UTF8Chars : public mozilla::Range<unsigned char> // Range over bytes that the class name marks as UTF-8; nothing here validates the encoding.
    1.71 +{
    1.72 +    typedef mozilla::Range<unsigned char> Base;
    1.73 +
    1.74 +  public:
    1.75 +    UTF8Chars() : Base() {} // Default-constructed base Range.
    1.76 +    UTF8Chars(char *aBytes, size_t aLength) // Same bytes, reinterpreted as unsigned char.
    1.77 +      : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
    1.78 +    {}
    1.79 +    UTF8Chars(const char *aBytes, size_t aLength) // NOTE: the const_cast below drops constness; Base is a Range over non-const bytes.
    1.80 +      : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)
    1.81 +    {}
    1.82 +};
    1.83 +
    1.84 +/*
    1.85 + * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
    1.86 + */
    1.87 +class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> // \0-terminated UTF-8 bytes; the constructors assert the terminator.
    1.88 +{
    1.89 +    typedef mozilla::RangedPtr<unsigned char> Base;
    1.90 +
    1.91 +  public:
    1.92 +    UTF8CharsZ() : Base(nullptr, 0) {} // Null pointer, zero length.
    1.93 +
    1.94 +    UTF8CharsZ(char *aBytes, size_t aLength) // aLength does not count the terminator (see the assertion).
    1.95 +      : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
    1.96 +    {
    1.97 +        MOZ_ASSERT(aBytes[aLength] == '\0'); // The caller must already have written the '\0' at index aLength.
    1.98 +    }
    1.99 +
   1.100 +    UTF8CharsZ(unsigned char *aBytes, size_t aLength) // Same contract as the char * overload, without the cast.
   1.101 +      : Base(aBytes, aLength)
   1.102 +    {
   1.103 +        MOZ_ASSERT(aBytes[aLength] == '\0');
   1.104 +    }
   1.105 +
   1.106 +    using Base::operator=; // Make the RangedPtr assignment operators usable on this type.
   1.107 +
   1.108 +    char *c_str() { return reinterpret_cast<char *>(get()); } // get() reinterpreted as a plain char pointer.
   1.109 +};
   1.110 +
   1.111 +/*
   1.112 + * SpiderMonkey uses a 2-byte character representation: it is a
   1.113 + * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
   1.114 + * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
   1.115 + * sufficiently dedicated JavaScript program to be fully Unicode-aware by
   1.116 + * manually interpreting UTF-16 extension characters embedded in the JS
   1.117 + * string.
   1.118 + */
   1.119 +class TwoByteChars : public mozilla::Range<jschar> // Range over jschar units.
   1.120 +{
   1.121 +    typedef mozilla::Range<jschar> Base;
   1.122 +
   1.123 +  public:
   1.124 +    TwoByteChars() : Base() {} // Default-constructed base Range.
   1.125 +    TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {} // NOTE(review): aLength is presumably a count of jschars, not bytes -- confirm against mozilla/Range.h.
   1.126 +    TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast<jschar *>(aChars), aLength) {} // NOTE: drops constness; Base is a Range over non-const jschar.
   1.127 +};
   1.128 +
   1.129 +/*
   1.130 + * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
   1.131 + */
   1.132 +class TwoByteCharsZ : public mozilla::RangedPtr<jschar>
   1.133 +{
   1.134 +    typedef mozilla::RangedPtr<jschar> Base;
   1.135 +
   1.136 +  public:
   1.137 +    TwoByteCharsZ() : Base(nullptr, 0) {} // Null pointer, zero length.
   1.138 +
   1.139 +    TwoByteCharsZ(jschar *chars, size_t length) // length does not count the terminator (see the assertion).
   1.140 +      : Base(chars, length)
   1.141 +    {
   1.142 +        MOZ_ASSERT(chars[length] == '\0'); // The caller must already have written the terminating zero at index length.
   1.143 +    }
   1.144 +
   1.145 +    using Base::operator=; // Make the RangedPtr assignment operators usable on this type.
   1.146 +};
   1.147 +
   1.148 +typedef mozilla::RangedPtr<const jschar> ConstCharPtr; // Shorthand used for the base-class initializers below.
   1.149 +
   1.150 +/*
   1.151 + * Like TwoByteChars, but the chars are const. Note that this is a RangedPtr, not a Range.
   1.152 + */
   1.153 +class ConstTwoByteChars : public mozilla::RangedPtr<const jschar>
   1.154 +{
   1.155 +  public:
   1.156 +    ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {} // Copy constructor.
   1.157 +    ConstTwoByteChars(const mozilla::RangedPtr<const jschar> &s) : ConstCharPtr(s) {} // Conversion from the plain base pointer type.
   1.158 +    ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {} // Forwards (s, len) to the RangedPtr constructor.
   1.159 +    ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len) // NOTE(review): presumably pos is the current position within the len chars at start -- confirm against mozilla/RangedPtr.h.
   1.160 +      : ConstCharPtr(pos, start, len)
   1.161 +    {}
   1.162 +
   1.163 +    using ConstCharPtr::operator=; // Make the RangedPtr assignment operators usable on this type.
   1.164 +};
   1.165 +
   1.166 +
   1.167 +/*
   1.168 + * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
   1.169 + * truncating each 2-byte character in the sequence to a single byte. If the source
   1.170 + * contains any UTF-16 extension characters, then this may give invalid Latin1
   1.171 + * output. The returned string is zero terminated. The returned string or the
   1.172 + * returned string's |start()| must be freed with JS_free or js_free,
   1.173 + * respectively. If allocation fails, an OOM error will be set and the method
   1.174 + * will return null chars (which can be tested for with the ! operator).
   1.175 + * This method cannot trigger GC.
   1.176 + */
   1.177 +extern Latin1CharsZ
   1.178 +LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); // tbchars is passed by value; truncation, ownership and OOM behavior are described in the comment above.
   1.179 +
   1.180 +extern UTF8CharsZ
   1.181 +TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); // NOTE(review): undocumented here; presumably the UTF-8 analogue of the Latin-1 conversion above -- confirm allocation, ownership and failure behavior.
   1.182 +
   1.183 +uint32_t
   1.184 +Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length); // NOTE(review): undocumented here; presumably decodes the one UTF-8 sequence of utf8Length bytes at utf8Buffer -- confirm, including the result for malformed input.
   1.185 +
   1.186 +/*
   1.187 + * Inflate bytes in UTF-8 encoding to jschars.
   1.188 + * - On error, returns an empty TwoByteCharsZ.
   1.189 + * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
   1.190 + *   its length;  the length value excludes the trailing null.
   1.191 + */
   1.192 +extern TwoByteCharsZ
   1.193 +UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); // utf8 is passed by value; outlen is the out-parameter described in the comment above.
   1.194 +
   1.195 +/*
   1.196 + * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
   1.197 + * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
   1.198 + * input.
   1.199 + */
   1.200 +extern TwoByteCharsZ
   1.201 +LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); // Takes the same parameter list as UTF8CharsToNewTwoByteCharsZ.
   1.202 +
   1.203 +} // namespace JS
   1.204 +
   1.205 +inline void JS_free(JS::Latin1CharsZ &ptr) { js_free(static_cast<void *>(ptr.get())); } // Passes the buffer at ptr.get() to js_free; ptr itself is not modified here.
   1.206 +inline void JS_free(JS::UTF8CharsZ &ptr) { js_free(static_cast<void *>(ptr.get())); } // Same, for UTF8CharsZ.
   1.207 +
   1.208 +#endif /* js_CharacterEncoding_h */

mercurial