1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/public/CharacterEncoding.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,205 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99: 1.6 + * This Source Code Form is subject to the terms of the Mozilla Public 1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.9 + 1.10 +#ifndef js_CharacterEncoding_h 1.11 +#define js_CharacterEncoding_h 1.12 + 1.13 +#include "mozilla/NullPtr.h" 1.14 +#include "mozilla/Range.h" 1.15 + 1.16 +#include "js/TypeDecls.h" 1.17 +#include "js/Utility.h" 1.18 + 1.19 +namespace js { 1.20 +struct ThreadSafeContext; 1.21 +} 1.22 + 1.23 +namespace JS { 1.24 + 1.25 +/* 1.26 + * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI 1.27 + * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each 1.28 + * byte is treated as a 2-byte character, and there is no way to pass in a 1.29 + * string containing characters beyond U+00FF. 1.30 + */ 1.31 +class Latin1Chars : public mozilla::Range<unsigned char> 1.32 +{ 1.33 + typedef mozilla::Range<unsigned char> Base; 1.34 + 1.35 + public: 1.36 + Latin1Chars() : Base() {} 1.37 + Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) {} 1.38 + Latin1Chars(const char *aBytes, size_t aLength) 1.39 + : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength) 1.40 + {} 1.41 +}; 1.42 + 1.43 +/* 1.44 + * A Latin1Chars, but with \0 termination for C compatibility. 1.45 + */ 1.46 +class Latin1CharsZ : public mozilla::RangedPtr<unsigned char> 1.47 +{ 1.48 + typedef mozilla::RangedPtr<unsigned char> Base; 1.49 + 1.50 + public: 1.51 + Latin1CharsZ() : Base(nullptr, 0) {} 1.52 + 1.53 + Latin1CharsZ(char *aBytes, size_t aLength) 1.54 + : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) 1.55 + { 1.56 + MOZ_ASSERT(aBytes[aLength] == '\0'); 1.57 + } 1.58 + 1.59 + Latin1CharsZ(unsigned char *aBytes, size_t aLength) 1.60 + : Base(aBytes, aLength) 1.61 + { 1.62 + MOZ_ASSERT(aBytes[aLength] == '\0'); 1.63 + } 1.64 + 1.65 + using Base::operator=; 1.66 + 1.67 + char *c_str() { return reinterpret_cast<char *>(get()); } 1.68 +}; 1.69 + 1.70 +class UTF8Chars : public mozilla::Range<unsigned char> 1.71 +{ 1.72 + typedef mozilla::Range<unsigned char> Base; 1.73 + 1.74 + public: 1.75 + UTF8Chars() : Base() {} 1.76 + UTF8Chars(char *aBytes, size_t aLength) 1.77 + : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) 1.78 + {} 1.79 + UTF8Chars(const char *aBytes, size_t aLength) 1.80 + : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength) 1.81 + {} 1.82 +}; 1.83 + 1.84 +/* 1.85 + * SpiderMonkey also deals directly with UTF-8 encoded text in some places. 1.86 + */ 1.87 +class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> 1.88 +{ 1.89 + typedef mozilla::RangedPtr<unsigned char> Base; 1.90 + 1.91 + public: 1.92 + UTF8CharsZ() : Base(nullptr, 0) {} 1.93 + 1.94 + UTF8CharsZ(char *aBytes, size_t aLength) 1.95 + : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) 1.96 + { 1.97 + MOZ_ASSERT(aBytes[aLength] == '\0'); 1.98 + } 1.99 + 1.100 + UTF8CharsZ(unsigned char *aBytes, size_t aLength) 1.101 + : Base(aBytes, aLength) 1.102 + { 1.103 + MOZ_ASSERT(aBytes[aLength] == '\0'); 1.104 + } 1.105 + 1.106 + using Base::operator=; 1.107 + 1.108 + char *c_str() { return reinterpret_cast<char *>(get()); } 1.109 +}; 1.110 + 1.111 +/* 1.112 + * SpiderMonkey uses a 2-byte character representation: it is a 1.113 + * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, 1.114 + * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a 1.115 + * sufficiently dedicated JavaScript program to be fully unicode-aware by 1.116 + * manually interpreting UTF-16 extension characters embedded in the JS 1.117 + * string. 1.118 + */ 1.119 +class TwoByteChars : public mozilla::Range<jschar> 1.120 +{ 1.121 + typedef mozilla::Range<jschar> Base; 1.122 + 1.123 + public: 1.124 + TwoByteChars() : Base() {} 1.125 + TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {} 1.126 + TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast<jschar *>(aChars), aLength) {} 1.127 +}; 1.128 + 1.129 +/* 1.130 + * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. 1.131 + */ 1.132 +class TwoByteCharsZ : public mozilla::RangedPtr<jschar> 1.133 +{ 1.134 + typedef mozilla::RangedPtr<jschar> Base; 1.135 + 1.136 + public: 1.137 + TwoByteCharsZ() : Base(nullptr, 0) {} 1.138 + 1.139 + TwoByteCharsZ(jschar *chars, size_t length) 1.140 + : Base(chars, length) 1.141 + { 1.142 + MOZ_ASSERT(chars[length] == '\0'); 1.143 + } 1.144 + 1.145 + using Base::operator=; 1.146 +}; 1.147 + 1.148 +typedef mozilla::RangedPtr<const jschar> ConstCharPtr; 1.149 + 1.150 +/* 1.151 + * Like TwoByteChars, but the chars are const. 1.152 + */ 1.153 +class ConstTwoByteChars : public mozilla::RangedPtr<const jschar> 1.154 +{ 1.155 + public: 1.156 + ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {} 1.157 + ConstTwoByteChars(const mozilla::RangedPtr<const jschar> &s) : ConstCharPtr(s) {} 1.158 + ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {} 1.159 + ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len) 1.160 + : ConstCharPtr(pos, start, len) 1.161 + {} 1.162 + 1.163 + using ConstCharPtr::operator=; 1.164 +}; 1.165 + 1.166 + 1.167 +/* 1.168 + * Convert a 2-byte character sequence to "ISO-Latin-1". This works by 1.169 + * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source 1.170 + * contains any UTF-16 extension characters, then this may give invalid Latin1 1.171 + * output. The returned string is zero terminated. The returned string or the 1.172 + * returned string's |start()| must be freed with JS_free or js_free, 1.173 + * respectively. If allocation fails, an OOM error will be set and the method 1.174 + * will return a nullptr chars (which can be tested for with the ! operator). 1.175 + * This method cannot trigger GC. 1.176 + */ 1.177 +extern Latin1CharsZ 1.178 +LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); 1.179 + 1.180 +extern UTF8CharsZ 1.181 +TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); 1.182 + 1.183 +uint32_t 1.184 +Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length); 1.185 + 1.186 +/* 1.187 + * Inflate bytes in UTF-8 encoding to jschars. 1.188 + * - On error, returns an empty TwoByteCharsZ. 1.189 + * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold 1.190 + * its length; the length value excludes the trailing null. 1.191 + */ 1.192 +extern TwoByteCharsZ 1.193 +UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); 1.194 + 1.195 +/* 1.196 + * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters 1.197 + * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8 1.198 + * input. 1.199 + */ 1.200 +extern TwoByteCharsZ 1.201 +LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); 1.202 + 1.203 +} // namespace JS 1.204 + 1.205 +inline void JS_free(JS::Latin1CharsZ &ptr) { js_free((void*)ptr.get()); } 1.206 +inline void JS_free(JS::UTF8CharsZ &ptr) { js_free((void*)ptr.get()); } 1.207 + 1.208 +#endif /* js_CharacterEncoding_h */