michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- michael@0: * vim: set ts=8 sts=4 et sw=4 tw=99: michael@0: * This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #ifndef js_CharacterEncoding_h michael@0: #define js_CharacterEncoding_h michael@0: michael@0: #include "mozilla/NullPtr.h" michael@0: #include "mozilla/Range.h" michael@0: michael@0: #include "js/TypeDecls.h" michael@0: #include "js/Utility.h" michael@0: michael@0: namespace js { michael@0: struct ThreadSafeContext; michael@0: } michael@0: michael@0: namespace JS { michael@0: michael@0: /* michael@0: * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI michael@0: * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each michael@0: * byte is treated as a 2-byte character, and there is no way to pass in a michael@0: * string containing characters beyond U+00FF. michael@0: */ michael@0: class Latin1Chars : public mozilla::Range michael@0: { michael@0: typedef mozilla::Range Base; michael@0: michael@0: public: michael@0: Latin1Chars() : Base() {} michael@0: Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) {} michael@0: Latin1Chars(const char *aBytes, size_t aLength) michael@0: : Base(reinterpret_cast(const_cast(aBytes)), aLength) michael@0: {} michael@0: }; michael@0: michael@0: /* michael@0: * A Latin1Chars, but with \0 termination for C compatibility. michael@0: */ michael@0: class Latin1CharsZ : public mozilla::RangedPtr michael@0: { michael@0: typedef mozilla::RangedPtr Base; michael@0: michael@0: public: michael@0: Latin1CharsZ() : Base(nullptr, 0) {} michael@0: michael@0: Latin1CharsZ(char *aBytes, size_t aLength) michael@0: : Base(reinterpret_cast(aBytes), aLength) michael@0: { michael@0: MOZ_ASSERT(aBytes[aLength] == '\0'); michael@0: } michael@0: michael@0: Latin1CharsZ(unsigned char *aBytes, size_t aLength) michael@0: : Base(aBytes, aLength) michael@0: { michael@0: MOZ_ASSERT(aBytes[aLength] == '\0'); michael@0: } michael@0: michael@0: using Base::operator=; michael@0: michael@0: char *c_str() { return reinterpret_cast(get()); } michael@0: }; michael@0: michael@0: class UTF8Chars : public mozilla::Range michael@0: { michael@0: typedef mozilla::Range Base; michael@0: michael@0: public: michael@0: UTF8Chars() : Base() {} michael@0: UTF8Chars(char *aBytes, size_t aLength) michael@0: : Base(reinterpret_cast(aBytes), aLength) michael@0: {} michael@0: UTF8Chars(const char *aBytes, size_t aLength) michael@0: : Base(reinterpret_cast(const_cast(aBytes)), aLength) michael@0: {} michael@0: }; michael@0: michael@0: /* michael@0: * SpiderMonkey also deals directly with UTF-8 encoded text in some places. michael@0: */ michael@0: class UTF8CharsZ : public mozilla::RangedPtr michael@0: { michael@0: typedef mozilla::RangedPtr Base; michael@0: michael@0: public: michael@0: UTF8CharsZ() : Base(nullptr, 0) {} michael@0: michael@0: UTF8CharsZ(char *aBytes, size_t aLength) michael@0: : Base(reinterpret_cast(aBytes), aLength) michael@0: { michael@0: MOZ_ASSERT(aBytes[aLength] == '\0'); michael@0: } michael@0: michael@0: UTF8CharsZ(unsigned char *aBytes, size_t aLength) michael@0: : Base(aBytes, aLength) michael@0: { michael@0: MOZ_ASSERT(aBytes[aLength] == '\0'); michael@0: } michael@0: michael@0: using Base::operator=; michael@0: michael@0: char *c_str() { return reinterpret_cast(get()); } michael@0: }; michael@0: michael@0: /* michael@0: * SpiderMonkey uses a 2-byte character representation: it is a michael@0: * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, michael@0: * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a michael@0: * sufficiently dedicated JavaScript program to be fully unicode-aware by michael@0: * manually interpreting UTF-16 extension characters embedded in the JS michael@0: * string. michael@0: */ michael@0: class TwoByteChars : public mozilla::Range michael@0: { michael@0: typedef mozilla::Range Base; michael@0: michael@0: public: michael@0: TwoByteChars() : Base() {} michael@0: TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {} michael@0: TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast(aChars), aLength) {} michael@0: }; michael@0: michael@0: /* michael@0: * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. michael@0: */ michael@0: class TwoByteCharsZ : public mozilla::RangedPtr michael@0: { michael@0: typedef mozilla::RangedPtr Base; michael@0: michael@0: public: michael@0: TwoByteCharsZ() : Base(nullptr, 0) {} michael@0: michael@0: TwoByteCharsZ(jschar *chars, size_t length) michael@0: : Base(chars, length) michael@0: { michael@0: MOZ_ASSERT(chars[length] == '\0'); michael@0: } michael@0: michael@0: using Base::operator=; michael@0: }; michael@0: michael@0: typedef mozilla::RangedPtr ConstCharPtr; michael@0: michael@0: /* michael@0: * Like TwoByteChars, but the chars are const. michael@0: */ michael@0: class ConstTwoByteChars : public mozilla::RangedPtr michael@0: { michael@0: public: michael@0: ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {} michael@0: ConstTwoByteChars(const mozilla::RangedPtr &s) : ConstCharPtr(s) {} michael@0: ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {} michael@0: ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len) michael@0: : ConstCharPtr(pos, start, len) michael@0: {} michael@0: michael@0: using ConstCharPtr::operator=; michael@0: }; michael@0: michael@0: michael@0: /* michael@0: * Convert a 2-byte character sequence to "ISO-Latin-1". This works by michael@0: * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source michael@0: * contains any UTF-16 extension characters, then this may give invalid Latin1 michael@0: * output. The returned string is zero terminated. The returned string or the michael@0: * returned string's |start()| must be freed with JS_free or js_free, michael@0: * respectively. If allocation fails, an OOM error will be set and the method michael@0: * will return a nullptr chars (which can be tested for with the ! operator). michael@0: * This method cannot trigger GC. michael@0: */ michael@0: extern Latin1CharsZ michael@0: LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); michael@0: michael@0: extern UTF8CharsZ michael@0: TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); michael@0: michael@0: uint32_t michael@0: Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length); michael@0: michael@0: /* michael@0: * Inflate bytes in UTF-8 encoding to jschars. michael@0: * - On error, returns an empty TwoByteCharsZ. michael@0: * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold michael@0: * its length; the length value excludes the trailing null. michael@0: */ michael@0: extern TwoByteCharsZ michael@0: UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); michael@0: michael@0: /* michael@0: * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters michael@0: * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8 michael@0: * input. michael@0: */ michael@0: extern TwoByteCharsZ michael@0: LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); michael@0: michael@0: } // namespace JS michael@0: michael@0: inline void JS_free(JS::Latin1CharsZ &ptr) { js_free((void*)ptr.get()); } michael@0: inline void JS_free(JS::UTF8CharsZ &ptr) { js_free((void*)ptr.get()); } michael@0: michael@0: #endif /* js_CharacterEncoding_h */