js/public/CharacterEncoding.h

Fri, 16 Jan 2015 18:13:44 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Fri, 16 Jan 2015 18:13:44 +0100
branch
TOR_BUG_9701
changeset 14
925c144e1f1f
permissions
-rw-r--r--

Integrate suggestion from review to improve consistency with existing code.

michael@0 1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
michael@0 2 * vim: set ts=8 sts=4 et sw=4 tw=99:
michael@0 3 * This Source Code Form is subject to the terms of the Mozilla Public
michael@0 4 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 6
michael@0 7 #ifndef js_CharacterEncoding_h
michael@0 8 #define js_CharacterEncoding_h
michael@0 9
michael@0 10 #include "mozilla/NullPtr.h"
michael@0 11 #include "mozilla/Range.h"
michael@0 12
michael@0 13 #include "js/TypeDecls.h"
michael@0 14 #include "js/Utility.h"
michael@0 15
michael@0 16 namespace js {
michael@0 17 struct ThreadSafeContext; // Forward declaration; this header only uses it through pointers.
michael@0 18 }
michael@0 19
michael@0 20 namespace JS {
michael@0 21
michael@0 22 /*
michael@0 23 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
michael@0 24 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
michael@0 25 * byte is treated as one 2-byte character, so there is no way to pass in a
michael@0 26 * string containing characters beyond U+00FF.
michael@0 27 */
michael@0 28 class Latin1Chars : public mozilla::Range<unsigned char>
michael@0 29 {
michael@0 30 typedef mozilla::Range<unsigned char> Base;
michael@0 31
michael@0 32 public:
michael@0 33 Latin1Chars() : Base() {} // Default-constructed Base.
michael@0 34 Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) {}
michael@0 35 Latin1Chars(const char *aBytes, size_t aLength) // Casts away const to fit Base's element type.
michael@0 36 : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)
michael@0 37 {}
michael@0 38 };
michael@0 39
michael@0 40 /*
michael@0 41 * A Latin1Chars, but with \0 termination (checked by MOZ_ASSERT) for C compatibility.
michael@0 42 */
michael@0 43 class Latin1CharsZ : public mozilla::RangedPtr<unsigned char>
michael@0 44 {
michael@0 45 typedef mozilla::RangedPtr<unsigned char> Base;
michael@0 46
michael@0 47 public:
michael@0 48 Latin1CharsZ() : Base(nullptr, 0) {} // Null pointer, zero length.
michael@0 49
michael@0 50 Latin1CharsZ(char *aBytes, size_t aLength)
michael@0 51 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
michael@0 52 {
michael@0 53 MOZ_ASSERT(aBytes[aLength] == '\0'); // Terminator sits at index aLength, so aLength excludes it.
michael@0 54 }
michael@0 55
michael@0 56 Latin1CharsZ(unsigned char *aBytes, size_t aLength)
michael@0 57 : Base(aBytes, aLength)
michael@0 58 {
michael@0 59 MOZ_ASSERT(aBytes[aLength] == '\0'); // Same terminator check as the char * overload.
michael@0 60 }
michael@0 61
michael@0 62 using Base::operator=; // Un-hide Base's assignment operators (this class's implicit operator= hides them).
michael@0 63
michael@0 64 char *c_str() { return reinterpret_cast<char *>(get()); } // get() (presumably Base's raw pointer) viewed as char *.
michael@0 65 };
michael@0 66
michael@0 67 class UTF8Chars : public mozilla::Range<unsigned char> // Byte range; UTF-8 content presumed from the name.
michael@0 68 {
michael@0 69 typedef mozilla::Range<unsigned char> Base;
michael@0 70
michael@0 71 public:
michael@0 72 UTF8Chars() : Base() {} // Default-constructed Base.
michael@0 73 UTF8Chars(char *aBytes, size_t aLength)
michael@0 74 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
michael@0 75 {}
michael@0 76 UTF8Chars(const char *aBytes, size_t aLength) // Casts away const to fit Base's element type.
michael@0 77 : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)
michael@0 78 {}
michael@0 79 };
michael@0 80
michael@0 81 /*
michael@0 82 * SpiderMonkey also deals directly with UTF-8 encoded text in some places; this holder is \0 terminated.
michael@0 83 */
michael@0 84 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
michael@0 85 {
michael@0 86 typedef mozilla::RangedPtr<unsigned char> Base;
michael@0 87
michael@0 88 public:
michael@0 89 UTF8CharsZ() : Base(nullptr, 0) {} // Null pointer, zero length.
michael@0 90
michael@0 91 UTF8CharsZ(char *aBytes, size_t aLength)
michael@0 92 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
michael@0 93 {
michael@0 94 MOZ_ASSERT(aBytes[aLength] == '\0'); // Terminator sits at index aLength, so aLength excludes it.
michael@0 95 }
michael@0 96
michael@0 97 UTF8CharsZ(unsigned char *aBytes, size_t aLength)
michael@0 98 : Base(aBytes, aLength)
michael@0 99 {
michael@0 100 MOZ_ASSERT(aBytes[aLength] == '\0'); // Same terminator check as the char * overload.
michael@0 101 }
michael@0 102
michael@0 103 using Base::operator=; // Un-hide Base's assignment operators (this class's implicit operator= hides them).
michael@0 104
michael@0 105 char *c_str() { return reinterpret_cast<char *>(get()); } // get() (presumably Base's raw pointer) viewed as char *.
michael@0 106 };
michael@0 107
michael@0 108 /*
michael@0 109 * SpiderMonkey uses a 2-byte character representation: it is a
michael@0 110 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
michael@0 111 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
michael@0 112 * sufficiently dedicated JavaScript program to be fully Unicode-aware by
michael@0 113 * manually interpreting UTF-16 extension characters embedded in the JS
michael@0 114 * string.
michael@0 115 */
michael@0 116 class TwoByteChars : public mozilla::Range<jschar>
michael@0 117 {
michael@0 118 typedef mozilla::Range<jschar> Base;
michael@0 119
michael@0 120 public:
michael@0 121 TwoByteChars() : Base() {} // Default-constructed Base.
michael@0 122 TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {}
michael@0 123 TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast<jschar *>(aChars), aLength) {} // const is cast away.
michael@0 124 };
michael@0 125
michael@0 126 /*
michael@0 127 * A TwoByteChars, but \0 terminated (checked by MOZ_ASSERT) for compatibility with JSFlatString.
michael@0 128 */
michael@0 129 class TwoByteCharsZ : public mozilla::RangedPtr<jschar>
michael@0 130 {
michael@0 131 typedef mozilla::RangedPtr<jschar> Base;
michael@0 132
michael@0 133 public:
michael@0 134 TwoByteCharsZ() : Base(nullptr, 0) {} // Null pointer, zero length.
michael@0 135
michael@0 136 TwoByteCharsZ(jschar *chars, size_t length)
michael@0 137 : Base(chars, length)
michael@0 138 {
michael@0 139 MOZ_ASSERT(chars[length] == '\0'); // Terminator sits at index length, so length excludes it.
michael@0 140 }
michael@0 141
michael@0 142 using Base::operator=; // Un-hide Base's assignment operators (this class's implicit operator= hides them).
michael@0 143 };
michael@0 144
michael@0 145 typedef mozilla::RangedPtr<const jschar> ConstCharPtr; // Alias for the base type that ConstTwoByteChars forwards to.
michael@0 146
michael@0 147 /*
michael@0 148 * Like TwoByteChars, but the chars are const (and the base is a RangedPtr, not a Range).
michael@0 149 */
michael@0 150 class ConstTwoByteChars : public mozilla::RangedPtr<const jschar>
michael@0 151 {
michael@0 152 public:
michael@0 153 ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {} // Copy constructor.
michael@0 154 ConstTwoByteChars(const mozilla::RangedPtr<const jschar> &s) : ConstCharPtr(s) {} // Implicit conversion from the base type.
michael@0 155 ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {}
michael@0 156 ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len) // Presumably |pos| points within |start|..|start + len|; confirm against RangedPtr.
michael@0 157 : ConstCharPtr(pos, start, len)
michael@0 158 {}
michael@0 159
michael@0 160 using ConstCharPtr::operator=; // Un-hide the base assignment operators (this class's implicit operator= hides them).
michael@0 161 };
michael@0 162
michael@0 163
michael@0 164 /*
michael@0 165 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
michael@0 166 * truncating each 2-byte character in the sequence to a single byte. If the source
michael@0 167 * contains any UTF-16 extension characters, then this may give invalid Latin1
michael@0 168 * output. The returned string is zero terminated. The returned string or the
michael@0 169 * returned string's |start()| must be freed with JS_free or js_free,
michael@0 170 * respectively. If allocation fails, an OOM error will be set and the function
michael@0 171 * will return a null Latin1CharsZ (which can be tested for with the ! operator).
michael@0 172 * This function cannot trigger GC.
michael@0 173 */
michael@0 174 extern Latin1CharsZ
michael@0 175 LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars);
michael@0 176
michael@0 177 extern UTF8CharsZ // NOTE(review): undocumented here; presumably the UTF-8 analogue of LossyTwoByteCharsToNewLatin1CharsZ - confirm.
michael@0 178 TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars);
michael@0 179
michael@0 180 uint32_t // NOTE(review): undocumented here; presumably decodes utf8Length bytes at utf8Buffer into one UCS-4 value - confirm.
michael@0 181 Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length);
michael@0 182
michael@0 183 /*
michael@0 184 * Inflate bytes in UTF-8 encoding to jschars.
michael@0 185 * - On error, returns an empty TwoByteCharsZ.
michael@0 186 * - On success, returns a malloc'd TwoByteCharsZ, and stores its length in
michael@0 187 * |*outlen|; the length value excludes the trailing null.
michael@0 188 */
michael@0 189 extern TwoByteCharsZ
michael@0 190 UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);
michael@0 191
michael@0 192 /*
michael@0 193 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 sequences
michael@0 194 * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
michael@0 195 * input.
michael@0 196 */
michael@0 197 extern TwoByteCharsZ
michael@0 198 LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);
michael@0 199
michael@0 200 } // namespace JS
michael@0 201
michael@0 202 inline void JS_free(JS::Latin1CharsZ &ptr) { js_free((void*)ptr.get()); } // Passes ptr.get() to js_free; |ptr| itself is not reset.
michael@0 203 inline void JS_free(JS::UTF8CharsZ &ptr) { js_free((void*)ptr.get()); } // Passes ptr.get() to js_free; |ptr| itself is not reset.
michael@0 204
michael@0 205 #endif /* js_CharacterEncoding_h */

mercurial