Fri, 16 Jan 2015 18:13:44 +0100
Integrate suggestion from review to improve consistency with existing code.
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set ts=8 sts=4 et sw=4 tw=99:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef js_CharacterEncoding_h
8 #define js_CharacterEncoding_h
10 #include "mozilla/NullPtr.h"
11 #include "mozilla/Range.h"
13 #include "js/TypeDecls.h"
14 #include "js/Utility.h"
// Forward declaration only: the declarations below mention
// js::ThreadSafeContext solely as a pointer parameter type.
namespace js { struct ThreadSafeContext; }
20 namespace JS {
22 /*
23 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
24 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
25 * byte is treated as a 2-byte character, and there is no way to pass in a
26 * string containing characters beyond U+00FF.
27 */
28 class Latin1Chars : public mozilla::Range<unsigned char>
29 {
30 typedef mozilla::Range<unsigned char> Base;
32 public:
33 Latin1Chars() : Base() {}
34 Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) {}
35 Latin1Chars(const char *aBytes, size_t aLength)
36 : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)
37 {}
38 };
40 /*
41 * A Latin1Chars, but with \0 termination for C compatibility.
42 */
43 class Latin1CharsZ : public mozilla::RangedPtr<unsigned char>
44 {
45 typedef mozilla::RangedPtr<unsigned char> Base;
47 public:
48 Latin1CharsZ() : Base(nullptr, 0) {}
50 Latin1CharsZ(char *aBytes, size_t aLength)
51 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
52 {
53 MOZ_ASSERT(aBytes[aLength] == '\0');
54 }
56 Latin1CharsZ(unsigned char *aBytes, size_t aLength)
57 : Base(aBytes, aLength)
58 {
59 MOZ_ASSERT(aBytes[aLength] == '\0');
60 }
62 using Base::operator=;
64 char *c_str() { return reinterpret_cast<char *>(get()); }
65 };
67 class UTF8Chars : public mozilla::Range<unsigned char>
68 {
69 typedef mozilla::Range<unsigned char> Base;
71 public:
72 UTF8Chars() : Base() {}
73 UTF8Chars(char *aBytes, size_t aLength)
74 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
75 {}
76 UTF8Chars(const char *aBytes, size_t aLength)
77 : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength)
78 {}
79 };
81 /*
82 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
83 */
84 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
85 {
86 typedef mozilla::RangedPtr<unsigned char> Base;
88 public:
89 UTF8CharsZ() : Base(nullptr, 0) {}
91 UTF8CharsZ(char *aBytes, size_t aLength)
92 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
93 {
94 MOZ_ASSERT(aBytes[aLength] == '\0');
95 }
97 UTF8CharsZ(unsigned char *aBytes, size_t aLength)
98 : Base(aBytes, aLength)
99 {
100 MOZ_ASSERT(aBytes[aLength] == '\0');
101 }
103 using Base::operator=;
105 char *c_str() { return reinterpret_cast<char *>(get()); }
106 };
108 /*
109 * SpiderMonkey uses a 2-byte character representation: it is a
110 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
111 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
112 * sufficiently dedicated JavaScript program to be fully unicode-aware by
113 * manually interpreting UTF-16 extension characters embedded in the JS
114 * string.
115 */
116 class TwoByteChars : public mozilla::Range<jschar>
117 {
118 typedef mozilla::Range<jschar> Base;
120 public:
121 TwoByteChars() : Base() {}
122 TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {}
123 TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast<jschar *>(aChars), aLength) {}
124 };
126 /*
127 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
128 */
129 class TwoByteCharsZ : public mozilla::RangedPtr<jschar>
130 {
131 typedef mozilla::RangedPtr<jschar> Base;
133 public:
134 TwoByteCharsZ() : Base(nullptr, 0) {}
136 TwoByteCharsZ(jschar *chars, size_t length)
137 : Base(chars, length)
138 {
139 MOZ_ASSERT(chars[length] == '\0');
140 }
142 using Base::operator=;
143 };
145 typedef mozilla::RangedPtr<const jschar> ConstCharPtr;
147 /*
148 * Like TwoByteChars, but the chars are const.
149 */
150 class ConstTwoByteChars : public mozilla::RangedPtr<const jschar>
151 {
152 public:
153 ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {}
154 ConstTwoByteChars(const mozilla::RangedPtr<const jschar> &s) : ConstCharPtr(s) {}
155 ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {}
156 ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len)
157 : ConstCharPtr(pos, start, len)
158 {}
160 using ConstCharPtr::operator=;
161 };
164 /*
165 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
166 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
167 * contains any UTF-16 extension characters, then this may give invalid Latin1
168 * output. The returned string is zero terminated. The returned string or the
169 * returned string's |start()| must be freed with JS_free or js_free,
170 * respectively. If allocation fails, an OOM error will be set and the method
171 * will return a nullptr chars (which can be tested for with the ! operator).
172 * This method cannot trigger GC.
173 */
174 extern Latin1CharsZ
175 LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars);
177 extern UTF8CharsZ
178 TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars);
/*
 * Takes a buffer |utf8Buffer| together with its length |utf8Length| and
 * yields a single 32-bit value.
 *
 * NOTE(review): judging by the name, this decodes one UTF-8 sequence into one
 * UCS-4 code point; the handling of malformed input is not specified here --
 * confirm against the implementation.
 */
uint32_t Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length);
183 /*
184 * Inflate bytes in UTF-8 encoding to jschars.
185 * - On error, returns an empty TwoByteCharsZ.
186 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
187 * its length; the length value excludes the trailing null.
188 */
189 extern TwoByteCharsZ
190 UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);
192 /*
193 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
194 * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
195 * input.
196 */
197 extern TwoByteCharsZ
198 LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);
200 } // namespace JS
202 inline void JS_free(JS::Latin1CharsZ &ptr) { js_free((void*)ptr.get()); }
203 inline void JS_free(JS::UTF8CharsZ &ptr) { js_free((void*)ptr.get()); }
205 #endif /* js_CharacterEncoding_h */