|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
|
2 * vim: set ts=8 sts=4 et sw=4 tw=99: |
|
3 * This Source Code Form is subject to the terms of the Mozilla Public |
|
4 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
6 |
|
7 #ifndef js_CharacterEncoding_h |
|
8 #define js_CharacterEncoding_h |
|
9 |
|
10 #include "mozilla/NullPtr.h" |
|
11 #include "mozilla/Range.h" |
|
12 |
|
13 #include "js/TypeDecls.h" |
|
14 #include "js/Utility.h" |
|
15 |
|
/* Forward declaration: this header only names js::ThreadSafeContext through pointers. */
namespace js { struct ThreadSafeContext; }
|
19 |
|
20 namespace JS { |
|
21 |
|
22 /* |
|
23 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI |
|
24 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each |
|
25 * byte is treated as a 2-byte character, and there is no way to pass in a |
|
26 * string containing characters beyond U+00FF. |
|
27 */ |
|
28 class Latin1Chars : public mozilla::Range<unsigned char> |
|
29 { |
|
30 typedef mozilla::Range<unsigned char> Base; |
|
31 |
|
32 public: |
|
33 Latin1Chars() : Base() {} |
|
34 Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) {} |
|
35 Latin1Chars(const char *aBytes, size_t aLength) |
|
36 : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength) |
|
37 {} |
|
38 }; |
|
39 |
|
40 /* |
|
41 * A Latin1Chars, but with \0 termination for C compatibility. |
|
42 */ |
|
43 class Latin1CharsZ : public mozilla::RangedPtr<unsigned char> |
|
44 { |
|
45 typedef mozilla::RangedPtr<unsigned char> Base; |
|
46 |
|
47 public: |
|
48 Latin1CharsZ() : Base(nullptr, 0) {} |
|
49 |
|
50 Latin1CharsZ(char *aBytes, size_t aLength) |
|
51 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) |
|
52 { |
|
53 MOZ_ASSERT(aBytes[aLength] == '\0'); |
|
54 } |
|
55 |
|
56 Latin1CharsZ(unsigned char *aBytes, size_t aLength) |
|
57 : Base(aBytes, aLength) |
|
58 { |
|
59 MOZ_ASSERT(aBytes[aLength] == '\0'); |
|
60 } |
|
61 |
|
62 using Base::operator=; |
|
63 |
|
64 char *c_str() { return reinterpret_cast<char *>(get()); } |
|
65 }; |
|
66 |
|
67 class UTF8Chars : public mozilla::Range<unsigned char> |
|
68 { |
|
69 typedef mozilla::Range<unsigned char> Base; |
|
70 |
|
71 public: |
|
72 UTF8Chars() : Base() {} |
|
73 UTF8Chars(char *aBytes, size_t aLength) |
|
74 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) |
|
75 {} |
|
76 UTF8Chars(const char *aBytes, size_t aLength) |
|
77 : Base(reinterpret_cast<unsigned char *>(const_cast<char *>(aBytes)), aLength) |
|
78 {} |
|
79 }; |
|
80 |
|
81 /* |
|
82 * SpiderMonkey also deals directly with UTF-8 encoded text in some places. |
|
83 */ |
|
84 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> |
|
85 { |
|
86 typedef mozilla::RangedPtr<unsigned char> Base; |
|
87 |
|
88 public: |
|
89 UTF8CharsZ() : Base(nullptr, 0) {} |
|
90 |
|
91 UTF8CharsZ(char *aBytes, size_t aLength) |
|
92 : Base(reinterpret_cast<unsigned char *>(aBytes), aLength) |
|
93 { |
|
94 MOZ_ASSERT(aBytes[aLength] == '\0'); |
|
95 } |
|
96 |
|
97 UTF8CharsZ(unsigned char *aBytes, size_t aLength) |
|
98 : Base(aBytes, aLength) |
|
99 { |
|
100 MOZ_ASSERT(aBytes[aLength] == '\0'); |
|
101 } |
|
102 |
|
103 using Base::operator=; |
|
104 |
|
105 char *c_str() { return reinterpret_cast<char *>(get()); } |
|
106 }; |
|
107 |
|
108 /* |
|
109 * SpiderMonkey uses a 2-byte character representation: it is a |
|
110 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, |
|
111 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a |
|
112 * sufficiently dedicated JavaScript program to be fully unicode-aware by |
|
113 * manually interpreting UTF-16 extension characters embedded in the JS |
|
114 * string. |
|
115 */ |
|
116 class TwoByteChars : public mozilla::Range<jschar> |
|
117 { |
|
118 typedef mozilla::Range<jschar> Base; |
|
119 |
|
120 public: |
|
121 TwoByteChars() : Base() {} |
|
122 TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {} |
|
123 TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast<jschar *>(aChars), aLength) {} |
|
124 }; |
|
125 |
|
126 /* |
|
127 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. |
|
128 */ |
|
129 class TwoByteCharsZ : public mozilla::RangedPtr<jschar> |
|
130 { |
|
131 typedef mozilla::RangedPtr<jschar> Base; |
|
132 |
|
133 public: |
|
134 TwoByteCharsZ() : Base(nullptr, 0) {} |
|
135 |
|
136 TwoByteCharsZ(jschar *chars, size_t length) |
|
137 : Base(chars, length) |
|
138 { |
|
139 MOZ_ASSERT(chars[length] == '\0'); |
|
140 } |
|
141 |
|
142 using Base::operator=; |
|
143 }; |
|
144 |
|
145 typedef mozilla::RangedPtr<const jschar> ConstCharPtr; |
|
146 |
|
147 /* |
|
148 * Like TwoByteChars, but the chars are const. |
|
149 */ |
|
150 class ConstTwoByteChars : public mozilla::RangedPtr<const jschar> |
|
151 { |
|
152 public: |
|
153 ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {} |
|
154 ConstTwoByteChars(const mozilla::RangedPtr<const jschar> &s) : ConstCharPtr(s) {} |
|
155 ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {} |
|
156 ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len) |
|
157 : ConstCharPtr(pos, start, len) |
|
158 {} |
|
159 |
|
160 using ConstCharPtr::operator=; |
|
161 }; |
|
162 |
|
163 |
|
164 /* |
|
165 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by |
|
166 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source |
|
167 * contains any UTF-16 extension characters, then this may give invalid Latin1 |
|
168 * output. The returned string is zero terminated. The returned string or the |
|
169 * returned string's |start()| must be freed with JS_free or js_free, |
|
170 * respectively. If allocation fails, an OOM error will be set and the method |
|
171 * will return a nullptr chars (which can be tested for with the ! operator). |
|
172 * This method cannot trigger GC. |
|
173 */ |
|
174 extern Latin1CharsZ |
|
175 LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); |
|
176 |
|
177 extern UTF8CharsZ |
|
178 TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars); |
|
179 |
|
/*
 * NOTE(review): no contract is documented at this declaration. Judging by the
 * name it presumably decodes the |utf8Length|-byte UTF-8 sequence at
 * |utf8Buffer| into one UCS-4 code point. Confirm input requirements and
 * error results against the definition.
 */
uint32_t Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length);
|
182 |
|
183 /* |
|
184 * Inflate bytes in UTF-8 encoding to jschars. |
|
185 * - On error, returns an empty TwoByteCharsZ. |
|
186 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold |
|
187 * its length; the length value excludes the trailing null. |
|
188 */ |
|
189 extern TwoByteCharsZ |
|
190 UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); |
|
191 |
|
192 /* |
|
193 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters |
|
194 * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8 |
|
195 * input. |
|
196 */ |
|
197 extern TwoByteCharsZ |
|
198 LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen); |
|
199 |
|
200 } // namespace JS |
|
201 |
|
202 inline void JS_free(JS::Latin1CharsZ &ptr) { js_free((void*)ptr.get()); } |
|
203 inline void JS_free(JS::UTF8CharsZ &ptr) { js_free((void*)ptr.get()); } |
|
204 |
|
205 #endif /* js_CharacterEncoding_h */ |