js/src/vm/Unicode.h

branch
TOR_BUG_3246
changeset 7
129ffea94266
equal deleted inserted replaced
-1:000000000000 0:9a11e3e0708d
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set ts=8 sts=4 et sw=4 tw=99:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #ifndef vm_Unicode_h
8 #define vm_Unicode_h
9
10 #include "jspubtd.h"
11
12 extern const bool js_isidstart[];
13 extern const bool js_isident[];
14 extern const bool js_isspace[];
15
16 namespace js {
17 namespace unicode {
18
19 /*
20 * This enum contains all the knowledge required to handle
21 * Unicode in JavaScript.
22 *
23 * SPACE
24 * Every character that is either in the ECMA-262 5th Edition
25 * class WhiteSpace or LineTerminator.
26 *
27 * WhiteSpace
28 * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
29 * and every other Unicode character with the General Category "Zs".
30 * In practice this is every character with the value "Zs" as the third
31 * field (after the char code in hex, and the name) called General_Category
32 * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
33 * in the file UnicodeData.txt.
34 *
35 * LineTerminator
36 * \u000A, \u000D, \u2028, \u2029
37 *
38 * LETTER
39 * These are all characters included in UnicodeLetter from ECMA-262.
40 * This includes the category 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'
41 *
42 * IDENTIFIER_PART
43 * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
44 * Aka categories Mn/Mc, Nd, Pc
45 * And <ZWNJ> and <ZWJ>.
46 * Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
47 * a matcher for the real IdentifierPart like this:
48 *
49 * if isEscapeSequence():
50 * handleEscapeSequence()
51 * return True
52 * if char in ['$', '_']:
53 * return True
54 * if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
55 * return True
56 *
57 */
58
/*
 * Bit flags tested against CharacterInfo::flags. Every enumerator occupies
 * its own bit, so one flags value can carry several properties at once.
 */
struct CharFlag {
    enum temp {
        SPACE           = 0x1,
        LETTER          = 0x2,
        IDENTIFIER_PART = 0x4
    };
};
66
67 const jschar BYTE_ORDER_MARK2 = 0xFFFE;
68 const jschar NO_BREAK_SPACE = 0x00A0;
69
70 class CharacterInfo {
71 /*
72 * upperCase and loweCase normally store the delta between two
73 * letters. For example the lower case alpha (a) has the char code
74 * 97, and the upper case alpha (A) has 65. So for "a" we would
75 * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
76 * because this char is already in lower case.
77 * Well, not -32 exactly, but (2**16 - 32) to induce
78 * unsigned overflow with identical mathematical behavior.
79 * For upper case alpha, we would store 0 in upperCase and 32 in
80 * lowerCase (65 + 32 = 97).
81 *
82 * We use deltas to reuse information for multiple characters. For
83 * example the whole lower case latin alphabet fits into one entry,
84 * because it's always a UnicodeLetter and upperCase contains
85 * -32.
86 */
87 public:
88 uint16_t upperCase;
89 uint16_t lowerCase;
90 uint8_t flags;
91
92 inline bool isSpace() const {
93 return flags & CharFlag::SPACE;
94 }
95
96 inline bool isLetter() const {
97 return flags & CharFlag::LETTER;
98 }
99
100 inline bool isIdentifierPart() const {
101 return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER);
102 }
103 };
104
105 extern const uint8_t index1[];
106 extern const uint8_t index2[];
107 extern const CharacterInfo js_charinfo[];
108
109 inline const CharacterInfo&
110 CharInfo(jschar code)
111 {
112 const size_t shift = 5;
113 size_t index = index1[code >> shift];
114 index = index2[(index << shift) + (code & ((1 << shift) - 1))];
115
116 return js_charinfo[index];
117 }
118
119 inline bool
120 IsIdentifierStart(jschar ch)
121 {
122 /*
123 * ES5 7.6 IdentifierStart
124 * $ (dollar sign)
125 * _ (underscore)
126 * or any UnicodeLetter.
127 *
128 * We use a lookup table for small and thus common characters for speed.
129 */
130
131 if (ch < 128)
132 return js_isidstart[ch];
133
134 return CharInfo(ch).isLetter();
135 }
136
137 inline bool
138 IsIdentifierPart(jschar ch)
139 {
140 /* Matches ES5 7.6 IdentifierPart. */
141
142 if (ch < 128)
143 return js_isident[ch];
144
145 return CharInfo(ch).isIdentifierPart();
146 }
147
148 inline bool
149 IsLetter(jschar ch)
150 {
151 return CharInfo(ch).isLetter();
152 }
153
154 inline bool
155 IsSpace(jschar ch)
156 {
157 /*
158 * IsSpace checks if some character is included in the merged set
159 * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3.
160 * We combined them, because in practice nearly every
161 * calling function wants this, except some code in the tokenizer.
162 *
163 * We use a lookup table for ASCII-7 characters, because they are
164 * very common and must be handled quickly in the tokenizer.
165 * NO-BREAK SPACE is supposed to be the most common character not in
166 * this range, so we inline this case, too.
167 */
168
169 if (ch < 128)
170 return js_isspace[ch];
171
172 if (ch == NO_BREAK_SPACE)
173 return true;
174
175 return CharInfo(ch).isSpace();
176 }
177
178 inline bool
179 IsSpaceOrBOM2(jschar ch)
180 {
181 if (ch < 128)
182 return js_isspace[ch];
183
184 /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */
185 if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
186 return true;
187
188 return CharInfo(ch).isSpace();
189 }
190
191 inline jschar
192 ToUpperCase(jschar ch)
193 {
194 const CharacterInfo &info = CharInfo(ch);
195
196 return uint16_t(ch) + info.upperCase;
197 }
198
199 inline jschar
200 ToLowerCase(jschar ch)
201 {
202 const CharacterInfo &info = CharInfo(ch);
203
204 return uint16_t(ch) + info.lowerCase;
205 }
206
207 } /* namespace unicode */
208 } /* namespace js */
209
210 #endif /* vm_Unicode_h */

mercurial