|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
|
2 * vim: set ts=8 sts=4 et sw=4 tw=99: |
|
3 * This Source Code Form is subject to the terms of the Mozilla Public |
|
4 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
6 |
|
7 #ifndef vm_Unicode_h |
|
8 #define vm_Unicode_h |
|
9 |
|
10 #include "jspubtd.h" |
|
11 |
|
/*
 * Lookup tables for the fast paths of the predicates in js::unicode. Each one
 * is only ever indexed there with a code unit below 128.
 */

/* Consulted by IsSpace and IsSpaceOrBOM2. */
extern const bool js_isspace[];

/* Consulted by IsIdentifierStart. */
extern const bool js_isidstart[];

/* Consulted by IsIdentifierPart. */
extern const bool js_isident[];
|
15 |
|
16 namespace js { |
|
17 namespace unicode { |
|
18 |
|
19 /* |
|
20 * This enum contains all the knowledge required to handle |
|
21 * Unicode in JavaScript. |
|
22 * |
|
23 * SPACE |
|
24 * Every character that is either in the ECMA-262 5th Edition |
|
25 * class WhiteSpace or LineTerminator. |
|
26 * |
|
27 * WhiteSpace |
|
28 * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF |
|
29 * and every other Unicode character with the General Category "Zs". |
|
30 * In practice this is every character with the value "Zs" as the third |
|
31 * field (after the char code in hex, and the name) called General_Category |
|
32 * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt) |
|
33 * in the file UnicodeData.txt. |
|
34 * |
|
35 * LineTerminator |
|
36 * \u000A, \u000D, \u2028, \u2029 |
|
37 * |
|
38 * LETTER |
|
39 * These are all characters included in UnicodeLetter from ECMA-262. |
|
40 * This includes the category 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl' |
|
41 * |
|
42 * IDENTIFIER_PART |
|
43 * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation. |
|
44 * Aka categories Mn/Mc, Nd, Pc |
|
45 * And <ZWNJ> and <ZWJ>. |
|
46 * Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build |
|
47 * a matcher for the real IdentifierPart like this: |
|
48 * |
|
49 * if isEscapeSequence(): |
|
50 * handleEscapeSequence() |
|
51 * return True |
|
52 * if char in ['$', '_']: |
|
53 * return True |
|
54 * if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER): |
|
55 * return True |
|
56 * |
|
57 */ |
|
58 |
|
/*
 * Bit flags tested against CharacterInfo::flags. Every enumerator owns a
 * distinct bit, so the values can be OR-ed together into a mask.
 */
struct CharFlag {
    enum temp {
        SPACE           = 0x01,  /* bit 0 */
        LETTER          = 0x02,  /* bit 1 */
        IDENTIFIER_PART = 0x04   /* bit 2 */
    };
};
|
66 |
|
67 const jschar BYTE_ORDER_MARK2 = 0xFFFE; |
|
68 const jschar NO_BREAK_SPACE = 0x00A0; |
|
69 |
|
70 class CharacterInfo { |
|
71 /* |
|
72 * upperCase and loweCase normally store the delta between two |
|
73 * letters. For example the lower case alpha (a) has the char code |
|
74 * 97, and the upper case alpha (A) has 65. So for "a" we would |
|
75 * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, |
|
76 * because this char is already in lower case. |
|
77 * Well, not -32 exactly, but (2**16 - 32) to induce |
|
78 * unsigned overflow with identical mathematical behavior. |
|
79 * For upper case alpha, we would store 0 in upperCase and 32 in |
|
80 * lowerCase (65 + 32 = 97). |
|
81 * |
|
82 * We use deltas to reuse information for multiple characters. For |
|
83 * example the whole lower case latin alphabet fits into one entry, |
|
84 * because it's always a UnicodeLetter and upperCase contains |
|
85 * -32. |
|
86 */ |
|
87 public: |
|
88 uint16_t upperCase; |
|
89 uint16_t lowerCase; |
|
90 uint8_t flags; |
|
91 |
|
92 inline bool isSpace() const { |
|
93 return flags & CharFlag::SPACE; |
|
94 } |
|
95 |
|
96 inline bool isLetter() const { |
|
97 return flags & CharFlag::LETTER; |
|
98 } |
|
99 |
|
100 inline bool isIdentifierPart() const { |
|
101 return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER); |
|
102 } |
|
103 }; |
|
104 |
|
105 extern const uint8_t index1[]; |
|
106 extern const uint8_t index2[]; |
|
107 extern const CharacterInfo js_charinfo[]; |
|
108 |
|
109 inline const CharacterInfo& |
|
110 CharInfo(jschar code) |
|
111 { |
|
112 const size_t shift = 5; |
|
113 size_t index = index1[code >> shift]; |
|
114 index = index2[(index << shift) + (code & ((1 << shift) - 1))]; |
|
115 |
|
116 return js_charinfo[index]; |
|
117 } |
|
118 |
|
119 inline bool |
|
120 IsIdentifierStart(jschar ch) |
|
121 { |
|
122 /* |
|
123 * ES5 7.6 IdentifierStart |
|
124 * $ (dollar sign) |
|
125 * _ (underscore) |
|
126 * or any UnicodeLetter. |
|
127 * |
|
128 * We use a lookup table for small and thus common characters for speed. |
|
129 */ |
|
130 |
|
131 if (ch < 128) |
|
132 return js_isidstart[ch]; |
|
133 |
|
134 return CharInfo(ch).isLetter(); |
|
135 } |
|
136 |
|
137 inline bool |
|
138 IsIdentifierPart(jschar ch) |
|
139 { |
|
140 /* Matches ES5 7.6 IdentifierPart. */ |
|
141 |
|
142 if (ch < 128) |
|
143 return js_isident[ch]; |
|
144 |
|
145 return CharInfo(ch).isIdentifierPart(); |
|
146 } |
|
147 |
|
148 inline bool |
|
149 IsLetter(jschar ch) |
|
150 { |
|
151 return CharInfo(ch).isLetter(); |
|
152 } |
|
153 |
|
154 inline bool |
|
155 IsSpace(jschar ch) |
|
156 { |
|
157 /* |
|
158 * IsSpace checks if some character is included in the merged set |
|
159 * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3. |
|
160 * We combined them, because in practice nearly every |
|
161 * calling function wants this, except some code in the tokenizer. |
|
162 * |
|
163 * We use a lookup table for ASCII-7 characters, because they are |
|
164 * very common and must be handled quickly in the tokenizer. |
|
165 * NO-BREAK SPACE is supposed to be the most common character not in |
|
166 * this range, so we inline this case, too. |
|
167 */ |
|
168 |
|
169 if (ch < 128) |
|
170 return js_isspace[ch]; |
|
171 |
|
172 if (ch == NO_BREAK_SPACE) |
|
173 return true; |
|
174 |
|
175 return CharInfo(ch).isSpace(); |
|
176 } |
|
177 |
|
178 inline bool |
|
179 IsSpaceOrBOM2(jschar ch) |
|
180 { |
|
181 if (ch < 128) |
|
182 return js_isspace[ch]; |
|
183 |
|
184 /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */ |
|
185 if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2) |
|
186 return true; |
|
187 |
|
188 return CharInfo(ch).isSpace(); |
|
189 } |
|
190 |
|
191 inline jschar |
|
192 ToUpperCase(jschar ch) |
|
193 { |
|
194 const CharacterInfo &info = CharInfo(ch); |
|
195 |
|
196 return uint16_t(ch) + info.upperCase; |
|
197 } |
|
198 |
|
199 inline jschar |
|
200 ToLowerCase(jschar ch) |
|
201 { |
|
202 const CharacterInfo &info = CharInfo(ch); |
|
203 |
|
204 return uint16_t(ch) + info.lowerCase; |
|
205 } |
|
206 |
|
207 } /* namespace unicode */ |
|
208 } /* namespace js */ |
|
209 |
|
210 #endif /* vm_Unicode_h */ |