Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
michael@0 | 2 | * vim: set ts=8 sts=4 et sw=4 tw=99: |
michael@0 | 3 | * This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 6 | |
michael@0 | 7 | #ifndef vm_Unicode_h |
michael@0 | 8 | #define vm_Unicode_h |
michael@0 | 9 | |
michael@0 | 10 | #include "jspubtd.h" |
michael@0 | 11 | |
michael@0 | 12 | extern const bool js_isidstart[]; /* IsIdentifierStart() reads this for code units below 128 */ |
michael@0 | 13 | extern const bool js_isident[]; /* IsIdentifierPart() reads this for code units below 128 */ |
michael@0 | 14 | extern const bool js_isspace[]; /* IsSpace() and IsSpaceOrBOM2() read this for code units below 128 */ |
michael@0 | 15 | |
michael@0 | 16 | namespace js { |
michael@0 | 17 | namespace unicode { |
michael@0 | 18 | |
michael@0 | 19 | /* |
michael@0 | 20 | * This enum contains all the knowledge required to handle |
michael@0 | 21 | * Unicode in JavaScript. |
michael@0 | 22 | * |
michael@0 | 23 | * SPACE |
michael@0 | 24 | * Every character that is either in the ECMA-262 5th Edition |
michael@0 | 25 | * class WhiteSpace or LineTerminator. |
michael@0 | 26 | * |
michael@0 | 27 | * WhiteSpace |
michael@0 | 28 | * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF |
michael@0 | 29 | * and every other Unicode character with the General Category "Zs". |
michael@0 | 30 | * In practice this is every character with the value "Zs" as the third |
michael@0 | 31 | * field (after the char code in hex, and the name) called General_Category |
michael@0 | 32 | * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt) |
michael@0 | 33 | * in the file UnicodeData.txt. |
michael@0 | 34 | * |
michael@0 | 35 | * LineTerminator |
michael@0 | 36 | * \u000A, \u000D, \u2028, \u2029 |
michael@0 | 37 | * |
michael@0 | 38 | * LETTER |
michael@0 | 39 | * These are all characters included in UnicodeLetter from ECMA-262. |
michael@0 | 40 | * This includes the category 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl' |
michael@0 | 41 | * |
michael@0 | 42 | * IDENTIFIER_PART |
michael@0 | 43 | * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation. |
michael@0 | 44 | * Aka categories Mn/Mc, Nd, Pc |
michael@0 | 45 | * And <ZWNJ> and <ZWJ>. |
michael@0 | 46 | * Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build |
michael@0 | 47 | * a matcher for the real IdentifierPart like this: |
michael@0 | 48 | * |
michael@0 | 49 | * if isEscapeSequence(): |
michael@0 | 50 | * handleEscapeSequence() |
michael@0 | 51 | * return True |
michael@0 | 52 | * if char in ['$', '_']: |
michael@0 | 53 | * return True |
michael@0 | 54 | * if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER): |
michael@0 | 55 | * return True |
michael@0 | 56 | * |
michael@0 | 57 | */ |
michael@0 | 58 | |
michael@0 | 59 | struct CharFlag { /* Scope for the bit values tested against CharacterInfo::flags. */ |
michael@0 | 60 | enum temp { |
michael@0 | 61 | SPACE = 1 << 0, /* tested by CharacterInfo::isSpace() */ |
michael@0 | 62 | LETTER = 1 << 1, /* tested by CharacterInfo::isLetter() and isIdentifierPart() */ |
michael@0 | 63 | IDENTIFIER_PART = 1 << 2, /* tested, together with LETTER, by CharacterInfo::isIdentifierPart() */ |
michael@0 | 64 | }; |
michael@0 | 65 | }; |
michael@0 | 66 | |
michael@0 | 67 | const jschar BYTE_ORDER_MARK2 = 0xFFFE; /* accepted as whitespace by IsSpaceOrBOM2(); IsSpace() does not special-case it */ |
michael@0 | 68 | const jschar NO_BREAK_SPACE = 0x00A0; /* short-circuited before the table lookup in IsSpace() and IsSpaceOrBOM2() */ |
michael@0 | 69 | |
michael@0 | 70 | class CharacterInfo { /* One entry of js_charinfo: case-mapping deltas plus CharFlag bits. */ |
michael@0 | 71 | /* |
michael@0 | 72 | * upperCase and lowerCase normally store the delta between two |
michael@0 | 73 | * letters. For example the lower case alpha (a) has the char code |
michael@0 | 74 | * 97, and the upper case alpha (A) has 65. So for "a" we would |
michael@0 | 75 | * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, |
michael@0 | 76 | * because this char is already in lower case. |
michael@0 | 77 | * Well, not -32 exactly, but (2**16 - 32) to induce |
michael@0 | 78 | * unsigned overflow with identical mathematical behavior. |
michael@0 | 79 | * For upper case alpha, we would store 0 in upperCase and 32 in |
michael@0 | 80 | * lowerCase (65 + 32 = 97). |
michael@0 | 81 | * |
michael@0 | 82 | * We use deltas to reuse information for multiple characters. For |
michael@0 | 83 | * example the whole lower case latin alphabet fits into one entry, |
michael@0 | 84 | * because it's always a UnicodeLetter and upperCase contains |
michael@0 | 85 | * -32. |
michael@0 | 86 | */ |
michael@0 | 87 | public: |
michael@0 | 88 | uint16_t upperCase; /* delta that ToUpperCase() adds to the code unit */ |
michael@0 | 89 | uint16_t lowerCase; /* delta that ToLowerCase() adds to the code unit */ |
michael@0 | 90 | uint8_t flags; /* tested against the CharFlag bit values below */ |
michael@0 | 91 | |
michael@0 | 92 | inline bool isSpace() const { /* true when the SPACE bit is set */ |
michael@0 | 93 | return flags & CharFlag::SPACE; |
michael@0 | 94 | } |
michael@0 | 95 | |
michael@0 | 96 | inline bool isLetter() const { /* true when the LETTER bit is set */ |
michael@0 | 97 | return flags & CharFlag::LETTER; |
michael@0 | 98 | } |
michael@0 | 99 | |
michael@0 | 100 | inline bool isIdentifierPart() const { /* true when IDENTIFIER_PART or LETTER is set */ |
michael@0 | 101 | return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER); |
michael@0 | 102 | } |
michael@0 | 103 | }; |
michael@0 | 104 | |
michael@0 | 105 | extern const uint8_t index1[]; /* first-level table: CharInfo() indexes it with (code >> 5) */ |
michael@0 | 106 | extern const uint8_t index2[]; /* second-level table: indexed with (index1 entry << 5) + low 5 bits of code */ |
michael@0 | 107 | extern const CharacterInfo js_charinfo[]; /* indexed with the value read from index2 */ |
michael@0 | 108 | |
michael@0 | 109 | inline const CharacterInfo& |
michael@0 | 110 | CharInfo(jschar code) /* Returns the js_charinfo entry for code via the two-level index1/index2 lookup. */ |
michael@0 | 111 | { |
michael@0 | 112 | const size_t shift = 5; /* the low 5 bits of code select a slot inside a 32-entry group of index2 */ |
michael@0 | 113 | size_t index = index1[code >> shift]; /* group number chosen by the remaining high bits */ |
michael@0 | 114 | index = index2[(index << shift) + (code & ((1 << shift) - 1))]; /* js_charinfo slot for this code unit */ |
michael@0 | 115 | |
michael@0 | 116 | return js_charinfo[index]; |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | inline bool |
michael@0 | 120 | IsIdentifierStart(jschar ch) |
michael@0 | 121 | { |
michael@0 | 122 | /* |
michael@0 | 123 | * ES5 7.6 IdentifierStart |
michael@0 | 124 | * $ (dollar sign) |
michael@0 | 125 | * _ (underscore) |
michael@0 | 126 | * or any UnicodeLetter. |
michael@0 | 127 | * |
michael@0 | 128 | * We use a lookup table for small and thus common characters for speed. |
michael@0 | 129 | */ |
michael@0 | 130 | |
michael@0 | 131 | if (ch < 128) |
michael@0 | 132 | return js_isidstart[ch]; /* code units below 128 are answered by the table alone */ |
michael@0 | 133 | |
michael@0 | 134 | return CharInfo(ch).isLetter(); /* from 128 upwards only the LETTER flag is consulted */ |
michael@0 | 135 | } |
michael@0 | 136 | |
michael@0 | 137 | inline bool |
michael@0 | 138 | IsIdentifierPart(jschar ch) |
michael@0 | 139 | { |
michael@0 | 140 | /* Matches ES5 7.6 IdentifierPart. */ |
michael@0 | 141 | |
michael@0 | 142 | if (ch < 128) |
michael@0 | 143 | return js_isident[ch]; /* code units below 128 are answered by the table alone */ |
michael@0 | 144 | |
michael@0 | 145 | return CharInfo(ch).isIdentifierPart(); /* from 128 upwards: IDENTIFIER_PART or LETTER flag */ |
michael@0 | 146 | } |
michael@0 | 147 | |
michael@0 | 148 | inline bool |
michael@0 | 149 | IsLetter(jschar ch) /* Reports whether the LETTER flag is set for ch. */ |
michael@0 | 150 | { |
michael@0 | 151 | return CharInfo(ch).isLetter(); /* always goes through CharInfo(); there is no below-128 shortcut here */ |
michael@0 | 152 | } |
michael@0 | 153 | |
michael@0 | 154 | inline bool |
michael@0 | 155 | IsSpace(jschar ch) |
michael@0 | 156 | { |
michael@0 | 157 | /* |
michael@0 | 158 | * IsSpace checks if some character is included in the merged set |
michael@0 | 159 | * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3. |
michael@0 | 160 | * We combined them, because in practice nearly every |
michael@0 | 161 | * calling function wants this, except some code in the tokenizer. |
michael@0 | 162 | * |
michael@0 | 163 | * We use a lookup table for ASCII-7 characters, because they are |
michael@0 | 164 | * very common and must be handled quickly in the tokenizer. |
michael@0 | 165 | * NO-BREAK SPACE is supposed to be the most common character not in |
michael@0 | 166 | * this range, so we inline this case, too. |
michael@0 | 167 | */ |
michael@0 | 168 | |
michael@0 | 169 | if (ch < 128) |
michael@0 | 170 | return js_isspace[ch]; /* code units below 128 are answered by the table alone */ |
michael@0 | 171 | |
michael@0 | 172 | if (ch == NO_BREAK_SPACE) |
michael@0 | 173 | return true; /* answered without touching index1/index2 */ |
michael@0 | 174 | |
michael@0 | 175 | return CharInfo(ch).isSpace(); /* everything else: SPACE flag from js_charinfo */ |
michael@0 | 176 | } |
michael@0 | 177 | |
michael@0 | 178 | inline bool |
michael@0 | 179 | IsSpaceOrBOM2(jschar ch) /* Same checks as IsSpace(), but BYTE_ORDER_MARK2 also yields true. */ |
michael@0 | 180 | { |
michael@0 | 181 | if (ch < 128) |
michael@0 | 182 | return js_isspace[ch]; /* code units below 128 are answered by the table alone */ |
michael@0 | 183 | |
michael@0 | 184 | /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */ |
michael@0 | 185 | if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2) |
michael@0 | 186 | return true; |
michael@0 | 187 | |
michael@0 | 188 | return CharInfo(ch).isSpace(); /* everything else: SPACE flag from js_charinfo */ |
michael@0 | 189 | } |
michael@0 | 190 | |
michael@0 | 191 | inline jschar |
michael@0 | 192 | ToUpperCase(jschar ch) /* Returns ch plus the upperCase delta stored in its js_charinfo entry. */ |
michael@0 | 193 | { |
michael@0 | 194 | const CharacterInfo &info = CharInfo(ch); |
michael@0 | 195 | |
michael@0 | 196 | return uint16_t(ch) + info.upperCase; /* a "negative" delta is stored as 2**16 - n; presumably jschar is 16 bits wide so the sum wraps back into range - TODO confirm against the jschar typedef */ |
michael@0 | 197 | } |
michael@0 | 198 | |
michael@0 | 199 | inline jschar |
michael@0 | 200 | ToLowerCase(jschar ch) /* Returns ch plus the lowerCase delta stored in its js_charinfo entry. */ |
michael@0 | 201 | { |
michael@0 | 202 | const CharacterInfo &info = CharInfo(ch); |
michael@0 | 203 | |
michael@0 | 204 | return uint16_t(ch) + info.lowerCase; /* the sum is narrowed to jschar on return; presumably jschar is 16 bits wide - TODO confirm against the jschar typedef */ |
michael@0 | 205 | } |
michael@0 | 206 | |
michael@0 | 207 | } /* namespace unicode */ |
michael@0 | 208 | } /* namespace js */ |
michael@0 | 209 | |
michael@0 | 210 | #endif /* vm_Unicode_h */ |