1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/src/vm/Unicode.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,210 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99: 1.6 + * This Source Code Form is subject to the terms of the Mozilla Public 1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.9 + 1.10 +#ifndef vm_Unicode_h 1.11 +#define vm_Unicode_h 1.12 + 1.13 +#include "jspubtd.h" 1.14 + 1.15 +extern const bool js_isidstart[]; 1.16 +extern const bool js_isident[]; 1.17 +extern const bool js_isspace[]; 1.18 + 1.19 +namespace js { 1.20 +namespace unicode { 1.21 + 1.22 +/* 1.23 + * This enum contains the all the knowledge required to handle 1.24 + * Unicode in JavaScript. 1.25 + * 1.26 + * SPACE 1.27 + * Every character that is either in the ECMA-262 5th Edition 1.28 + * class WhiteSpace or LineTerminator. 1.29 + * 1.30 + * WhiteSpace 1.31 + * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF 1.32 + * and every other Unicode character with the General Category "Zs". 1.33 + * In pratice this is every character with the value "Zs" as the third 1.34 + * field (after the char code in hex, and the name) called General_Category 1.35 + * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt) 1.36 + * in the file UnicodeData.txt. 1.37 + * 1.38 + * LineTerminator 1.39 + * \u000A, \u000D, \u2028, \u2029 1.40 + * 1.41 + * LETTER 1.42 + * This are all characters included UnicodeLetter from ECMA-262. 1.43 + * This includes the category 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl' 1.44 + * 1.45 + * IDENTIFIER_PART 1.46 + * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation. 1.47 + * Aka categories Mn/Mc, Md, Nd, Pc 1.48 + * And <ZWNJ> and <ZWJ>. 1.49 + * Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build 1.50 + * a matcher for the real IdentifierPart like this: 1.51 + * 1.52 + * if isEscapeSequence(): 1.53 + * handleEscapeSequence() 1.54 + * return True 1.55 + * if char in ['$', '_']: 1.56 + * return True 1.57 + * if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER): 1.58 + * return True 1.59 + * 1.60 + */ 1.61 + 1.62 +struct CharFlag { 1.63 + enum temp { 1.64 + SPACE = 1 << 0, 1.65 + LETTER = 1 << 1, 1.66 + IDENTIFIER_PART = 1 << 2, 1.67 + }; 1.68 +}; 1.69 + 1.70 +const jschar BYTE_ORDER_MARK2 = 0xFFFE; 1.71 +const jschar NO_BREAK_SPACE = 0x00A0; 1.72 + 1.73 +class CharacterInfo { 1.74 + /* 1.75 + * upperCase and loweCase normally store the delta between two 1.76 + * letters. For example the lower case alpha (a) has the char code 1.77 + * 97, and the upper case alpha (A) has 65. So for "a" we would 1.78 + * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, 1.79 + * because this char is already in lower case. 1.80 + * Well, not -32 exactly, but (2**16 - 32) to induce 1.81 + * unsigned overflow with identical mathematical behavior. 1.82 + * For upper case alpha, we would store 0 in upperCase and 32 in 1.83 + * lowerCase (65 + 32 = 97). 1.84 + * 1.85 + * We use deltas to reuse information for multiple characters. For 1.86 + * example the whole lower case latin alphabet fits into one entry, 1.87 + * because it's always a UnicodeLetter and upperCase contains 1.88 + * -32. 1.89 + */ 1.90 + public: 1.91 + uint16_t upperCase; 1.92 + uint16_t lowerCase; 1.93 + uint8_t flags; 1.94 + 1.95 + inline bool isSpace() const { 1.96 + return flags & CharFlag::SPACE; 1.97 + } 1.98 + 1.99 + inline bool isLetter() const { 1.100 + return flags & CharFlag::LETTER; 1.101 + } 1.102 + 1.103 + inline bool isIdentifierPart() const { 1.104 + return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER); 1.105 + } 1.106 +}; 1.107 + 1.108 +extern const uint8_t index1[]; 1.109 +extern const uint8_t index2[]; 1.110 +extern const CharacterInfo js_charinfo[]; 1.111 + 1.112 +inline const CharacterInfo& 1.113 +CharInfo(jschar code) 1.114 +{ 1.115 + const size_t shift = 5; 1.116 + size_t index = index1[code >> shift]; 1.117 + index = index2[(index << shift) + (code & ((1 << shift) - 1))]; 1.118 + 1.119 + return js_charinfo[index]; 1.120 +} 1.121 + 1.122 +inline bool 1.123 +IsIdentifierStart(jschar ch) 1.124 +{ 1.125 + /* 1.126 + * ES5 7.6 IdentifierStart 1.127 + * $ (dollar sign) 1.128 + * _ (underscore) 1.129 + * or any UnicodeLetter. 1.130 + * 1.131 + * We use a lookup table for small and thus common characters for speed. 1.132 + */ 1.133 + 1.134 + if (ch < 128) 1.135 + return js_isidstart[ch]; 1.136 + 1.137 + return CharInfo(ch).isLetter(); 1.138 +} 1.139 + 1.140 +inline bool 1.141 +IsIdentifierPart(jschar ch) 1.142 +{ 1.143 + /* Matches ES5 7.6 IdentifierPart. */ 1.144 + 1.145 + if (ch < 128) 1.146 + return js_isident[ch]; 1.147 + 1.148 + return CharInfo(ch).isIdentifierPart(); 1.149 +} 1.150 + 1.151 +inline bool 1.152 +IsLetter(jschar ch) 1.153 +{ 1.154 + return CharInfo(ch).isLetter(); 1.155 +} 1.156 + 1.157 +inline bool 1.158 +IsSpace(jschar ch) 1.159 +{ 1.160 + /* 1.161 + * IsSpace checks if some character is included in the merged set 1.162 + * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3. 1.163 + * We combined them, because in practice nearly every 1.164 + * calling function wants this, except some code in the tokenizer. 1.165 + * 1.166 + * We use a lookup table for ASCII-7 characters, because they are 1.167 + * very common and must be handled quickly in the tokenizer. 1.168 + * NO-BREAK SPACE is supposed to be the most common character not in 1.169 + * this range, so we inline this case, too. 1.170 + */ 1.171 + 1.172 + if (ch < 128) 1.173 + return js_isspace[ch]; 1.174 + 1.175 + if (ch == NO_BREAK_SPACE) 1.176 + return true; 1.177 + 1.178 + return CharInfo(ch).isSpace(); 1.179 +} 1.180 + 1.181 +inline bool 1.182 +IsSpaceOrBOM2(jschar ch) 1.183 +{ 1.184 + if (ch < 128) 1.185 + return js_isspace[ch]; 1.186 + 1.187 + /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */ 1.188 + if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2) 1.189 + return true; 1.190 + 1.191 + return CharInfo(ch).isSpace(); 1.192 +} 1.193 + 1.194 +inline jschar 1.195 +ToUpperCase(jschar ch) 1.196 +{ 1.197 + const CharacterInfo &info = CharInfo(ch); 1.198 + 1.199 + return uint16_t(ch) + info.upperCase; 1.200 +} 1.201 + 1.202 +inline jschar 1.203 +ToLowerCase(jschar ch) 1.204 +{ 1.205 + const CharacterInfo &info = CharInfo(ch); 1.206 + 1.207 + return uint16_t(ch) + info.lowerCase; 1.208 +} 1.209 + 1.210 +} /* namespace unicode */ 1.211 +} /* namespace js */ 1.212 + 1.213 +#endif /* vm_Unicode_h */