js/src/vm/Unicode.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
michael@0 2 * vim: set ts=8 sts=4 et sw=4 tw=99:
michael@0 3 * This Source Code Form is subject to the terms of the Mozilla Public
michael@0 4 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 6
michael@0 7 #ifndef vm_Unicode_h
michael@0 8 #define vm_Unicode_h
michael@0 9
michael@0 10 #include "jspubtd.h"
michael@0 11
michael@0 12 extern const bool js_isidstart[];
michael@0 13 extern const bool js_isident[];
michael@0 14 extern const bool js_isspace[];
michael@0 15
michael@0 16 namespace js {
michael@0 17 namespace unicode {
michael@0 18
michael@0 19 /*
michael@0 20 * This enum contains all the knowledge required to handle
michael@0 21 * Unicode in JavaScript.
michael@0 22 *
michael@0 23 * SPACE
michael@0 24 * Every character that is either in the ECMA-262 5th Edition
michael@0 25 * class WhiteSpace or LineTerminator.
michael@0 26 *
michael@0 27 * WhiteSpace
michael@0 28 * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
michael@0 29 * and every other Unicode character with the General Category "Zs".
michael@0 30 * In practice this is every character with the value "Zs" as the third
michael@0 31 * field (after the char code in hex, and the name) called General_Category
michael@0 32 * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
michael@0 33 * in the file UnicodeData.txt.
michael@0 34 *
michael@0 35 * LineTerminator
michael@0 36 * \u000A, \u000D, \u2028, \u2029
michael@0 37 *
michael@0 38 * LETTER
michael@0 39 * These are all characters included in UnicodeLetter from ECMA-262.
michael@0 40 * This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'
michael@0 41 *
michael@0 42 * IDENTIFIER_PART
michael@0 43 * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
michael@0 44 * Aka categories Mn/Mc, Nd, Pc
michael@0 45 * And <ZWNJ> and <ZWJ>.
michael@0 46 * Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
michael@0 47 * a matcher for the real IdentifierPart like this:
michael@0 48 *
michael@0 49 * if isEscapeSequence():
michael@0 50 * handleEscapeSequence()
michael@0 51 * return True
michael@0 52 * if char in ['$', '_']:
michael@0 53 * return True
michael@0 54 * if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
michael@0 55 * return True
michael@0 56 *
michael@0 57 */
michael@0 58
/*
 * Bit masks stored in CharacterInfo::flags. Each enumerator owns a distinct
 * bit, so several classes can be tested at once by OR-ing the masks together.
 * The struct only serves as a scope for the enumerators.
 */
struct CharFlag {
    enum temp {
        SPACE           = 0x01,  /* bit 0: WhiteSpace or LineTerminator */
        LETTER          = 0x02,  /* bit 1: UnicodeLetter */
        IDENTIFIER_PART = 0x04   /* bit 2: combining mark, digit, connector, ZWNJ/ZWJ */
    };
};
michael@0 66
michael@0 67 const jschar BYTE_ORDER_MARK2 = 0xFFFE;
michael@0 68 const jschar NO_BREAK_SPACE = 0x00A0;
michael@0 69
michael@0 70 class CharacterInfo {
michael@0 71 /*
michael@0 72 * upperCase and loweCase normally store the delta between two
michael@0 73 * letters. For example the lower case alpha (a) has the char code
michael@0 74 * 97, and the upper case alpha (A) has 65. So for "a" we would
michael@0 75 * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
michael@0 76 * because this char is already in lower case.
michael@0 77 * Well, not -32 exactly, but (2**16 - 32) to induce
michael@0 78 * unsigned overflow with identical mathematical behavior.
michael@0 79 * For upper case alpha, we would store 0 in upperCase and 32 in
michael@0 80 * lowerCase (65 + 32 = 97).
michael@0 81 *
michael@0 82 * We use deltas to reuse information for multiple characters. For
michael@0 83 * example the whole lower case latin alphabet fits into one entry,
michael@0 84 * because it's always a UnicodeLetter and upperCase contains
michael@0 85 * -32.
michael@0 86 */
michael@0 87 public:
michael@0 88 uint16_t upperCase;
michael@0 89 uint16_t lowerCase;
michael@0 90 uint8_t flags;
michael@0 91
michael@0 92 inline bool isSpace() const {
michael@0 93 return flags & CharFlag::SPACE;
michael@0 94 }
michael@0 95
michael@0 96 inline bool isLetter() const {
michael@0 97 return flags & CharFlag::LETTER;
michael@0 98 }
michael@0 99
michael@0 100 inline bool isIdentifierPart() const {
michael@0 101 return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER);
michael@0 102 }
michael@0 103 };
michael@0 104
michael@0 105 extern const uint8_t index1[];
michael@0 106 extern const uint8_t index2[];
michael@0 107 extern const CharacterInfo js_charinfo[];
michael@0 108
michael@0 109 inline const CharacterInfo&
michael@0 110 CharInfo(jschar code)
michael@0 111 {
michael@0 112 const size_t shift = 5;
michael@0 113 size_t index = index1[code >> shift];
michael@0 114 index = index2[(index << shift) + (code & ((1 << shift) - 1))];
michael@0 115
michael@0 116 return js_charinfo[index];
michael@0 117 }
michael@0 118
michael@0 119 inline bool
michael@0 120 IsIdentifierStart(jschar ch)
michael@0 121 {
michael@0 122 /*
michael@0 123 * ES5 7.6 IdentifierStart
michael@0 124 * $ (dollar sign)
michael@0 125 * _ (underscore)
michael@0 126 * or any UnicodeLetter.
michael@0 127 *
michael@0 128 * We use a lookup table for small and thus common characters for speed.
michael@0 129 */
michael@0 130
michael@0 131 if (ch < 128)
michael@0 132 return js_isidstart[ch];
michael@0 133
michael@0 134 return CharInfo(ch).isLetter();
michael@0 135 }
michael@0 136
michael@0 137 inline bool
michael@0 138 IsIdentifierPart(jschar ch)
michael@0 139 {
michael@0 140 /* Matches ES5 7.6 IdentifierPart. */
michael@0 141
michael@0 142 if (ch < 128)
michael@0 143 return js_isident[ch];
michael@0 144
michael@0 145 return CharInfo(ch).isIdentifierPart();
michael@0 146 }
michael@0 147
michael@0 148 inline bool
michael@0 149 IsLetter(jschar ch)
michael@0 150 {
michael@0 151 return CharInfo(ch).isLetter();
michael@0 152 }
michael@0 153
michael@0 154 inline bool
michael@0 155 IsSpace(jschar ch)
michael@0 156 {
michael@0 157 /*
michael@0 158 * IsSpace checks if some character is included in the merged set
michael@0 159 * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3.
michael@0 160 * We combined them, because in practice nearly every
michael@0 161 * calling function wants this, except some code in the tokenizer.
michael@0 162 *
michael@0 163 * We use a lookup table for ASCII-7 characters, because they are
michael@0 164 * very common and must be handled quickly in the tokenizer.
michael@0 165 * NO-BREAK SPACE is supposed to be the most common character not in
michael@0 166 * this range, so we inline this case, too.
michael@0 167 */
michael@0 168
michael@0 169 if (ch < 128)
michael@0 170 return js_isspace[ch];
michael@0 171
michael@0 172 if (ch == NO_BREAK_SPACE)
michael@0 173 return true;
michael@0 174
michael@0 175 return CharInfo(ch).isSpace();
michael@0 176 }
michael@0 177
michael@0 178 inline bool
michael@0 179 IsSpaceOrBOM2(jschar ch)
michael@0 180 {
michael@0 181 if (ch < 128)
michael@0 182 return js_isspace[ch];
michael@0 183
michael@0 184 /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */
michael@0 185 if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
michael@0 186 return true;
michael@0 187
michael@0 188 return CharInfo(ch).isSpace();
michael@0 189 }
michael@0 190
michael@0 191 inline jschar
michael@0 192 ToUpperCase(jschar ch)
michael@0 193 {
michael@0 194 const CharacterInfo &info = CharInfo(ch);
michael@0 195
michael@0 196 return uint16_t(ch) + info.upperCase;
michael@0 197 }
michael@0 198
michael@0 199 inline jschar
michael@0 200 ToLowerCase(jschar ch)
michael@0 201 {
michael@0 202 const CharacterInfo &info = CharInfo(ch);
michael@0 203
michael@0 204 return uint16_t(ch) + info.lowerCase;
michael@0 205 }
michael@0 206
michael@0 207 } /* namespace unicode */
michael@0 208 } /* namespace js */
michael@0 209
michael@0 210 #endif /* vm_Unicode_h */

mercurial