The Tor Browser: comparison js/src/vm/Unicode.h

--1:000000000000
+:9a11e3e0708d
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+* vim: set ts=8 sts=4 et sw=4 tw=99:
+* This Source Code Form is subject to the terms of the Mozilla Public
+* License, v. 2.0. If a copy of the MPL was not distributed with this
+* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef vm_Unicode_h
+#define vm_Unicode_h
+#include "jspubtd.h"
+extern const bool js_isidstart[]; /* indexed by ch in IsIdentifierStart() when ch < 128 */
+extern const bool js_isident[]; /* indexed by ch in IsIdentifierPart() when ch < 128 */
+extern const bool js_isspace[]; /* indexed by ch in IsSpace() and IsSpaceOrBOM2() when ch < 128 */
+namespace js {
+namespace unicode {
+/*
+* This enum contains all the knowledge required to handle
+* Unicode in JavaScript.
+*
+* SPACE
+*   Every character that is either in the ECMA-262 5th Edition
+*   class WhiteSpace or LineTerminator.
+*
+*   WhiteSpace
+*    \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
+*    and every other Unicode character with the General Category "Zs".
+*    In practice this is every character with the value "Zs" as the third
+*    field (after the char code in hex, and the name) called General_Category
+*    in the file UnicodeData.txt
+*    (see http://www.unicode.org/reports/tr44/#UnicodeData.txt).
+*
+*   LineTerminator
+*    \u000A, \u000D, \u2028, \u2029
+*
+* LETTER
+*   These are all characters included in UnicodeLetter from ECMA-262.
+*   This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'.
+*
+* IDENTIFIER_PART
+*   This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation,
+*   aka categories Mn/Mc, Nd, Pc,
+*   and <ZWNJ> and <ZWJ>.
+*   Attention: LETTER on its own is _not_ IdentifierStart ('$' and '_' are
+*   not covered by the flag), and IDENTIFIER_PART on its own is _not_
+*   IdentifierPart. You could build a matcher for the real IdentifierPart
+*   like this (FLAG_X stands for CharFlag::X):
+*
+*   if isEscapeSequence():
+*      handleEscapeSequence()
+*      return True
+*   if char in ['$', '_']:
+*      return True
+*   if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
+*      return True
+*
+* The values are distinct bits so that they can be OR-ed together in
+* CharacterInfo::flags.
+*/
+struct CharFlag {
+enum temp {
+SPACE  = 1 << 0,
+LETTER = 1 << 1,
+IDENTIFIER_PART = 1 << 2,
+};
+};
+const jschar BYTE_ORDER_MARK2 = 0xFFFE; /* checked explicitly in IsSpaceOrBOM2() */
+const jschar NO_BREAK_SPACE  = 0x00A0; /* checked ahead of the CharInfo() lookup in IsSpace() and IsSpaceOrBOM2() */
+class CharacterInfo {
+/*
+* upperCase and lowerCase normally store the delta between two
+* letters. For example the lower case alpha (a) has the char code
+* 97, and the upper case alpha (A) has 65. So for "a" we would
+* store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
+* because this char is already in lower case.
+* Well, not -32 exactly, but (2**16 - 32) to induce
+* unsigned overflow with identical mathematical behavior.
+* For upper case alpha, we would store 0 in upperCase and 32 in
+* lowerCase (65 + 32 = 97).
+*
+* We use deltas to reuse information for multiple characters. For
+* example the whole lower case latin alphabet fits into one entry,
+* because it's always a UnicodeLetter and upperCase contains
+* -32.
+*
+* flags is a bitwise OR of CharFlag values; the is*() predicates
+* below only test bits of flags.
+*/
+public:
+uint16_t upperCase;
+uint16_t lowerCase;
+uint8_t flags;
+inline bool isSpace() const {
+return flags & CharFlag::SPACE;
+}
+inline bool isLetter() const {
+return flags & CharFlag::LETTER;
+}
+inline bool isIdentifierPart() const {
+return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER); /* a LETTER also counts as an identifier part */
+}
+};
+extern const uint8_t index1[]; /* first lookup stage of CharInfo(), indexed by code >> 5 */
+extern const uint8_t index2[]; /* second lookup stage of CharInfo(); its entries index js_charinfo */
+extern const CharacterInfo js_charinfo[]; /* the CharacterInfo entries that CharInfo() returns references to */
+inline const CharacterInfo&
+CharInfo(jschar code) /* Returns the js_charinfo entry for code, found through the index1 and index2 tables. */
+{
+const size_t shift = 5; /* the low 5 bits of code select a slot inside a block of 1 << 5 = 32 */
+size_t index = index1[code >> shift]; /* stage 1: the remaining high bits pick the block */
+index = index2[(index << shift) + (code & ((1 << shift) - 1))]; /* stage 2: block start plus the low 5 bits */
+return js_charinfo[index];
+}
+inline bool
+IsIdentifierStart(jschar ch)
+{
+/*
+* ES5 7.6 IdentifierStart
+*  $ (dollar sign)
+*  _ (underscore)
+*  or any UnicodeLetter.
+*
+* We use a lookup table (js_isidstart) for small and thus common
+* characters (ch < 128) for speed. Every other character is answered
+* by the LETTER flag of its CharInfo() entry.
+*/
+if (ch < 128)
+return js_isidstart[ch];
+return CharInfo(ch).isLetter();
+}
+inline bool
+IsIdentifierPart(jschar ch)
+{
+/*
+* Matches ES5 7.6 IdentifierPart.
+*
+* Characters below 128 are answered from the js_isident table; every
+* other character is an identifier part if its CharInfo() entry has
+* the LETTER or the IDENTIFIER_PART flag set.
+*/
+if (ch < 128)
+return js_isident[ch];
+return CharInfo(ch).isIdentifierPart();
+}
+inline bool
+IsLetter(jschar ch) /* True if the CharInfo() entry of ch has the LETTER flag set. */
+{
+return CharInfo(ch).isLetter(); /* always goes through the tables; unlike IsIdentifierStart() there is no ch < 128 shortcut */
+}
+inline bool
+IsSpace(jschar ch)
+{
+/*
+* IsSpace checks if some character is included in the merged set
+* of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3.
+* We combined them, because in practice nearly every
+* calling function wants this, except some code in the tokenizer.
+*
+* We use a lookup table (js_isspace) for 7-bit ASCII characters,
+* because they are very common and must be handled quickly in the
+* tokenizer.
+* NO-BREAK SPACE is supposed to be the most common character not in
+* this range, so we inline this case, too. Everything else is
+* answered by the SPACE flag of the character's CharInfo() entry.
+*/
+if (ch < 128)
+return js_isspace[ch];
+if (ch == NO_BREAK_SPACE)
+return true;
+return CharInfo(ch).isSpace();
+}
+inline bool
+IsSpaceOrBOM2(jschar ch)
+{
+if (ch < 128)
+return js_isspace[ch];
+/*
+* We accept BOM2 (0xFFFE) for compatibility reasons in the parser.
+* Apart from this extra explicit check, the function performs the
+* same tests as IsSpace().
+*/
+if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
+return true;
+return CharInfo(ch).isSpace();
+}
+inline jschar
+ToUpperCase(jschar ch) /* Returns ch plus the upperCase delta stored in its CharInfo() entry. */
+{
+const CharacterInfo &info = CharInfo(ch);
+return uint16_t(ch) + info.upperCase; /* "negative" deltas are stored as 2**16 - n; the sum is converted back to jschar on return (presumably 16 bits wide, declared outside this file) */
+}
+inline jschar
+ToLowerCase(jschar ch) /* Returns ch plus the lowerCase delta stored in its CharInfo() entry. */
+{
+const CharacterInfo &info = CharInfo(ch);
+return uint16_t(ch) + info.lowerCase; /* the sum is converted back to jschar on return (presumably 16 bits wide, declared outside this file) */
+}
+} /* namespace unicode */
+} /* namespace js */
+#endif /* vm_Unicode_h */

The Tor Browser / file comparison

comparison: js/src/vm/Unicode.h

js/src/vm/Unicode.h