The Tor Browser: js/src/vm/Unicode.h@6474c204b198 (annotated)

js/src/vm/Unicode.h@6474c204b198 (annotated)

js/src/vm/Unicode.h

Wed, 31 Dec 2014 06:09:35 +0100

author: Michael Schloh von Bennewitz <michael@schloh.com>
date: Wed, 31 Dec 2014 06:09:35 +0100
changeset 0: 6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
  * vim: set ts=8 sts=4 et sw=4 tw=99:
  * This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 #ifndef vm_Unicode_h
 #define vm_Unicode_h
 #include "jspubtd.h"
 /*
  * Boolean lookup tables defined elsewhere. The js::unicode helpers in this
  * header index them directly with the character value, and only do so for
  * characters below 128:
  *
  *   js_isidstart  - read by IsIdentifierStart
  *   js_isident    - read by IsIdentifierPart
  *   js_isspace    - read by IsSpace and IsSpaceOrBOM2
  *
  * NOTE(review): the array bounds are not visible here; the callers in this
  * header assume at least 128 entries each -- confirm against the definitions.
  */
 extern const bool js_isidstart[];
 extern const bool js_isident[];
 extern const bool js_isspace[];
 namespace js {
 namespace unicode {
 /*
  * This enum contains all the knowledge required to handle
  * Unicode in JavaScript.
  *
  * SPACE
  *   Every character that is either in the ECMA-262 5th Edition
  *   class WhiteSpace or LineTerminator.
  *
  *   WhiteSpace
  *    \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
  *    and every other Unicode character with the General Category "Zs".
  *    In practice this is every character with the value "Zs" as the third
  *    field (after the char code in hex, and the name) called General_Category
  *    (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)
  *     in the file UnicodeData.txt.
  *
  *   LineTerminator
  *    \u000A, \u000D, \u2028, \u2029
  *
  * LETTER
  *   These are all characters included in UnicodeLetter from ECMA-262.
  *   This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'.
  *
  * IDENTIFIER_PART
  *   This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
  *   Aka categories Mn/Mc, Nd, Pc
  *   And <ZWNJ> and <ZWJ>.
  *   Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
  *   a matcher for the real IdentifierPart like this:
  *
  *   if isEscapeSequence():
  *      handleEscapeSequence()
  *      return True
  *   if char in ['$', '_']:
  *      return True
  *   if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
  *      return True
  *
  */
 /*
  * Bit masks stored in CharacterInfo::flags. Each enumerator occupies its own
  * bit so that several of them can be OR-ed together and tested with '&'.
  */
 struct CharFlag {
     enum temp {
         SPACE           = 0x01,
         LETTER          = 0x02,
         IDENTIFIER_PART = 0x04
     };
 };
 /* U+00A0 NO-BREAK SPACE: special-cased by IsSpace and IsSpaceOrBOM2. */
 const jschar NO_BREAK_SPACE  = 0x00A0;
 /*
  * 0xFFFE, i.e. U+FEFF with its two bytes swapped. IsSpaceOrBOM2 treats it
  * as a space; IsSpace does not special-case it.
  */
 const jschar BYTE_ORDER_MARK2 = 0xFFFE;
 class CharacterInfo {
     /*
      * One table entry describing the case mapping and the classification
      * flags shared by a group of characters.
      *
      * upperCase and lowerCase do not hold target characters; they hold the
      * delta that has to be added to a character to reach its upper / lower
      * case form. Example: 'a' is 97 and 'A' is 65, so the entry used for 'a'
      * has upperCase == -32 (97 + (-32) == 65) and lowerCase == 0, since 'a'
      * is already lower case. The entry used for 'A' has upperCase == 0 and
      * lowerCase == 32 (65 + 32 == 97).
      *
      * Because the fields are unsigned, a negative delta such as -32 is
      * really stored as (2**16 - 32); the unsigned wrap-around makes the
      * addition behave exactly like the mathematical one.
      *
      * Storing deltas instead of absolute values lets many characters share
      * one entry. The whole lower case latin alphabet, for instance, needs
      * only a single entry: every letter in it is a UnicodeLetter and has an
      * upperCase delta of -32.
      */
   public:
     uint16_t upperCase;  /* delta to the upper case form, modulo 2**16 */
     uint16_t lowerCase;  /* delta to the lower case form, modulo 2**16 */
     uint8_t flags;       /* OR-ed CharFlag bits */

     /* True if the CharFlag::SPACE bit is set. */
     bool isSpace() const {
         return (flags & CharFlag::SPACE) != 0;
     }

     /* True if the CharFlag::LETTER bit is set. */
     bool isLetter() const {
         return (flags & CharFlag::LETTER) != 0;
     }

     /* True if either the IDENTIFIER_PART or the LETTER bit is set. */
     bool isIdentifierPart() const {
         const int partMask = CharFlag::IDENTIFIER_PART | CharFlag::LETTER;
         return (flags & partMask) != 0;
     }
 };
 /*
  * Tables defined elsewhere and read by CharInfo() as a two-step lookup:
  *
  *   index1      - indexed by (code >> 5); yields a block number
  *   index2      - indexed by (block << 5) + (code & 31); yields an entry
  *                 number
  *   js_charinfo - indexed by that entry number; holds the CharacterInfo
  *                 records
  */
 extern const uint8_t index1[];
 extern const uint8_t index2[];
 extern const CharacterInfo js_charinfo[];
 /*
  * Return the CharacterInfo record for |code|.
  *
  * The lookup goes through two index tables: the bits of |code| above the
  * low five select a block in index1, and that block number combined with
  * the low five bits selects the js_charinfo entry number in index2.
  */
 inline const CharacterInfo&
 CharInfo(jschar code)
 {
     const size_t blockBits = 5;
     const size_t offsetMask = (size_t(1) << blockBits) - 1;

     const size_t block = index1[code >> blockBits];
     const size_t offset = code & offsetMask;
     const size_t entry = index2[(block << blockBits) + offset];
     return js_charinfo[entry];
 }
 inline bool
 IsIdentifierStart(jschar ch)
 {
     /*
      * ES5 7.6 IdentifierStart is one of
      *   $ (dollar sign),
      *   _ (underscore),
      *   any UnicodeLetter.
      *
      * Characters below 128 are small and therefore common, so they are
      * answered from the js_isidstart table for speed; everything else is
      * decided by the LETTER flag of its CharacterInfo.
      */
     const bool belowTableLimit = ch < 128;
     return belowTableLimit ? js_isidstart[ch] : CharInfo(ch).isLetter();
 }
 inline bool
 IsIdentifierPart(jschar ch)
 {
     /*
      * Matches ES5 7.6 IdentifierPart. Characters of 128 and above are
      * classified via their CharacterInfo; smaller ones come straight from
      * the js_isident table.
      */
     if (ch >= 128)
         return CharInfo(ch).isIdentifierPart();
     return js_isident[ch];
 }
 /* True if the CharacterInfo record for |ch| has the LETTER flag set. */
 inline bool
 IsLetter(jschar ch)
 {
     const CharacterInfo &info = CharInfo(ch);
     return info.isLetter();
 }
 inline bool
 IsSpace(jschar ch)
 {
     /*
      * Answers whether |ch| belongs to the union of WhiteSpace (ES5 7.2) and
      * LineTerminator (ES5 7.3). The two sets are merged because in practice
      * nearly every caller wants both; only some tokenizer code needs them
      * apart.
      *
      * ASCII-7 characters are very common and must be handled quickly in
      * the tokenizer, so they are answered from the js_isspace table.
      * NO-BREAK SPACE is supposed to be the most common character outside
      * that range, so it is tested inline before falling back to the
      * CharacterInfo lookup.
      */
     if (ch >= 128)
         return ch == NO_BREAK_SPACE || CharInfo(ch).isSpace();
     return js_isspace[ch];
 }
 /*
  * Like IsSpace, but additionally reports BYTE_ORDER_MARK2 (0xFFFE) as a
  * space; the parser accepts it for compatibility reasons.
  */
 inline bool
 IsSpaceOrBOM2(jschar ch)
 {
     const bool belowTableLimit = ch < 128;
     if (belowTableLimit)
         return js_isspace[ch];

     /* Both special cases are decided without touching the big tables. */
     const bool alwaysSpace = (ch == NO_BREAK_SPACE) || (ch == BYTE_ORDER_MARK2);
     return alwaysSpace || CharInfo(ch).isSpace();
 }
 /*
  * Map |ch| to upper case by adding the upperCase delta of its
  * CharacterInfo record. The sum is converted back to jschar on return;
  * assuming jschar is 16 bits wide (TODO confirm), that conversion provides
  * the wrap-around the delta encoding relies on.
  */
 inline jschar
 ToUpperCase(jschar ch)
 {
     const uint16_t delta = CharInfo(ch).upperCase;
     return uint16_t(ch) + delta;
 }
 /*
  * Map |ch| to lower case by adding the lowerCase delta of its
  * CharacterInfo record. The sum is converted back to jschar on return;
  * assuming jschar is 16 bits wide (TODO confirm), that conversion provides
  * the wrap-around the delta encoding relies on.
  */
 inline jschar
 ToLowerCase(jschar ch)
 {
     const uint16_t delta = CharInfo(ch).lowerCase;
     return uint16_t(ch) + delta;
 }
 } /* namespace unicode */
 } /* namespace js */
 #endif /* vm_Unicode_h */

The Tor Browser / annotate

js/src/vm/Unicode.h@6474c204b198 (annotated)

js/src/vm/Unicode.h