The Tor Browser: js/src/vm/Unicode.h@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-

     2  * vim: set ts=8 sts=4 et sw=4 tw=99:

     3  * This Source Code Form is subject to the terms of the Mozilla Public

     4  * License, v. 2.0. If a copy of the MPL was not distributed with this

     5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     7 #ifndef vm_Unicode_h

     8 #define vm_Unicode_h

    10 #include "jspubtd.h"

    /*
     * Per-character lookup tables, defined elsewhere. The inline predicates in
     * js::unicode index them directly with the character code, and only do so
     * for codes below 128.
     */
    extern const bool js_isidstart[];   /* consulted by IsIdentifierStart */
    extern const bool js_isident[];     /* consulted by IsIdentifierPart */
    extern const bool js_isspace[];     /* consulted by IsSpace and IsSpaceOrBOM2 */

    16 namespace js {

    17 namespace unicode {

    19 /*

    20  * This enum contains all the knowledge required to handle

    21  * Unicode in JavaScript.

    22  *

    23  * SPACE

    24  *   Every character that is either in the ECMA-262 5th Edition

    25  *   class WhiteSpace or LineTerminator.

    26  *

    27  *   WhiteSpace

    28  *    \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF

    29  *    and every other Unicode character with the General Category "Zs".

    30  *    In practice this is every character with the value "Zs" as the third

    31  *    field (after the char code in hex, and the name) called General_Category

    32  *    (see http://www.unicode.org/reports/tr44/#UnicodeData.txt)

    33  *     in the file UnicodeData.txt.

    34  *

    35  *   LineTerminator

    36  *    \u000A, \u000D, \u2028, \u2029

    37  *

    38  * LETTER

    39  *   These are all characters included in UnicodeLetter from ECMA-262.

    40  *   This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'

    41  *

    42  * IDENTIFIER_PART

    43  *   This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.

    44  *   Aka categories Mn/Mc, Nd, Pc

    45  *   And <ZWNJ> and <ZWJ>.

    46  *   Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build

    47  *   a matcher for the real IdentifierPart like this:

    48  *

    49  *   if isEscapeSequence():

    50  *      handleEscapeSequence()

    51  *      return True

    52  *   if char in ['$', '_']:

    53  *      return True

    54  *   if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):

    55  *      return True

    56  *

    57  */

    /*
     * Character-class bit flags. Every enumerator occupies its own bit, so the
     * values can be OR-ed together and tested with a bitwise AND.
     */
    struct CharFlag {
        enum temp {
            SPACE           = 0x01,  /* bit 0 */
            LETTER          = 0x02,  /* bit 1 */
            IDENTIFIER_PART = 0x04   /* bit 2 */
        };
    };

    67 const jschar BYTE_ORDER_MARK2 = 0xFFFE;

    68 const jschar NO_BREAK_SPACE  = 0x00A0;

    70 class CharacterInfo {

    71     /*

    72      * upperCase and loweCase normally store the delta between two

    73      * letters. For example the lower case alpha (a) has the char code

    74      * 97, and the upper case alpha (A) has 65. So for "a" we would

    75      * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,

    76      * because this char is already in lower case.

    77      * Well, not -32 exactly, but (2**16 - 32) to induce

    78      * unsigned overflow with identical mathematical behavior.

    79      * For upper case alpha, we would store 0 in upperCase and 32 in

    80      * lowerCase (65 + 32 = 97).

    81      *

    82      * We use deltas to reuse information for multiple characters. For

    83      * example the whole lower case latin alphabet fits into one entry,

    84      * because it's always a UnicodeLetter and upperCase contains

    85      * -32.

    86      */

    87   public:

    88     uint16_t upperCase;

    89     uint16_t lowerCase;

    90     uint8_t flags;

    92     inline bool isSpace() const {

    93         return flags & CharFlag::SPACE;

    94     }

    96     inline bool isLetter() const {

    97         return flags & CharFlag::LETTER;

    98     }

   100     inline bool isIdentifierPart() const {

   101         return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER);

   102     }

   103 };

   105 extern const uint8_t index1[];

   106 extern const uint8_t index2[];

   107 extern const CharacterInfo js_charinfo[];

   109 inline const CharacterInfo&

   110 CharInfo(jschar code)

   111 {

   112     const size_t shift = 5;

   113     size_t index = index1[code >> shift];

   114     index = index2[(index << shift) + (code & ((1 << shift) - 1))];

   116     return js_charinfo[index];

   117 }

   119 inline bool

   120 IsIdentifierStart(jschar ch)

   121 {

   122     /*

   123      * ES5 7.6 IdentifierStart

   124      *  $ (dollar sign)

   125      *  _ (underscore)

   126      *  or any UnicodeLetter.

   127      *

   128      * We use a lookup table for small and thus common characters for speed.

   129      */

   131     if (ch < 128)

   132         return js_isidstart[ch];

   134     return CharInfo(ch).isLetter();

   135 }

   137 inline bool

   138 IsIdentifierPart(jschar ch)

   139 {

   140     /* Matches ES5 7.6 IdentifierPart. */

   142     if (ch < 128)

   143         return js_isident[ch];

   145     return CharInfo(ch).isIdentifierPart();

   146 }

   148 inline bool

   149 IsLetter(jschar ch)

   150 {

   151     return CharInfo(ch).isLetter();

   152 }

   154 inline bool

   155 IsSpace(jschar ch)

   156 {

   157     /*

   158      * IsSpace checks if some character is included in the merged set

   159      * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3.

   160      * We combined them, because in practice nearly every

   161      * calling function wants this, except some code in the tokenizer.

   162      *

   163      * We use a lookup table for ASCII-7 characters, because they are

   164      * very common and must be handled quickly in the tokenizer.

   165      * NO-BREAK SPACE is supposed to be the most common character not in

   166      * this range, so we inline this case, too.

   167      */

   169     if (ch < 128)

   170         return js_isspace[ch];

   172     if (ch == NO_BREAK_SPACE)

   173         return true;

   175     return CharInfo(ch).isSpace();

   176 }

   178 inline bool

   179 IsSpaceOrBOM2(jschar ch)

   180 {

   181     if (ch < 128)

   182         return js_isspace[ch];

   184     /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */

   185     if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)

   186         return true;

   188     return CharInfo(ch).isSpace();

   189 }

   191 inline jschar

   192 ToUpperCase(jschar ch)

   193 {

   194     const CharacterInfo &info = CharInfo(ch);

   196     return uint16_t(ch) + info.upperCase;

   197 }

   199 inline jschar

   200 ToLowerCase(jschar ch)

   201 {

   202     const CharacterInfo &info = CharInfo(ch);

   204     return uint16_t(ch) + info.lowerCase;

   205 }

   207 } /* namespace unicode */

   208 } /* namespace js */

   210 #endif /* vm_Unicode_h */

The Tor Browser / file revision