js/src/vm/Unicode.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     2  * vim: set ts=8 sts=4 et sw=4 tw=99:
     3  * This Source Code Form is subject to the terms of the Mozilla Public
     4  * License, v. 2.0. If a copy of the MPL was not distributed with this
     5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     7 #ifndef vm_Unicode_h
     8 #define vm_Unicode_h
    10 #include "jspubtd.h"
    12 extern const bool js_isidstart[];  /* Indexed by code units < 128 in unicode::IsIdentifierStart. */
    13 extern const bool js_isident[];    /* Indexed by code units < 128 in unicode::IsIdentifierPart. */
    14 extern const bool js_isspace[];    /* Indexed by code units < 128 in unicode::IsSpace and unicode::IsSpaceOrBOM2. */
    16 namespace js {
    17 namespace unicode {
    /*
     * Property bits that describe how JavaScript treats a Unicode character.
     * They are stored in CharacterInfo::flags.
     *
     * SPACE
     *   Set for every character that belongs to the ECMA-262 5th Edition
     *   class WhiteSpace or LineTerminator.
     *
     *   WhiteSpace:
     *     \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF, plus every other
     *     Unicode character whose General_Category is "Zs". In practice that
     *     is every entry of UnicodeData.txt whose third field (after the
     *     character code in hex and the name) is "Zs"; see
     *     http://www.unicode.org/reports/tr44/#UnicodeData.txt
     *
     *   LineTerminator:
     *     \u000A, \u000D, \u2028, \u2029
     *
     * LETTER
     *   Set for every character that is a UnicodeLetter in ECMA-262, i.e. the
     *   categories Lu, Ll, Lt, Lm, Lo and Nl.
     *
     * IDENTIFIER_PART
     *   Set for UnicodeCombiningMark, UnicodeDigit and
     *   UnicodeConnectorPunctuation (categories Mn/Mc, Nd and Pc), and for
     *   <ZWNJ> and <ZWJ>.
     *
     *   Note that LETTER on its own is _not_ IdentifierStart, and
     *   IDENTIFIER_PART on its own is _not_ IdentifierPart. A matcher for the
     *   real IdentifierPart can be built like this:
     *
     *     if isEscapeSequence():
     *        handleEscapeSequence()
     *        return True
     *     if char in ['$', '_']:
     *        return True
     *     if GetFlag(char) & (IDENTIFIER_PART | LETTER):
     *        return True
     */
    struct CharFlag {
        enum temp {
            SPACE           = 0x01,
            LETTER          = 0x02,
            IDENTIFIER_PART = 0x04
        };
    };
    67 const jschar BYTE_ORDER_MARK2 = 0xFFFE;
    68 const jschar NO_BREAK_SPACE  = 0x00A0;
    70 class CharacterInfo {
    71     /*
    72      * upperCase and loweCase normally store the delta between two
    73      * letters. For example the lower case alpha (a) has the char code
    74      * 97, and the upper case alpha (A) has 65. So for "a" we would
    75      * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
    76      * because this char is already in lower case.
    77      * Well, not -32 exactly, but (2**16 - 32) to induce
    78      * unsigned overflow with identical mathematical behavior.
    79      * For upper case alpha, we would store 0 in upperCase and 32 in
    80      * lowerCase (65 + 32 = 97).
    81      *
    82      * We use deltas to reuse information for multiple characters. For
    83      * example the whole lower case latin alphabet fits into one entry,
    84      * because it's always a UnicodeLetter and upperCase contains
    85      * -32.
    86      */
    87   public:
    88     uint16_t upperCase;
    89     uint16_t lowerCase;
    90     uint8_t flags;
    92     inline bool isSpace() const {
    93         return flags & CharFlag::SPACE;
    94     }
    96     inline bool isLetter() const {
    97         return flags & CharFlag::LETTER;
    98     }
   100     inline bool isIdentifierPart() const {
   101         return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER);
   102     }
   103 };
   105 extern const uint8_t index1[];
   106 extern const uint8_t index2[];
   107 extern const CharacterInfo js_charinfo[];
   109 inline const CharacterInfo&
   110 CharInfo(jschar code)
   111 {
   112     const size_t shift = 5;
   113     size_t index = index1[code >> shift];
   114     index = index2[(index << shift) + (code & ((1 << shift) - 1))];
   116     return js_charinfo[index];
   117 }
   119 inline bool
   120 IsIdentifierStart(jschar ch)
   121 {
   122     /*
   123      * ES5 7.6 IdentifierStart
   124      *  $ (dollar sign)
   125      *  _ (underscore)
   126      *  or any UnicodeLetter.
   127      *
   128      * We use a lookup table for small and thus common characters for speed.
   129      */
   131     if (ch < 128)
   132         return js_isidstart[ch];
   134     return CharInfo(ch).isLetter();
   135 }
   137 inline bool
   138 IsIdentifierPart(jschar ch)
   139 {
   140     /* Matches ES5 7.6 IdentifierPart. */
   142     if (ch < 128)
   143         return js_isident[ch];
   145     return CharInfo(ch).isIdentifierPart();
   146 }
   148 inline bool
   149 IsLetter(jschar ch)
   150 {
   151     return CharInfo(ch).isLetter();
   152 }
   154 inline bool
   155 IsSpace(jschar ch)
   156 {
   157     /*
   158      * IsSpace checks if some character is included in the merged set
   159      * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3.
   160      * We combined them, because in practice nearly every
   161      * calling function wants this, except some code in the tokenizer.
   162      *
   163      * We use a lookup table for ASCII-7 characters, because they are
   164      * very common and must be handled quickly in the tokenizer.
   165      * NO-BREAK SPACE is supposed to be the most common character not in
   166      * this range, so we inline this case, too.
   167      */
   169     if (ch < 128)
   170         return js_isspace[ch];
   172     if (ch == NO_BREAK_SPACE)
   173         return true;
   175     return CharInfo(ch).isSpace();
   176 }
   178 inline bool
   179 IsSpaceOrBOM2(jschar ch)
   180 {
   181     if (ch < 128)
   182         return js_isspace[ch];
   184     /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */
   185     if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
   186         return true;
   188     return CharInfo(ch).isSpace();
   189 }
   191 inline jschar
   192 ToUpperCase(jschar ch)
   193 {
   194     const CharacterInfo &info = CharInfo(ch);
   196     return uint16_t(ch) + info.upperCase;
   197 }
   199 inline jschar
   200 ToLowerCase(jschar ch)
   201 {
   202     const CharacterInfo &info = CharInfo(ch);
   204     return uint16_t(ch) + info.lowerCase;
   205 }
   207 } /* namespace unicode */
   208 } /* namespace js */
   210 #endif /* vm_Unicode_h */

mercurial