js/src/vm/Unicode.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/vm/Unicode.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,210 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99:
     1.6 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.9 +
    1.10 +#ifndef vm_Unicode_h
    1.11 +#define vm_Unicode_h
    1.12 +
    1.13 +#include "jspubtd.h"
    1.14 +
    1.15 +extern const bool js_isidstart[];  /* Read by IsIdentifierStart for characters below 128. */
    1.16 +extern const bool js_isident[];    /* Read by IsIdentifierPart for characters below 128. */
    1.17 +extern const bool js_isspace[];    /* Read by IsSpace and IsSpaceOrBOM2 for characters below 128. */
    1.18 +
    1.19 +namespace js {
    1.20 +namespace unicode {
    1.21 +
    1.22 +/*
    1.23 + * This enum contains all the knowledge required to handle
    1.24 + * Unicode in JavaScript.
    1.25 + *
    1.26 + * SPACE
    1.27 + *   Every character that is either in the ECMA-262 5th Edition
    1.28 + *   class WhiteSpace or LineTerminator.
    1.29 + *
    1.30 + *   WhiteSpace
    1.31 + *    \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
    1.32 + *    and every other Unicode character with the General Category "Zs".
    1.33 + *    In practice this is every character with the value "Zs" as the third
    1.34 + *    field (after the char code in hex, and the name), called General_Category
    1.35 + *    (see http://www.unicode.org/reports/tr44/#UnicodeData.txt),
    1.36 + *    in the file UnicodeData.txt.
    1.37 + *
    1.38 + *   LineTerminator
    1.39 + *    \u000A, \u000D, \u2028, \u2029
    1.40 + *
    1.41 + * LETTER
    1.42 + *   These are all characters included in UnicodeLetter from ECMA-262.
    1.43 + *   This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'.
    1.44 + *
    1.45 + * IDENTIFIER_PART
    1.46 + *   This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation.
    1.47 + *   Aka categories Mn/Mc, Nd, Pc,
    1.48 + *   and <ZWNJ> and <ZWJ>.
    1.49 + *   Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build
    1.50 + *   a matcher for the real IdentifierPart like this:
    1.51 + *
    1.52 + *   if isEscapeSequence():
    1.53 + *      handleEscapeSequence()
    1.54 + *      return True
    1.55 + *   if char in ['$', '_']:
    1.56 + *      return True
    1.57 + *   if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER):
    1.58 + *      return True
    1.59 + *
    1.60 + */
    1.61 +
    1.62 +struct CharFlag {
    1.63 +    enum temp {                  /* Bit masks tested against CharacterInfo::flags. */
    1.64 +        SPACE           = 0x01,  /* ES5 WhiteSpace or LineTerminator */
    1.65 +        LETTER          = 0x02,  /* UnicodeLetter */
    1.66 +        IDENTIFIER_PART = 0x04,  /* combining mark, digit, connector punctuation, ZWNJ, ZWJ */
    1.67 +    };
    1.68 +};
    1.69 +
    1.70 +const jschar BYTE_ORDER_MARK2 = 0xFFFE;  /* Explicitly accepted as a space by IsSpaceOrBOM2. */
    1.71 +const jschar NO_BREAK_SPACE  = 0x00A0;  /* Checked inline, ahead of the CharInfo lookup, by IsSpace and IsSpaceOrBOM2. */
    1.72 +
    1.73 +class CharacterInfo {
    1.74 +    /*
    1.75 +     * One record of case-mapping deltas and classification flags. A record
    1.76 +     * is not tied to a single character: many characters can share it.
    1.77 +     *
    1.78 +     * upperCase and lowerCase normally hold the distance from a character
    1.79 +     * to its other-case form rather than the form itself. Lower case "a"
    1.80 +     * is 97 and upper case "A" is 65, so the record for "a" has
    1.81 +     * upperCase == -32 (97 + (-32) = 65) and lowerCase == 0, as "a" is
    1.82 +     * already lower case. The -32 is really stored as (2**16 - 32), which
    1.83 +     * gives the same result through unsigned overflow. The record for "A"
    1.84 +     * has upperCase == 0 and lowerCase == 32 (65 + 32 = 97).
    1.85 +     *
    1.86 +     * Storing deltas is what makes sharing possible: the whole lower case
    1.87 +     * latin alphabet fits into one record, because every letter in it is
    1.88 +     * a UnicodeLetter with upperCase == -32.
    1.89 +     */
    1.90 +  public:
    1.91 +    uint16_t upperCase;  /* delta to the upper case form, modulo 2**16 */
    1.92 +    uint16_t lowerCase;  /* delta to the lower case form, modulo 2**16 */
    1.93 +    uint8_t flags;       /* bitwise OR of CharFlag values */
    1.94 +
    1.95 +    /* Is CharFlag::SPACE set? */
    1.96 +    bool isSpace() const { return (flags & CharFlag::SPACE) != 0; }
    1.97 +
    1.98 +    /* Is CharFlag::LETTER set? */
    1.99 +    bool isLetter() const { return (flags & CharFlag::LETTER) != 0; }
   1.100 +
   1.101 +    /* Is CharFlag::LETTER or CharFlag::IDENTIFIER_PART set? */
   1.102 +    bool isIdentifierPart() const {
   1.103 +        const int wanted = CharFlag::LETTER | CharFlag::IDENTIFIER_PART;
   1.104 +        return (flags & wanted) != 0;
   1.105 +    }
   1.106 +};
   1.107 +
   1.108 +extern const uint8_t index1[];             /* First-level table: CharInfo indexes it with (code >> 5). */
   1.109 +extern const uint8_t index2[];             /* Second-level table: indexed with (index1 entry << 5) + low 5 bits of code. */
   1.110 +extern const CharacterInfo js_charinfo[];  /* Records; CharInfo selects one with the index2 entry. */
   1.111 +
   1.112 +inline const CharacterInfo&
   1.113 +CharInfo(jschar code)
   1.114 +{
   1.115 +    const size_t lowBits = 5;  /* The low 5 bits of |code| select an entry within an index2 block. */
   1.116 +    const size_t block = index1[code >> lowBits];
   1.117 +    const size_t slot = (block << lowBits) + (code & ((1 << lowBits) - 1));
   1.118 +
   1.119 +    return js_charinfo[index2[slot]];
   1.120 +}
   1.121 +
   1.122 +inline bool
   1.123 +IsIdentifierStart(jschar ch)
   1.124 +{
   1.125 +    /*
   1.126 +     * ES5 7.6 IdentifierStart is
   1.127 +     *  $ (dollar sign),
   1.128 +     *  _ (underscore),
   1.129 +     *  or any UnicodeLetter.
   1.130 +     *
   1.131 +     * Characters below 128 are small and thus common, so they are answered
   1.132 +     * from a lookup table for speed; the rest go through CharInfo.
   1.133 +     */
   1.134 +    if (ch >= 128)
   1.135 +        return CharInfo(ch).isLetter();
   1.136 +
   1.137 +    return js_isidstart[ch];
   1.138 +}
   1.139 +
   1.140 +inline bool
   1.141 +IsIdentifierPart(jschar ch)
   1.142 +{
   1.143 +    /*
   1.144 +     * Matches ES5 7.6 IdentifierPart: a table lookup below 128, the
   1.145 +     * CharacterInfo flags otherwise.
   1.146 +     */
   1.147 +    const bool inTable = ch < 128;
   1.148 +    return inTable ? js_isident[ch] : CharInfo(ch).isIdentifierPart();
   1.149 +}
   1.150 +
   1.151 +inline bool
   1.152 +IsLetter(jschar ch)  /* True when the CharacterInfo record for |ch| has CharFlag::LETTER set. */
   1.153 +{
   1.154 +    return CharInfo(ch).isLetter();  /* Always goes through CharInfo; no table fast path for ch < 128 here. */
   1.155 +}
   1.156 +
   1.157 +inline bool
   1.158 +IsSpace(jschar ch)
   1.159 +{
   1.160 +    /*
   1.161 +     * Tests membership in the union of WhiteSpace and LineTerminator, as
   1.162 +     * specified by ES5 7.2 and 7.3. The two sets are merged because in
   1.163 +     * practice nearly every calling function wants this, except some code
   1.164 +     * in the tokenizer.
   1.165 +     *
   1.166 +     * ASCII-7 characters are very common and must be handled quickly in
   1.167 +     * the tokenizer, so they are answered from a lookup table.
   1.168 +     *
   1.169 +     * NO-BREAK SPACE is supposed to be the most common character outside
   1.170 +     * that range, so it is checked inline before falling back to CharInfo.
   1.171 +     *
   1.172 +     * The || short-circuits, so CharInfo is not consulted for
   1.173 +     * NO-BREAK SPACE.
   1.174 +     */
   1.175 +    if (ch >= 128)
   1.176 +        return ch == NO_BREAK_SPACE || CharInfo(ch).isSpace();
   1.177 +
   1.178 +    return js_isspace[ch];
   1.179 +}
   1.180 +
   1.181 +inline bool
   1.182 +IsSpaceOrBOM2(jschar ch)
   1.183 +{
   1.184 +    /*
   1.185 +     * Table lookup below 128, then NO-BREAK SPACE, then the CharacterInfo flags.
   1.186 +     * We also accept BOM2 (0xFFFE) for compatibility reasons in the parser.
   1.187 +     */
   1.188 +    if (ch >= 128)
   1.189 +        return ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2 || CharInfo(ch).isSpace();
   1.190 +
   1.191 +    return js_isspace[ch];
   1.192 +}
   1.193 +
   1.194 +inline jschar
   1.195 +ToUpperCase(jschar ch)
   1.196 +{
   1.197 +    /* upperCase is a delta stored modulo 2**16 (0 when |ch| has no other upper case form). */
   1.198 +    const uint16_t delta = CharInfo(ch).upperCase;
   1.199 +    return uint16_t(ch) + delta;
   1.200 +}
   1.201 +
   1.202 +inline jschar
   1.203 +ToLowerCase(jschar ch)
   1.204 +{
   1.205 +    /* lowerCase is a delta stored modulo 2**16 (0 when |ch| has no other lower case form). */
   1.206 +    const uint16_t delta = CharInfo(ch).lowerCase;
   1.207 +    return uint16_t(ch) + delta;
   1.208 +}
   1.209 +
   1.210 +} /* namespace unicode */
   1.211 +} /* namespace js */
   1.212 +
   1.213 +#endif /* vm_Unicode_h */

mercurial