diff -r 000000000000 -r 6474c204b198 js/src/vm/Unicode.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/js/src/vm/Unicode.h Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,210 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- + * vim: set ts=8 sts=4 et sw=4 tw=99: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef vm_Unicode_h +#define vm_Unicode_h + +#include "jspubtd.h" +/* Lookup tables defined outside this header; this header only indexes them with values below 128. */ +extern const bool js_isidstart[]; +extern const bool js_isident[]; +extern const bool js_isspace[]; + +namespace js { +namespace unicode { + +/* + * The CharFlag enum below contains all the knowledge required to handle + * Unicode in JavaScript. + * + * SPACE + * Every character that is either in the ECMA-262 5th Edition + * class WhiteSpace or LineTerminator. + * + * WhiteSpace + * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF + * and every other Unicode character with the General Category "Zs". + * In practice this is every character with the value "Zs" as the third + * field (after the char code in hex, and the name) called General_Category + * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt) + * in the file UnicodeData.txt. + * + * LineTerminator + * \u000A, \u000D, \u2028, \u2029 + * + * LETTER + * These are all characters included in UnicodeLetter from ECMA-262. + * This includes the categories 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl'. + * + * IDENTIFIER_PART + * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation. + * Aka categories Mn/Mc, Nd, Pc. + * And ZWNJ (\u200C) and ZWJ (\u200D). 
+ * Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build + * a matcher for the real IdentifierPart like this: + * + * if isEscapeSequence(): + * handleEscapeSequence() + * return True + * if char in ['$', '_']: + * return True + * if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER): + * return True + * + */ +/* Bit masks stored in CharacterInfo::flags and tested by its isSpace/isLetter/isIdentifierPart members. */ +struct CharFlag { + enum temp { + SPACE = 1 << 0, + LETTER = 1 << 1, + IDENTIFIER_PART = 1 << 2, + }; +}; +/* Code units that IsSpace and IsSpaceOrBOM2 compare against directly before the table lookup. */ +const jschar BYTE_ORDER_MARK2 = 0xFFFE; +const jschar NO_BREAK_SPACE = 0x00A0; +/* One entry of js_charinfo: two case-mapping deltas plus a CharFlag bit mask. */ +class CharacterInfo { + /* + * upperCase and lowerCase normally store the delta between two + * letters. For example the lower case alpha (a) has the char code + * 97, and the upper case alpha (A) has 65. So for "a" we would + * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, + * because this char is already in lower case. + * Well, not -32 exactly, but (2**16 - 32) to induce + * unsigned overflow with identical mathematical behavior. + * For upper case alpha, we would store 0 in upperCase and 32 in + * lowerCase (65 + 32 = 97). + * + * We use deltas to reuse information for multiple characters. For + * example the whole lower case latin alphabet fits into one entry, + * because it's always a UnicodeLetter and upperCase contains + * -32. ToUpperCase() and ToLowerCase() below add these deltas to the code unit. 
+ */ + public: + uint16_t upperCase; + uint16_t lowerCase; + uint8_t flags; +/* Each predicate masks flags with CharFlag bits; a nonzero result converts to true. */ + inline bool isSpace() const { + return flags & CharFlag::SPACE; + } + + inline bool isLetter() const { + return flags & CharFlag::LETTER; + } +/* True when either IDENTIFIER_PART or LETTER is set, matching the pseudocode in the comment above CharFlag. */ + inline bool isIdentifierPart() const { + return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER); + } +}; +/* Tables consulted by CharInfo(); only declared here, defined elsewhere. */ +extern const uint8_t index1[]; +extern const uint8_t index2[]; +extern const CharacterInfo js_charinfo[]; +/* index1[code >> 5] selects a 32-entry block of index2; the low 5 bits of code select the js_charinfo index. */ +inline const CharacterInfo& +CharInfo(jschar code) +{ + const size_t shift = 5; + size_t index = index1[code >> shift]; + index = index2[(index << shift) + (code & ((1 << shift) - 1))]; + + return js_charinfo[index]; +} + +inline bool +IsIdentifierStart(jschar ch) +{ + /* + * ES5 7.6 IdentifierStart + * $ (dollar sign) + * _ (underscore) + * or any UnicodeLetter. + * + * We use a lookup table (js_isidstart) for code units below 128, which are common, for speed. + */ + + if (ch < 128) + return js_isidstart[ch]; + + return CharInfo(ch).isLetter(); +} + +inline bool +IsIdentifierPart(jschar ch) +{ + /* Matches ES5 7.6 IdentifierPart: js_isident below 128, otherwise the LETTER or IDENTIFIER_PART flag. */ + + if (ch < 128) + return js_isident[ch]; + + return CharInfo(ch).isIdentifierPart(); +} +/* True if ch has the LETTER flag; unlike the functions above there is no table fast path below 128. */ +inline bool +IsLetter(jschar ch) +{ + return CharInfo(ch).isLetter(); +} + +inline bool +IsSpace(jschar ch) +{ + /* + * IsSpace checks if some character is included in the merged set + * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3. + * We combined them, because in practice nearly every + * calling function wants this, except some code in the tokenizer. + * + * We use a lookup table for ASCII-7 characters, because they are + * very common and must be handled quickly in the tokenizer. + * NO-BREAK SPACE is supposed to be the most common character not in + * this range, so we inline this case, too. 
+ */ + + if (ch < 128) + return js_isspace[ch]; + + if (ch == NO_BREAK_SPACE) + return true; + + return CharInfo(ch).isSpace(); +} +/* Same checks as IsSpace, but additionally returns true for BYTE_ORDER_MARK2 (0xFFFE). */ +inline bool +IsSpaceOrBOM2(jschar ch) +{ + if (ch < 128) + return js_isspace[ch]; + + /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */ + if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2) + return true; + + return CharInfo(ch).isSpace(); +} +/* Returns ch plus the upperCase delta of its CharacterInfo entry (see the encoding note in CharacterInfo). */ +inline jschar +ToUpperCase(jschar ch) +{ + const CharacterInfo &info = CharInfo(ch); +/* NOTE(review): the wrap-around relies on the result narrowing to a 16-bit jschar; jschar is declared elsewhere. */ + return uint16_t(ch) + info.upperCase; +} +/* Returns ch plus the lowerCase delta of its CharacterInfo entry; same arithmetic as ToUpperCase. */ +inline jschar +ToLowerCase(jschar ch) +{ + const CharacterInfo &info = CharInfo(ch); + + return uint16_t(ch) + info.lowerCase; +} + +} /* namespace unicode */ +} /* namespace js */ + +#endif /* vm_Unicode_h */