michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- michael@0: * vim: set ts=8 sts=4 et sw=4 tw=99: michael@0: * This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #ifndef vm_Unicode_h michael@0: #define vm_Unicode_h michael@0: michael@0: #include "jspubtd.h" michael@0: michael@0: extern const bool js_isidstart[]; michael@0: extern const bool js_isident[]; michael@0: extern const bool js_isspace[]; michael@0: michael@0: namespace js { michael@0: namespace unicode { michael@0: michael@0: /* michael@0: * This enum contains the all the knowledge required to handle michael@0: * Unicode in JavaScript. michael@0: * michael@0: * SPACE michael@0: * Every character that is either in the ECMA-262 5th Edition michael@0: * class WhiteSpace or LineTerminator. michael@0: * michael@0: * WhiteSpace michael@0: * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF michael@0: * and every other Unicode character with the General Category "Zs". michael@0: * In pratice this is every character with the value "Zs" as the third michael@0: * field (after the char code in hex, and the name) called General_Category michael@0: * (see http://www.unicode.org/reports/tr44/#UnicodeData.txt) michael@0: * in the file UnicodeData.txt. michael@0: * michael@0: * LineTerminator michael@0: * \u000A, \u000D, \u2028, \u2029 michael@0: * michael@0: * LETTER michael@0: * This are all characters included UnicodeLetter from ECMA-262. michael@0: * This includes the category 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl' michael@0: * michael@0: * IDENTIFIER_PART michael@0: * This is UnicodeCombiningMark, UnicodeDigit, UnicodeConnectorPunctuation. michael@0: * Aka categories Mn/Mc, Md, Nd, Pc michael@0: * And and . michael@0: * Attention: FLAG_LETTER is _not_ IdentifierStart, but you could build michael@0: * a matcher for the real IdentifierPart like this: michael@0: * michael@0: * if isEscapeSequence(): michael@0: * handleEscapeSequence() michael@0: * return True michael@0: * if char in ['$', '_']: michael@0: * return True michael@0: * if GetFlag(char) & (FLAG_IDENTIFIER_PART | FLAG_LETTER): michael@0: * return True michael@0: * michael@0: */ michael@0: michael@0: struct CharFlag { michael@0: enum temp { michael@0: SPACE = 1 << 0, michael@0: LETTER = 1 << 1, michael@0: IDENTIFIER_PART = 1 << 2, michael@0: }; michael@0: }; michael@0: michael@0: const jschar BYTE_ORDER_MARK2 = 0xFFFE; michael@0: const jschar NO_BREAK_SPACE = 0x00A0; michael@0: michael@0: class CharacterInfo { michael@0: /* michael@0: * upperCase and loweCase normally store the delta between two michael@0: * letters. For example the lower case alpha (a) has the char code michael@0: * 97, and the upper case alpha (A) has 65. So for "a" we would michael@0: * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, michael@0: * because this char is already in lower case. michael@0: * Well, not -32 exactly, but (2**16 - 32) to induce michael@0: * unsigned overflow with identical mathematical behavior. michael@0: * For upper case alpha, we would store 0 in upperCase and 32 in michael@0: * lowerCase (65 + 32 = 97). michael@0: * michael@0: * We use deltas to reuse information for multiple characters. For michael@0: * example the whole lower case latin alphabet fits into one entry, michael@0: * because it's always a UnicodeLetter and upperCase contains michael@0: * -32. michael@0: */ michael@0: public: michael@0: uint16_t upperCase; michael@0: uint16_t lowerCase; michael@0: uint8_t flags; michael@0: michael@0: inline bool isSpace() const { michael@0: return flags & CharFlag::SPACE; michael@0: } michael@0: michael@0: inline bool isLetter() const { michael@0: return flags & CharFlag::LETTER; michael@0: } michael@0: michael@0: inline bool isIdentifierPart() const { michael@0: return flags & (CharFlag::IDENTIFIER_PART | CharFlag::LETTER); michael@0: } michael@0: }; michael@0: michael@0: extern const uint8_t index1[]; michael@0: extern const uint8_t index2[]; michael@0: extern const CharacterInfo js_charinfo[]; michael@0: michael@0: inline const CharacterInfo& michael@0: CharInfo(jschar code) michael@0: { michael@0: const size_t shift = 5; michael@0: size_t index = index1[code >> shift]; michael@0: index = index2[(index << shift) + (code & ((1 << shift) - 1))]; michael@0: michael@0: return js_charinfo[index]; michael@0: } michael@0: michael@0: inline bool michael@0: IsIdentifierStart(jschar ch) michael@0: { michael@0: /* michael@0: * ES5 7.6 IdentifierStart michael@0: * $ (dollar sign) michael@0: * _ (underscore) michael@0: * or any UnicodeLetter. michael@0: * michael@0: * We use a lookup table for small and thus common characters for speed. michael@0: */ michael@0: michael@0: if (ch < 128) michael@0: return js_isidstart[ch]; michael@0: michael@0: return CharInfo(ch).isLetter(); michael@0: } michael@0: michael@0: inline bool michael@0: IsIdentifierPart(jschar ch) michael@0: { michael@0: /* Matches ES5 7.6 IdentifierPart. */ michael@0: michael@0: if (ch < 128) michael@0: return js_isident[ch]; michael@0: michael@0: return CharInfo(ch).isIdentifierPart(); michael@0: } michael@0: michael@0: inline bool michael@0: IsLetter(jschar ch) michael@0: { michael@0: return CharInfo(ch).isLetter(); michael@0: } michael@0: michael@0: inline bool michael@0: IsSpace(jschar ch) michael@0: { michael@0: /* michael@0: * IsSpace checks if some character is included in the merged set michael@0: * of WhiteSpace and LineTerminator, specified by ES5 7.2 and 7.3. michael@0: * We combined them, because in practice nearly every michael@0: * calling function wants this, except some code in the tokenizer. michael@0: * michael@0: * We use a lookup table for ASCII-7 characters, because they are michael@0: * very common and must be handled quickly in the tokenizer. michael@0: * NO-BREAK SPACE is supposed to be the most common character not in michael@0: * this range, so we inline this case, too. michael@0: */ michael@0: michael@0: if (ch < 128) michael@0: return js_isspace[ch]; michael@0: michael@0: if (ch == NO_BREAK_SPACE) michael@0: return true; michael@0: michael@0: return CharInfo(ch).isSpace(); michael@0: } michael@0: michael@0: inline bool michael@0: IsSpaceOrBOM2(jschar ch) michael@0: { michael@0: if (ch < 128) michael@0: return js_isspace[ch]; michael@0: michael@0: /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */ michael@0: if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2) michael@0: return true; michael@0: michael@0: return CharInfo(ch).isSpace(); michael@0: } michael@0: michael@0: inline jschar michael@0: ToUpperCase(jschar ch) michael@0: { michael@0: const CharacterInfo &info = CharInfo(ch); michael@0: michael@0: return uint16_t(ch) + info.upperCase; michael@0: } michael@0: michael@0: inline jschar michael@0: ToLowerCase(jschar ch) michael@0: { michael@0: const CharacterInfo &info = CharInfo(ch); michael@0: michael@0: return uint16_t(ch) + info.lowerCase; michael@0: } michael@0: michael@0: } /* namespace unicode */ michael@0: } /* namespace js */ michael@0: michael@0: #endif /* vm_Unicode_h */