/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef frontend_TokenStream_h
#define frontend_TokenStream_h

// JS lexical scanner interface.

#include "mozilla/DebugOnly.h"
#include "mozilla/PodOperations.h"

#include <stdarg.h>
#include <stddef.h>
#include <stdio.h>

#include "jscntxt.h"
#include "jspubtd.h"

#include "js/Vector.h"
#include "vm/RegExpObject.h"

namespace js {
namespace frontend {

// Values of this type are used to index into arrays such as isExprEnding[],
// so the first value must be zero.
enum TokenKind {
    TOK_ERROR = 0,                 // well-known as the only code < EOF
    TOK_EOF,                       // end of file
    TOK_EOL,                       // end of line; only returned by peekTokenSameLine()
    TOK_SEMI,                      // semicolon
    TOK_COMMA,                     // comma operator
    TOK_HOOK, TOK_COLON,           // conditional (?:)
    TOK_INC, TOK_DEC,              // increment/decrement (++ --)
    TOK_DOT,                       // member operator (.)
    TOK_TRIPLEDOT,                 // for rest arguments (...)
    TOK_LB, TOK_RB,                // left and right brackets
    TOK_LC, TOK_RC,                // left and right curlies (braces)
    TOK_LP, TOK_RP,                // left and right parentheses
    TOK_NAME,                      // identifier
    TOK_NUMBER,                    // numeric constant
    TOK_STRING,                    // string constant
    TOK_REGEXP,                    // RegExp constant
    TOK_TRUE,                      // true
    TOK_FALSE,                     // false
    TOK_NULL,                      // null
    TOK_THIS,                      // this
    TOK_FUNCTION,                  // function keyword
    TOK_IF,                        // if keyword
    TOK_ELSE,                      // else keyword
    TOK_SWITCH,                    // switch keyword
    TOK_CASE,                      // case keyword
    TOK_DEFAULT,                   // default keyword
    TOK_WHILE,                     // while keyword
    TOK_DO,                        // do keyword
    TOK_FOR,                       // for keyword
    TOK_BREAK,                     // break keyword
    TOK_CONTINUE,                  // continue keyword
    TOK_VAR,                       // var keyword
    TOK_CONST,                     // const keyword
    TOK_WITH,                      // with keyword
    TOK_RETURN,                    // return keyword
    TOK_NEW,                       // new keyword
    TOK_DELETE,                    // delete keyword
    TOK_TRY,                       // try keyword
    TOK_CATCH,                     // catch keyword
    TOK_FINALLY,                   // finally keyword
    TOK_THROW,                     // throw keyword
    TOK_DEBUGGER,                  // debugger keyword
    TOK_YIELD,                     // yield from generator function
    TOK_LET,                       // let keyword
    TOK_EXPORT,                    // export keyword
    TOK_IMPORT,                    // import keyword
    TOK_RESERVED,                  // reserved keywords
    TOK_STRICT_RESERVED,           // reserved keywords in strict mode

    // The following token types occupy contiguous ranges to enable easy
    // range-testing.

    // Binary operator tokens, TOK_OR thru TOK_MOD.  These must be in the same
    // order as F(OR) and friends in FOR_EACH_PARSE_NODE_KIND in ParseNode.h.
    TOK_OR,                        // logical or (||)
    TOK_BINOP_FIRST = TOK_OR,
    TOK_AND,                       // logical and (&&)
    TOK_BITOR,                     // bitwise-or (|)
    TOK_BITXOR,                    // bitwise-xor (^)
    TOK_BITAND,                    // bitwise-and (&)

    // Equality operation tokens, per TokenKindIsEquality.
    TOK_STRICTEQ,
    TOK_EQUALITY_START = TOK_STRICTEQ,
    TOK_EQ,
    TOK_STRICTNE,
    TOK_NE,
    TOK_EQUALITY_LAST = TOK_NE,

    // Relational ops (< <= > >=), per TokenKindIsRelational.
    TOK_LT,
    TOK_RELOP_START = TOK_LT,
    TOK_LE,
    TOK_GT,
    TOK_GE,
    TOK_RELOP_LAST = TOK_GE,

    TOK_INSTANCEOF,                // |instanceof| keyword
    TOK_IN,                        // |in| keyword

    // Shift ops (<< >> >>>), per TokenKindIsShift.
    TOK_LSH,
    TOK_SHIFTOP_START = TOK_LSH,
    TOK_RSH,
    TOK_URSH,
    TOK_SHIFTOP_LAST = TOK_URSH,

    TOK_ADD,
    TOK_SUB,
    TOK_MUL,
    TOK_DIV,
    TOK_MOD,
    TOK_BINOP_LAST = TOK_MOD,

    // Unary operation tokens.
    TOK_TYPEOF,
    TOK_VOID,
    TOK_NOT,
    TOK_BITNOT,

    TOK_ARROW,                     // function arrow (=>)

    // Assignment ops (= += -= etc.), per TokenKindIsAssignment.
    TOK_ASSIGN,
    TOK_ASSIGNMENT_START = TOK_ASSIGN,
    TOK_ADDASSIGN,
    TOK_SUBASSIGN,
    TOK_BITORASSIGN,
    TOK_BITXORASSIGN,
    TOK_BITANDASSIGN,
    TOK_LSHASSIGN,
    TOK_RSHASSIGN,
    TOK_URSHASSIGN,
    TOK_MULASSIGN,
    TOK_DIVASSIGN,
    TOK_MODASSIGN,
    TOK_ASSIGNMENT_LAST = TOK_MODASSIGN,

    TOK_LIMIT                      // domain size
};

inline bool
TokenKindIsBinaryOp(TokenKind tt)
{
    return TOK_BINOP_FIRST <= tt && tt <= TOK_BINOP_LAST;
}

inline bool
TokenKindIsEquality(TokenKind tt)
{
    return TOK_EQUALITY_START <= tt && tt <= TOK_EQUALITY_LAST;
}

inline bool
TokenKindIsRelational(TokenKind tt)
{
    return TOK_RELOP_START <= tt && tt <= TOK_RELOP_LAST;
}

inline bool
TokenKindIsShift(TokenKind tt)
{
    return TOK_SHIFTOP_START <= tt && tt <= TOK_SHIFTOP_LAST;
}

inline bool
TokenKindIsAssignment(TokenKind tt)
{
    return TOK_ASSIGNMENT_START <= tt && tt <= TOK_ASSIGNMENT_LAST;
}

inline bool
TokenKindIsDecl(TokenKind tt)
{
    return tt == TOK_VAR || tt == TOK_LET;
}
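
// Illustrative sketch (not engine code): because the operator tokens above are
// laid out in contiguous ranges, classifying a token is a pair of integer
// comparisons rather than a switch over every TOK_* value.  A hypothetical
// caller might write:
//
//   void classify(TokenKind tt) {       // classify() exists only for this example
//       if (TokenKindIsAssignment(tt)) {
//           // tt is one of TOK_ASSIGN .. TOK_MODASSIGN
//       } else if (TokenKindIsRelational(tt)) {
//           // tt is one of TOK_LT, TOK_LE, TOK_GT, TOK_GE
//       }
//   }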

struct TokenPos {
    uint32_t    begin;  // Offset of the token's first char.
    uint32_t    end;    // Offset of 1 past the token's last char.

    TokenPos() {}
    TokenPos(uint32_t begin, uint32_t end) : begin(begin), end(end) {}

    // Return a TokenPos that covers left, right, and anything in between.
    static TokenPos box(const TokenPos &left, const TokenPos &right) {
        JS_ASSERT(left.begin <= left.end);
        JS_ASSERT(left.end <= right.begin);
        JS_ASSERT(right.begin <= right.end);
        return TokenPos(left.begin, right.end);
    }

    bool operator==(const TokenPos &bpos) const {
        return begin == bpos.begin && end == bpos.end;
    }

    bool operator!=(const TokenPos &bpos) const {
        return begin != bpos.begin || end != bpos.end;
    }

    bool operator <(const TokenPos &bpos) const {
        return begin < bpos.begin;
    }

    bool operator <=(const TokenPos &bpos) const {
        return begin <= bpos.begin;
    }

    bool operator >(const TokenPos &bpos) const {
        return !(*this <= bpos);
    }

    bool operator >=(const TokenPos &bpos) const {
        return !(*this < bpos);
    }

    bool encloses(const TokenPos &pos) const {
        return begin <= pos.begin && pos.end <= end;
    }
};
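
// Illustrative sketch: given a token spanning offsets [4, 7) and a later one
// spanning [10, 15), box() produces the covering position, and encloses() then
// holds for both inputs:
//
//   TokenPos left(4, 7), right(10, 15);
//   TokenPos whole = TokenPos::box(left, right);  // begin == 4, end == 15
//   JS_ASSERT(whole.encloses(left) && whole.encloses(right));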

enum DecimalPoint { NoDecimal = false, HasDecimal = true };

struct Token
{
    TokenKind           type;           // char value or above enumerator
    TokenPos            pos;            // token position in file
    union {
      private:
        friend struct Token;
        PropertyName    *name;          // non-numeric atom
        JSAtom          *atom;          // potentially-numeric atom
        struct {
            double      value;          // floating point number
            DecimalPoint decimalPoint;  // literal contains '.'
        } number;
        RegExpFlag      reflags;        // regexp flags; use tokenbuf to access
                                        //   regexp chars
    } u;

    // This constructor is necessary only for MSVC 2013 and how it compiles the
    // initialization of TokenStream::tokens.  That field is initialized as
    // tokens() in the constructor init-list.  This *should* zero the entire
    // array, then (because Token has a non-trivial constructor, because
    // TokenPos has a user-provided constructor) call the implicit Token
    // constructor on each element, which would call the TokenPos constructor
    // for Token::pos and do nothing.  (All of which is equivalent to just
    // zeroing TokenStream::tokens.)  But MSVC 2013 (2010/2012 don't have this
    // bug) doesn't zero out each element, so we need this extra constructor to
    // make it do the right thing.  (Token is used primarily by reference or
    // pointer, and it's only initialized in a very few places, so having a
    // user-defined constructor won't hurt perf.)  See also bug 920318.
    Token()
      : type(TOK_ERROR),
        pos(0, 0)
    {
    }

    // Mutators

    void setName(PropertyName *name) {
        JS_ASSERT(type == TOK_NAME);
        JS_ASSERT(!IsPoisonedPtr(name));
        u.name = name;
    }

    void setAtom(JSAtom *atom) {
        JS_ASSERT(type == TOK_STRING);
        JS_ASSERT(!IsPoisonedPtr(atom));
        u.atom = atom;
    }

    void setRegExpFlags(js::RegExpFlag flags) {
        JS_ASSERT(type == TOK_REGEXP);
        JS_ASSERT((flags & AllFlags) == flags);
        u.reflags = flags;
    }

    void setNumber(double n, DecimalPoint decimalPoint) {
        JS_ASSERT(type == TOK_NUMBER);
        u.number.value = n;
        u.number.decimalPoint = decimalPoint;
    }

    // Type-safe accessors

    PropertyName *name() const {
        JS_ASSERT(type == TOK_NAME);
        return u.name->asPropertyName();  // poor-man's type verification
    }

    JSAtom *atom() const {
        JS_ASSERT(type == TOK_STRING);
        return u.atom;
    }

    js::RegExpFlag regExpFlags() const {
        JS_ASSERT(type == TOK_REGEXP);
        JS_ASSERT((u.reflags & AllFlags) == u.reflags);
        return u.reflags;
    }

    double number() const {
        JS_ASSERT(type == TOK_NUMBER);
        return u.number.value;
    }

    DecimalPoint decimalPoint() const {
        JS_ASSERT(type == TOK_NUMBER);
        return u.number.decimalPoint;
    }
};
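
// Illustrative sketch: the |type| tag determines which accessor is valid (and
// each accessor asserts it).  A consumer holding a TokenStream |ts| might do:
//
//   const Token &tok = ts.currentToken();
//   if (tok.type == TOK_NUMBER) {
//       double d = tok.number();
//       bool hasDot = tok.decimalPoint() == HasDecimal;
//   } else if (tok.type == TOK_NAME) {
//       PropertyName *name = tok.name();
//   }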

struct CompileError {
    JSErrorReport report;
    char *message;
    ErrorArgumentsType argumentsType;
    CompileError()
      : message(nullptr), argumentsType(ArgumentsAreUnicode)
    {
        mozilla::PodZero(&report);
    }
    ~CompileError();
    void throwError(JSContext *cx);

  private:
    // CompileError owns raw allocated memory, so disable assignment and
    // copying for safety.
    void operator=(const CompileError &) MOZ_DELETE;
    CompileError(const CompileError &) MOZ_DELETE;
};

// Ideally, tokenizing would be entirely independent of context.  But the
// strict mode flag, which is in SharedContext, affects tokenizing, and
// TokenStream needs to see it.
//
// This class is a tiny back-channel from TokenStream to the strict mode flag
// that avoids exposing the rest of SharedContext to TokenStream.
//
class StrictModeGetter {
  public:
    virtual bool strictMode() = 0;
};

// TokenStream is the lexical scanner for JavaScript source text.
//
// It takes a buffer of jschars and linearly scans it into |Token|s.
// Internally the class uses a four-element circular buffer |tokens| of
// |Token|s.  As an index for |tokens|, the member |cursor| points to the
// current token.  Calls to getToken() increase |cursor| by one and return the
// new current token.  If a TokenStream was just created, the current token is
// uninitialized, so one of the first four member functions listed below must
// be called before anything else.  The circular buffer lets us go back up to
// two tokens from the last scanned token.  Internally, the relative number of
// backward steps that were taken (via ungetToken()) after the last token was
// scanned is stored in |lookahead|.
//
// The following table lists the precondition for calling each listed function
// and how the call changes |lookahead|.  The preconditions are checked only
// in debug builds.
//
// Function Name     | Precondition; changes to |lookahead|
// ------------------+---------------------------------------------------------
// getToken          | none; if |lookahead > 0| then |lookahead--|
// peekToken         | none; if |lookahead == 0| then |lookahead == 1|
// peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
// matchToken        | none; if |lookahead > 0| and the match succeeds then
//                   |   |lookahead--|
// consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
// ungetToken        | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
//
// The behavior of the token scanning process (see getTokenInternal()) can be
// modified by calling one of the first four member functions listed above with
// an optional argument of type Modifier.  However, the modifier will be
// ignored unless |lookahead == 0| holds.  Due to constraints of the grammar,
// this turns out not to be a problem in practice.  See the
// mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?'
// for more details:
// https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E
//
// The methods seek() and tell() make it possible to rescan from a previously
// visited location in the buffer.
//
class MOZ_STACK_CLASS TokenStream
{
    // Unicode separators that are treated as line terminators, in addition to \n, \r.
    enum {
        LINE_SEPARATOR = 0x2028,
        PARA_SEPARATOR = 0x2029
    };

    static const size_t ntokens = 4;    // 1 current + 2 lookahead, rounded
                                        // to power of 2 to avoid divmod by 3
    static const unsigned maxLookahead = 2;
    static const unsigned ntokensMask = ntokens - 1;

  public:
    typedef Vector<jschar, 32> CharBuffer;

    TokenStream(ExclusiveContext *cx, const ReadOnlyCompileOptions &options,
                const jschar *base, size_t length, StrictModeGetter *smg);

    ~TokenStream();
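
    // Illustrative sketch of the lookahead discipline described in the table
    // above; the calls below are examples, not engine code:
    //
    //   TokenKind tt = ts.getToken();   // scan a token; |lookahead| stays 0
    //   ts.ungetToken();                // push it back; |lookahead| == 1
    //   if (ts.matchToken(TOK_LP)) {
    //       // The pushed-back token was TOK_LP and has been re-consumed.
    //   } else {
    //       // The non-matching token was pushed back again (|lookahead| == 1);
    //       // peekToken() now returns it without consuming it.
    //       TokenKind next = ts.peekToken();
    //   }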

    // Accessors.
    const Token &currentToken() const { return tokens[cursor]; }
    bool isCurrentTokenType(TokenKind type) const {
        return currentToken().type == type;
    }
    const CharBuffer &getTokenbuf() const { return tokenbuf; }
    const char *getFilename() const { return filename; }
    unsigned getLineno() const { return lineno; }
    unsigned getColumn() const { return userbuf.addressOfNextRawChar() - linebase - 1; }
    JSPrincipals *getOriginPrincipals() const { return originPrincipals; }
    JSVersion versionNumber() const { return VersionNumber(options().version); }
    JSVersion versionWithFlags() const { return options().version; }

    PropertyName *currentName() const {
        if (isCurrentTokenType(TOK_YIELD))
            return cx->names().yield;
        JS_ASSERT(isCurrentTokenType(TOK_NAME));
        return currentToken().name();
    }

    bool isCurrentTokenAssignment() const {
        return TokenKindIsAssignment(currentToken().type);
    }

    // Flag methods.
    bool isEOF() const { return flags.isEOF; }
    bool sawOctalEscape() const { return flags.sawOctalEscape; }
    bool hadError() const { return flags.hadError; }

    // TokenStream-specific error reporters.
    bool reportError(unsigned errorNumber, ...);
    bool reportWarning(unsigned errorNumber, ...);

    static const uint32_t NoOffset = UINT32_MAX;

    // General-purpose error reporters.  You should avoid calling these
    // directly, and instead use the more succinct alternatives (e.g.
    // reportError()) in TokenStream, Parser, and BytecodeEmitter.
    bool reportCompileErrorNumberVA(uint32_t offset, unsigned flags, unsigned errorNumber,
                                    va_list args);
    bool reportStrictModeErrorNumberVA(uint32_t offset, bool strictMode, unsigned errorNumber,
                                       va_list args);
    bool reportStrictWarningErrorNumberVA(uint32_t offset, unsigned errorNumber,
                                          va_list args);

    // asm.js reporter
    void reportAsmJSError(uint32_t offset, unsigned errorNumber, ...);
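
    // Illustrative sketch: a caller that detects a problem reports it through
    // one of the reporters above and then propagates failure.  The JSMSG_* id
    // below is a placeholder, not necessarily a real message in js.msg:
    //
    //   if (sawBadEscape) {
    //       reportError(JSMSG_BAD_ESCAPE);   // placeholder message id
    //       return false;
    //   }
    //
    // reportWarning() has the same shape, but compilation may continue after it.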

  private:
    // These are private because they should only be called by the tokenizer
    // while tokenizing, not by, for example, BytecodeEmitter.
    bool reportStrictModeError(unsigned errorNumber, ...);
    bool strictMode() const { return strictModeGetter && strictModeGetter->strictMode(); }

    void onError();
    static JSAtom *atomize(ExclusiveContext *cx, CharBuffer &cb);
    bool putIdentInTokenbuf(const jschar *identStart);

    struct Flags
    {
        bool isEOF:1;           // Hit end of file.
        bool isDirtyLine:1;     // Non-whitespace since start of line.
        bool sawOctalEscape:1;  // Saw an octal character escape.
        bool hadError:1;        // Returned TOK_ERROR from getToken.

        Flags()
          : isEOF(), isDirtyLine(), sawOctalEscape(), hadError()
        {}
    };

  public:
    // Sometimes the parser needs to modify how tokens are created.
    enum Modifier
    {
        None,           // Normal operation.
        Operand,        // Looking for an operand, not an operator.  In
                        // practice, this means that when '/' is seen, we
                        // look for a regexp instead of just returning
                        // TOK_DIV.
        KeywordIsName,  // Treat keywords as names by returning TOK_NAME.
    };

    // Get the next token from the stream, make it the current token, and
    // return its kind.
    TokenKind getToken(Modifier modifier = None) {
        // Check for a pushed-back token resulting from mismatching lookahead.
        if (lookahead != 0) {
            lookahead--;
            cursor = (cursor + 1) & ntokensMask;
            TokenKind tt = currentToken().type;
            JS_ASSERT(tt != TOK_EOL);
            return tt;
        }

        return getTokenInternal(modifier);
    }

    // Push the last scanned token back into the stream.
    void ungetToken() {
        JS_ASSERT(lookahead < maxLookahead);
        lookahead++;
        cursor = (cursor - 1) & ntokensMask;
    }

    TokenKind peekToken(Modifier modifier = None) {
        if (lookahead != 0)
            return tokens[(cursor + 1) & ntokensMask].type;
        TokenKind tt = getTokenInternal(modifier);
        ungetToken();
        return tt;
    }

    TokenPos peekTokenPos(Modifier modifier = None) {
        if (lookahead != 0)
            return tokens[(cursor + 1) & ntokensMask].pos;
        getTokenInternal(modifier);
        ungetToken();
        JS_ASSERT(lookahead != 0);
        return tokens[(cursor + 1) & ntokensMask].pos;
    }

    // This is like peekToken(), with one exception:  if there is an EOL
    // between the end of the current token and the start of the next token, it
    // returns TOK_EOL.  In that case, no token with TOK_EOL is actually
    // created, just a TOK_EOL TokenKind is returned, and currentToken()
    // shouldn't be consulted.  (This is the only place TOK_EOL is produced.)
    MOZ_ALWAYS_INLINE TokenKind peekTokenSameLine(Modifier modifier = None) {
        const Token &curr = currentToken();

        // If lookahead != 0, we have scanned ahead at least one token, and
        // |lineno| is the line that the furthest-scanned token ends on.  If
        // it's the same as the line that the current token ends on, that's a
        // stronger condition than what we are looking for, and we don't need
        // to return TOK_EOL.
        if (lookahead != 0 && srcCoords.isOnThisLine(curr.pos.end, lineno))
            return tokens[(cursor + 1) & ntokensMask].type;

        // The above check misses two cases where we don't have to return
        // TOK_EOL.
        // - The next token starts on the same line, but is a multi-line token.
        // - The next token starts on the same line, but lookahead==2 and there
        //   is a newline between the next token and the one after that.
        // The following test is somewhat expensive but gets these cases (and
        // all others) right.
        (void)getToken(modifier);
        const Token &next = currentToken();
        ungetToken();
        return srcCoords.lineNum(curr.pos.end) == srcCoords.lineNum(next.pos.begin)
               ? next.type
               : TOK_EOL;
    }
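
    // Illustrative sketch: peekTokenSameLine() is what a parser needs for
    // restricted productions such as |return [no LineTerminator here] expr|,
    // where a line break after the keyword changes the parse (ASI).  The exact
    // terminator set below is illustrative; see the parser for the real logic.
    //
    //   // After consuming TOK_RETURN:
    //   TokenKind next = ts.peekTokenSameLine(TokenStream::Operand);
    //   if (next == TOK_EOL || next == TOK_SEMI || next == TOK_RC) {
    //       // |return| with no operand.
    //   } else {
    //       // Parse the return expression.
    //   }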

    // Get the next token from the stream if its kind is |tt|.
    bool matchToken(TokenKind tt, Modifier modifier = None) {
        if (getToken(modifier) == tt)
            return true;
        ungetToken();
        return false;
    }

    void consumeKnownToken(TokenKind tt) {
        JS_ALWAYS_TRUE(matchToken(tt));
    }

    bool matchContextualKeyword(Handle<PropertyName*> keyword) {
        if (getToken() == TOK_NAME && currentToken().name() == keyword)
            return true;
        ungetToken();
        return false;
    }

    bool nextTokenEndsExpr() {
        return isExprEnding[peekToken()];
    }

    class MOZ_STACK_CLASS Position {
      public:
        // The Token fields may contain pointers to atoms, so for correct
        // rooting we must ensure collection of atoms is disabled while objects
        // of this class are live.  Do this by requiring a dummy AutoKeepAtoms
        // reference in the constructor.
        //
        // This class is explicitly ignored by the analysis, so don't add any
        // more pointers to GC things here!
        Position(AutoKeepAtoms&) { }
      private:
        Position(const Position&) MOZ_DELETE;
        friend class TokenStream;
        const jschar *buf;
        Flags flags;
        unsigned lineno;
        const jschar *linebase;
        const jschar *prevLinebase;
        Token currentToken;
        unsigned lookahead;
        Token lookaheadTokens[maxLookahead];
    };

    void advance(size_t position);
    void tell(Position *);
    void seek(const Position &pos);
    bool seek(const Position &pos, const TokenStream &other);

    size_t positionToOffset(const Position &pos) const {
        return pos.buf - userbuf.base();
    }

    const jschar *rawBase() const {
        return userbuf.base();
    }

    const jschar *rawLimit() const {
        return userbuf.limit();
    }

    bool hasDisplayURL() const {
        return displayURL_ != nullptr;
    }

    jschar *displayURL() {
        return displayURL_;
    }

    bool hasSourceMapURL() const {
        return sourceMapURL_ != nullptr;
    }

    jschar *sourceMapURL() {
        return sourceMapURL_;
    }

    // If the name at s[0:length] is not a keyword in this version, return
    // true with *ttp unchanged.
    //
    // If it is a reserved word in this version and strictness mode, and thus
    // can't be present in correct code, report a SyntaxError and return false.
    //
    // If it is a keyword, like "if", the behavior depends on ttp.  If ttp is
    // null, report a SyntaxError ("if is a reserved identifier") and return
    // false.  If ttp is non-null, return true with the keyword's TokenKind in
    // *ttp.
    bool checkForKeyword(const jschar *s, size_t length, TokenKind *ttp);
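
    // Illustrative sketch of tell()/seek(): record a Position, scan ahead
    // speculatively, then rewind.  The AutoKeepAtoms argument shown here is an
    // assumption about the caller's context; it only has to keep atoms alive
    // for as long as the Position lives.
    //
    //   {
    //       AutoKeepAtoms keepAtoms(cx->perThreadData);  // illustrative
    //       TokenStream::Position start(keepAtoms);
    //       ts.tell(&start);
    //       // ... speculative scanning ...
    //       ts.seek(start);   // rewind to the recorded position
    //   }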

    // This class maps a userbuf offset (which is 0-indexed) to a line number
    // (which is 1-indexed) and a column index (which is 0-indexed).
    class SourceCoords
    {
        // For a given buffer holding source code, |lineStartOffsets_| has one
        // element per line of source code, plus one sentinel element.  Each
        // non-sentinel element holds the buffer offset for the start of the
        // corresponding line of source code.  For this example script:
        //
        // 1  // xyz            [line starts at offset 0]
        // 2  var x;            [line starts at offset 7]
        // 3                    [line starts at offset 14]
        // 4  var y;            [line starts at offset 15]
        //
        // |lineStartOffsets_| is:
        //
        //   [0, 7, 14, 15, MAX_PTR]
        //
        // To convert a "line number" to a "line index" (i.e. an index into
        // |lineStartOffsets_|), subtract |initialLineNum_|.  E.g. line 3's
        // line index is (3 - initialLineNum_), which is 2.  Therefore
        // lineStartOffsets_[2] holds the buffer offset for the start of line 3,
        // which is 14.  (Note that |initialLineNum_| is often 1, but not
        // always.)
        //
        // The first element is always 0, and the last element is always the
        // MAX_PTR sentinel.
        //
        // offset-to-line/column lookups are O(log n) in the worst case (binary
        // search), but in practice they're heavily clustered and we do better
        // than that by using the previous lookup's result (lastLineIndex_) as
        // a starting point.
        //
        // Checking if an offset lies within a particular line number
        // (isOnThisLine()) is O(1).
        //
        Vector<uint32_t, 128> lineStartOffsets_;
        uint32_t initialLineNum_;

        // This is mutable because it's modified on every search, but that fact
        // isn't visible outside this class.
        mutable uint32_t lastLineIndex_;

        uint32_t lineIndexOf(uint32_t offset) const;

        static const uint32_t MAX_PTR = UINT32_MAX;

        uint32_t lineIndexToNum(uint32_t lineIndex) const { return lineIndex + initialLineNum_; }
        uint32_t lineNumToIndex(uint32_t lineNum) const { return lineNum - initialLineNum_; }

      public:
        SourceCoords(ExclusiveContext *cx, uint32_t ln);

        void add(uint32_t lineNum, uint32_t lineStartOffset);
        bool fill(const SourceCoords &other);

        bool isOnThisLine(uint32_t offset, uint32_t lineNum) const {
            uint32_t lineIndex = lineNumToIndex(lineNum);
            JS_ASSERT(lineIndex + 1 < lineStartOffsets_.length());  // +1 due to sentinel
            return lineStartOffsets_[lineIndex] <= offset &&
                   offset < lineStartOffsets_[lineIndex + 1];
        }

        uint32_t lineNum(uint32_t offset) const;
        uint32_t columnIndex(uint32_t offset) const;
        void lineNumAndColumnIndex(uint32_t offset, uint32_t *lineNum, uint32_t *columnIndex) const;
    };

    SourceCoords srcCoords;
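
    // Worked example (illustrative), using the script in the SourceCoords
    // comment above (line starts at offsets 0, 7, 14 and 15, with
    // initialLineNum_ == 1):
    //
    //   srcCoords.lineNum(9)           // == 2:  offset 9 is inside "var x;"
    //   srcCoords.columnIndex(9)       // == 2:  9 - 7
    //   srcCoords.isOnThisLine(9, 2)   // true:  7 <= 9 < 14
    //   srcCoords.isOnThisLine(14, 3)  // true:  14 <= 14 < 15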

    JSAtomState &names() const {
        return cx->names();
    }

    ExclusiveContext *context() const {
        return cx;
    }

    const ReadOnlyCompileOptions &options() const {
        return options_;
    }

  private:
    // This is the low-level interface to the JS source code buffer.  It just
    // gets raw chars, basically.  TokenStream's functions are layered on top
    // and do some extra stuff like converting all EOL sequences to '\n',
    // tracking the line number, and setting |flags.isEOF|.  (The "raw" in "raw
    // chars" refers to the lack of EOL sequence normalization.)
    class TokenBuf {
      public:
        TokenBuf(ExclusiveContext *cx, const jschar *buf, size_t length)
          : base_(buf), limit_(buf + length), ptr(buf)
        { }

        bool hasRawChars() const {
            return ptr < limit_;
        }

        bool atStart() const {
            return ptr == base_;
        }

        const jschar *base() const {
            return base_;
        }

        const jschar *limit() const {
            return limit_;
        }

        jschar getRawChar() {
            return *ptr++;      // this will nullptr-crash if poisoned
        }

        jschar peekRawChar() const {
            return *ptr;        // this will nullptr-crash if poisoned
        }

        bool matchRawChar(jschar c) {
            if (*ptr == c) {    // this will nullptr-crash if poisoned
                ptr++;
                return true;
            }
            return false;
        }

        bool matchRawCharBackwards(jschar c) {
            JS_ASSERT(ptr);     // make sure it hasn't been poisoned
            if (*(ptr - 1) == c) {
                ptr--;
                return true;
            }
            return false;
        }

        void ungetRawChar() {
            JS_ASSERT(ptr);     // make sure it hasn't been poisoned
            ptr--;
        }

        const jschar *addressOfNextRawChar(bool allowPoisoned = false) const {
            JS_ASSERT_IF(!allowPoisoned, ptr);  // make sure it hasn't been poisoned
            return ptr;
        }

        // Use this with caution!
        void setAddressOfNextRawChar(const jschar *a, bool allowPoisoned = false) {
            JS_ASSERT_IF(!allowPoisoned, a);
            ptr = a;
        }

#ifdef DEBUG
        // Poison the TokenBuf so it cannot be accessed again.
        void poison() {
            ptr = nullptr;
        }
#endif

        static bool isRawEOLChar(int32_t c) {
            return c == '\n' || c == '\r' || c == LINE_SEPARATOR || c == PARA_SEPARATOR;
        }
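
        // Illustrative sketch (not engine code): TokenBuf is deliberately dumb
        // and just hands out raw jschars; EOL folding and line accounting are
        // layered on top in TokenStream::getChar().
        //
        //   TokenBuf tb(cx, chars, length);
        //   while (tb.hasRawChars()) {
        //       jschar c = tb.getRawChar();
        //       if (TokenBuf::isRawEOLChar(c)) {
        //           // getChar() would normalize \r, \r\n, LINE_SEPARATOR and
        //           // PARA_SEPARATOR to '\n' here and bump the line number.
        //       }
        //   }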

        // Finds the next EOL, but stops once 'max' jschars have been scanned
        // (*including* the starting jschar).
        const jschar *findEOLMax(const jschar *p, size_t max);

      private:
        const jschar *base_;            // base of buffer
        const jschar *limit_;           // limit for quick bounds check
        const jschar *ptr;              // next char to get
    };

    TokenKind getTokenInternal(Modifier modifier);

    int32_t getChar();
    int32_t getCharIgnoreEOL();
    void ungetChar(int32_t c);
    void ungetCharIgnoreEOL(int32_t c);
    Token *newToken(ptrdiff_t adjust);
    bool peekUnicodeEscape(int32_t *c);
    bool matchUnicodeEscapeIdStart(int32_t *c);
    bool matchUnicodeEscapeIdent(int32_t *c);
    bool peekChars(int n, jschar *cp);

    bool getDirectives(bool isMultiline, bool shouldWarnDeprecated);
    bool getDirective(bool isMultiline, bool shouldWarnDeprecated,
                      const char *directive, int directiveLength,
                      const char *errorMsgPragma, jschar **destination);
    bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated);
    bool getSourceMappingURL(bool isMultiline, bool shouldWarnDeprecated);

    // |expect| cannot be an EOL char.
    bool matchChar(int32_t expect) {
        MOZ_ASSERT(!TokenBuf::isRawEOLChar(expect));
        return MOZ_LIKELY(userbuf.hasRawChars()) &&
               userbuf.matchRawChar(expect);
    }

    void consumeKnownChar(int32_t expect) {
        mozilla::DebugOnly<int32_t> c = getChar();
        JS_ASSERT(c == expect);
    }

    int32_t peekChar() {
        int32_t c = getChar();
        ungetChar(c);
        return c;
    }

    void skipChars(int n) {
        while (--n >= 0)
            getChar();
    }

    void updateLineInfoForEOL();
    void updateFlagsForEOL();

    // Options used for parsing/tokenizing.
    const ReadOnlyCompileOptions &options_;

    Token               tokens[ntokens];    // circular token buffer
    unsigned            cursor;             // index of last parsed token
    unsigned            lookahead;          // count of lookahead tokens
    unsigned            lineno;             // current line number
    Flags               flags;              // flags -- see above
    const jschar        *linebase;          // start of current line; points into userbuf
    const jschar        *prevLinebase;      // start of previous line; nullptr if on the first line
    TokenBuf            userbuf;            // user input buffer
    const char          *filename;          // input filename or null
    jschar              *displayURL_;       // the user's requested source URL or null
    jschar              *sourceMapURL_;     // source map's filename or null
    CharBuffer          tokenbuf;           // current token string buffer
    bool                maybeEOL[256];      // probabilistic EOL lookup table
    bool                maybeStrSpecial[256];       // speeds up string scanning
    uint8_t             isExprEnding[TOK_LIMIT];    // which tokens definitely terminate exprs?
    ExclusiveContext    *const cx;
    JSPrincipals        *const originPrincipals;
    StrictModeGetter    *strictModeGetter;  // used to test for strict mode
};
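
// Illustrative sketch of driving the scanner (the |cx|, |options|, |chars| and
// |length| values are assumed to come from the embedding; see the parser for
// real call sites):
//
//   TokenStream ts(cx, options, chars, length, /* smg = */ nullptr);
//   for (;;) {
//       TokenKind tt = ts.getToken();
//       if (tt == TOK_ERROR || tt == TOK_EOF)
//           break;
//       // ... dispatch on tt; inspect ts.currentToken() for details ...
//   }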

// Steal one JSREPORT_* bit (see jsapi.h) to tell that arguments to the error
// message have const jschar* type, not const char*.
#define JSREPORT_UC 0x100

} // namespace frontend
} // namespace js

extern JS_FRIEND_API(int)
js_fgets(char *buf, int size, FILE *file);

#ifdef DEBUG
extern const char *
TokenKindToString(js::frontend::TokenKind tt);
#endif

#endif /* frontend_TokenStream_h */