michael@0: // michael@0: // regexcmp.h michael@0: // michael@0: // Copyright (C) 2002-2012, International Business Machines Corporation and others. michael@0: // All Rights Reserved. michael@0: // michael@0: // This file contains declarations for the class RegexCompile michael@0: // michael@0: // This class is internal to the regular expression implementation. michael@0: // For the public Regular Expression API, see the file "unicode/regex.h" michael@0: // michael@0: michael@0: michael@0: #ifndef RBBISCAN_H michael@0: #define RBBISCAN_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: #if !UCONFIG_NO_REGULAR_EXPRESSIONS michael@0: michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/parseerr.h" michael@0: #include "uhash.h" michael@0: #include "uvector.h" michael@0: michael@0: michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: michael@0: //-------------------------------------------------------------------------------- michael@0: // michael@0: // class RegexCompile Contains the regular expression compiler. michael@0: // michael@0: //-------------------------------------------------------------------------------- michael@0: struct RegexTableEl; michael@0: class RegexPattern; michael@0: michael@0: michael@0: class RegexCompile : public UMemory { michael@0: public: michael@0: michael@0: enum { michael@0: kStackSize = 100 // The size of the state stack for michael@0: }; // pattern parsing. Corresponds roughly michael@0: // to the depth of parentheses nesting michael@0: // that is allowed in the rules. michael@0: michael@0: struct RegexPatternChar { michael@0: UChar32 fChar; michael@0: UBool fQuoted; michael@0: }; michael@0: michael@0: RegexCompile(RegexPattern *rp, UErrorCode &e); michael@0: michael@0: void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); michael@0: void compile(UText *pat, UParseError &pp, UErrorCode &e); michael@0: michael@0: michael@0: virtual ~RegexCompile(); michael@0: michael@0: void nextChar(RegexPatternChar &c); // Get the next char from the input stream. michael@0: michael@0: static void cleanup(); // Memory cleanup michael@0: michael@0: michael@0: michael@0: // Categories of parentheses in pattern. michael@0: // The category is saved in the compile-time parentheses stack frame, and michael@0: // determines the code to be generated when the matching close ) is encountered. michael@0: enum EParenClass { michael@0: plain = -1, // No special handling michael@0: capturing = -2, michael@0: atomic = -3, michael@0: lookAhead = -4, michael@0: negLookAhead = -5, michael@0: flags = -6, michael@0: lookBehind = -7, michael@0: lookBehindN = -8 michael@0: }; michael@0: michael@0: private: michael@0: michael@0: michael@0: UBool doParseActions(int32_t a); michael@0: void error(UErrorCode e); // error reporting convenience function. michael@0: michael@0: UChar32 nextCharLL(); michael@0: UChar32 peekCharLL(); michael@0: UnicodeSet *scanProp(); michael@0: UnicodeSet *scanPosixProp(); michael@0: void handleCloseParen(); michael@0: int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern michael@0: // at the top of the just completed block michael@0: // or operation, and optionally ensure that michael@0: // there is space to add an opcode there. michael@0: void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for michael@0: // a reference to a UnicodeSet. michael@0: void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. michael@0: int32_t LoopOp); michael@0: UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier michael@0: void literalChar(UChar32 c); // Compile a literal char michael@0: void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters. michael@0: void insertOp(int32_t where); // Open up a slot for a new op in the michael@0: // generated code at the specified location. michael@0: int32_t minMatchLength(int32_t start, michael@0: int32_t end); michael@0: int32_t maxMatchLength(int32_t start, michael@0: int32_t end); michael@0: void matchStartType(); michael@0: void stripNOPs(); michael@0: michael@0: void setEval(int32_t op); michael@0: void setPushOp(int32_t op); michael@0: UChar32 scanNamedChar(); michael@0: UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); michael@0: michael@0: michael@0: UErrorCode *fStatus; michael@0: RegexPattern *fRXPat; michael@0: UParseError *fParseErr; michael@0: michael@0: // michael@0: // Data associated with low level character scanning michael@0: // michael@0: int64_t fScanIndex; // Index of current character being processed michael@0: // in the rule input string. michael@0: UBool fQuoteMode; // Scan is in a \Q...\E quoted region michael@0: UBool fInBackslashQuote; // Scan is between a '\' and the following char. michael@0: UBool fEOLComments; // When scan is just after '(?', inhibit #... to michael@0: // end of line comments, in favor of (?#...) comments. michael@0: int64_t fLineNum; // Line number in input file. michael@0: int64_t fCharNum; // Char position within the line. michael@0: UChar32 fLastChar; // Previous char, needed to count CR-LF michael@0: // as a single line, not two. michael@0: UChar32 fPeekChar; // Saved char, if we've scanned ahead. michael@0: michael@0: michael@0: RegexPatternChar fC; // Current char for parse state machine michael@0: // processing. michael@0: michael@0: // michael@0: // Data for the state machine that parses the regular expression. michael@0: // michael@0: RegexTableEl **fStateTable; // State Transition Table for regex Rule michael@0: // parsing. index by p[state][char-class] michael@0: michael@0: uint16_t fStack[kStackSize]; // State stack, holds state pushes michael@0: int32_t fStackPtr; // and pops as specified in the state michael@0: // transition rules. michael@0: michael@0: // michael@0: // Data associated with the generation of the pcode for the match engine michael@0: // michael@0: int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) michael@0: // Always has high bit (31) set so that flag values michael@0: // on the paren stack are distinguished from relocatable michael@0: // pcode addresses. michael@0: int32_t fNewModeFlags; // New flags, while compiling (?i, holds state michael@0: // until last flag is scanned. michael@0: UBool fSetModeFlag; // true for (?ismx, false for (?-ismx michael@0: michael@0: UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. michael@0: // Once completed, meaning that some non-literal pattern michael@0: // construct is encountered, the appropriate opcodes michael@0: // to match the literal will be generated, and this michael@0: // string will be cleared. michael@0: michael@0: int64_t fPatternLength; // Length of the input pattern string. michael@0: michael@0: UVector32 fParenStack; // parentheses stack. Each frame consists of michael@0: // the positions of compiled pattern operations michael@0: // needing fixup, followed by negative value. The michael@0: // first entry in each frame is the position of the michael@0: // spot reserved for use when a quantifier michael@0: // needs to add a SAVE at the start of a (block) michael@0: // The negative value (-1, -2,...) indicates michael@0: // the kind of paren that opened the frame. Some michael@0: // need special handling on close. michael@0: michael@0: michael@0: int32_t fMatchOpenParen; // The position in the compiled pattern michael@0: // of the slot reserved for a state save michael@0: // at the start of the most recently processed michael@0: // parenthesized block. michael@0: int32_t fMatchCloseParen; // The position in the pattern of the first michael@0: // location after the most recently processed michael@0: // parenthesized block. michael@0: michael@0: int32_t fIntervalLow; // {lower, upper} interval quantifier values. michael@0: int32_t fIntervalUpper; // Placed here temporarily, when pattern is michael@0: // initially scanned. Each new interval michael@0: // encountered overwrites these values. michael@0: // -1 for the upper interval value means none michael@0: // was specified (unlimited occurences.) michael@0: michael@0: int64_t fNameStartPos; // Starting position of a \N{NAME} name in a michael@0: // pattern, valid while remainder of name is michael@0: // scanned. michael@0: michael@0: UStack fSetStack; // Stack of UnicodeSets, used while evaluating michael@0: // (at compile time) set expressions within michael@0: // the pattern. michael@0: UStack fSetOpStack; // Stack of pending set operators (&&, --, union) michael@0: michael@0: UChar32 fLastSetLiteral; // The last single code point added to a set. michael@0: // needed when "-y" is scanned, and we need michael@0: // to turn "x-y" into a range. michael@0: }; michael@0: michael@0: // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] michael@0: // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. michael@0: michael@0: enum SetOperations { michael@0: setStart = 0 << 16 | 1, michael@0: setEnd = 1 << 16 | 2, michael@0: setNegation = 2 << 16 | 3, michael@0: setCaseClose = 2 << 16 | 9, michael@0: setDifference2 = 3 << 16 | 4, // '--' set difference operator michael@0: setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator michael@0: setUnion = 4 << 16 | 6, // implicit union of adjacent items michael@0: setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. michael@0: setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS michael@0: #endif // RBBISCAN_H