Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | // |
michael@0 | 2 | // regexcmp.h |
michael@0 | 3 | // |
michael@0 | 4 | // Copyright (C) 2002-2012, International Business Machines Corporation and others. |
michael@0 | 5 | // All Rights Reserved. |
michael@0 | 6 | // |
michael@0 | 7 | // This file contains declarations for the class RegexCompile |
michael@0 | 8 | // |
michael@0 | 9 | // This class is internal to the regular expression implementation. |
michael@0 | 10 | // For the public Regular Expression API, see the file "unicode/regex.h" |
michael@0 | 11 | // |
michael@0 | 12 | |
michael@0 | 13 | |
michael@0 | 14 | #ifndef RBBISCAN_H |
michael@0 | 15 | #define RBBISCAN_H |
michael@0 | 16 | |
michael@0 | 17 | #include "unicode/utypes.h" |
michael@0 | 18 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
michael@0 | 19 | |
michael@0 | 20 | #include "unicode/uobject.h" |
michael@0 | 21 | #include "unicode/uniset.h" |
michael@0 | 22 | #include "unicode/parseerr.h" |
michael@0 | 23 | #include "uhash.h" |
michael@0 | 24 | #include "uvector.h" |
michael@0 | 25 | |
michael@0 | 26 | |
michael@0 | 27 | |
michael@0 | 28 | U_NAMESPACE_BEGIN |
michael@0 | 29 | |
michael@0 | 30 | |
michael@0 | 31 | //-------------------------------------------------------------------------------- |
michael@0 | 32 | // |
michael@0 | 33 | // class RegexCompile Contains the regular expression compiler. |
michael@0 | 34 | // |
michael@0 | 35 | //-------------------------------------------------------------------------------- |
michael@0 | 36 | struct RegexTableEl; |
michael@0 | 37 | class RegexPattern; |
michael@0 | 38 | |
michael@0 | 39 | |
michael@0 | 40 | class RegexCompile : public UMemory { |
michael@0 | 41 | public: |
michael@0 | 42 | |
michael@0 | 43 | enum { |
michael@0 | 44 | kStackSize = 100 // The size of the state stack for |
michael@0 | 45 | }; // pattern parsing. Corresponds roughly |
michael@0 | 46 | // to the depth of parentheses nesting |
michael@0 | 47 | // that is allowed in a pattern. |
michael@0 | 48 | |
michael@0 | 49 | struct RegexPatternChar { |
michael@0 | 50 | UChar32 fChar; |
michael@0 | 51 | UBool fQuoted; |
michael@0 | 52 | }; |
michael@0 | 53 | |
michael@0 | 54 | RegexCompile(RegexPattern *rp, UErrorCode &e); |
michael@0 | 55 | |
michael@0 | 56 | void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); |
michael@0 | 57 | void compile(UText *pat, UParseError &pp, UErrorCode &e); |
michael@0 | 58 | |
michael@0 | 59 | |
michael@0 | 60 | virtual ~RegexCompile(); |
michael@0 | 61 | |
michael@0 | 62 | void nextChar(RegexPatternChar &c); // Get the next char from the input stream. |
michael@0 | 63 | |
michael@0 | 64 | static void cleanup(); // Memory cleanup |
michael@0 | 65 | |
michael@0 | 66 | |
michael@0 | 67 | |
michael@0 | 68 | // Categories of parentheses in pattern. |
michael@0 | 69 | // The category is saved in the compile-time parentheses stack frame, and |
michael@0 | 70 | // determines the code to be generated when the matching close ) is encountered. |
michael@0 | 71 | enum EParenClass { |
michael@0 | 72 | plain = -1, // No special handling |
michael@0 | 73 | capturing = -2, |
michael@0 | 74 | atomic = -3, |
michael@0 | 75 | lookAhead = -4, |
michael@0 | 76 | negLookAhead = -5, |
michael@0 | 77 | flags = -6, |
michael@0 | 78 | lookBehind = -7, |
michael@0 | 79 | lookBehindN = -8 |
michael@0 | 80 | }; |
michael@0 | 81 | |
michael@0 | 82 | private: |
michael@0 | 83 | |
michael@0 | 84 | |
michael@0 | 85 | UBool doParseActions(int32_t a); |
michael@0 | 86 | void error(UErrorCode e); // error reporting convenience function. |
michael@0 | 87 | |
michael@0 | 88 | UChar32 nextCharLL(); |
michael@0 | 89 | UChar32 peekCharLL(); |
michael@0 | 90 | UnicodeSet *scanProp(); |
michael@0 | 91 | UnicodeSet *scanPosixProp(); |
michael@0 | 92 | void handleCloseParen(); |
michael@0 | 93 | int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern |
michael@0 | 94 | // at the top of the just completed block |
michael@0 | 95 | // or operation, and optionally ensure that |
michael@0 | 96 | // there is space to add an opcode there. |
michael@0 | 97 | void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for |
michael@0 | 98 | // a reference to a UnicodeSet. |
michael@0 | 99 | void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. |
michael@0 | 100 | int32_t LoopOp); |
michael@0 | 101 | UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier |
michael@0 | 102 | void literalChar(UChar32 c); // Compile a literal char |
michael@0 | 103 | void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters. |
michael@0 | 104 | void insertOp(int32_t where); // Open up a slot for a new op in the |
michael@0 | 105 | // generated code at the specified location. |
michael@0 | 106 | int32_t minMatchLength(int32_t start, |
michael@0 | 107 | int32_t end); |
michael@0 | 108 | int32_t maxMatchLength(int32_t start, |
michael@0 | 109 | int32_t end); |
michael@0 | 110 | void matchStartType(); |
michael@0 | 111 | void stripNOPs(); |
michael@0 | 112 | |
michael@0 | 113 | void setEval(int32_t op); |
michael@0 | 114 | void setPushOp(int32_t op); |
michael@0 | 115 | UChar32 scanNamedChar(); |
michael@0 | 116 | UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); |
michael@0 | 117 | |
michael@0 | 118 | |
michael@0 | 119 | UErrorCode *fStatus; |
michael@0 | 120 | RegexPattern *fRXPat; |
michael@0 | 121 | UParseError *fParseErr; |
michael@0 | 122 | |
michael@0 | 123 | // |
michael@0 | 124 | // Data associated with low level character scanning |
michael@0 | 125 | // |
michael@0 | 126 | int64_t fScanIndex; // Index of current character being processed |
michael@0 | 127 | // in the pattern input string. |
michael@0 | 128 | UBool fQuoteMode; // Scan is in a \Q...\E quoted region |
michael@0 | 129 | UBool fInBackslashQuote; // Scan is between a '\' and the following char. |
michael@0 | 130 | UBool fEOLComments; // When scan is just after '(?', inhibit #... to |
michael@0 | 131 | // end of line comments, in favor of (?#...) comments. |
michael@0 | 132 | int64_t fLineNum; // Line number in input file. |
michael@0 | 133 | int64_t fCharNum; // Char position within the line. |
michael@0 | 134 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
michael@0 | 135 | // as a single line, not two. |
michael@0 | 136 | UChar32 fPeekChar; // Saved char, if we've scanned ahead. |
michael@0 | 137 | |
michael@0 | 138 | |
michael@0 | 139 | RegexPatternChar fC; // Current char for parse state machine |
michael@0 | 140 | // processing. |
michael@0 | 141 | |
michael@0 | 142 | // |
michael@0 | 143 | // Data for the state machine that parses the regular expression. |
michael@0 | 144 | // |
michael@0 | 145 | RegexTableEl **fStateTable; // State transition table for regex pattern |
michael@0 | 146 | // parsing. Indexed by p[state][char-class] |
michael@0 | 147 | |
michael@0 | 148 | uint16_t fStack[kStackSize]; // State stack, holds state pushes |
michael@0 | 149 | int32_t fStackPtr; // and pops as specified in the state |
michael@0 | 150 | // transition rules. |
michael@0 | 151 | |
michael@0 | 152 | // |
michael@0 | 153 | // Data associated with the generation of the pcode for the match engine |
michael@0 | 154 | // |
michael@0 | 155 | int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) |
michael@0 | 156 | // Always has high bit (31) set so that flag values |
michael@0 | 157 | // on the paren stack are distinguished from relocatable |
michael@0 | 158 | // pcode addresses. |
michael@0 | 159 | int32_t fNewModeFlags; // New flags, while compiling (?i, holds state |
michael@0 | 160 | // until last flag is scanned. |
michael@0 | 161 | UBool fSetModeFlag; // true for (?ismx, false for (?-ismx |
michael@0 | 162 | |
michael@0 | 163 | UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. |
michael@0 | 164 | // Once completed, meaning that some non-literal pattern |
michael@0 | 165 | // construct is encountered, the appropriate opcodes |
michael@0 | 166 | // to match the literal will be generated, and this |
michael@0 | 167 | // string will be cleared. |
michael@0 | 168 | |
michael@0 | 169 | int64_t fPatternLength; // Length of the input pattern string. |
michael@0 | 170 | |
michael@0 | 171 | UVector32 fParenStack; // parentheses stack. Each frame consists of |
michael@0 | 172 | // the positions of compiled pattern operations |
michael@0 | 173 | // needing fixup, followed by a negative value. The |
michael@0 | 174 | // first entry in each frame is the position of the |
michael@0 | 175 | // spot reserved for use when a quantifier |
michael@0 | 176 | // needs to add a SAVE at the start of a (block) |
michael@0 | 177 | // The negative value (-1, -2,...) indicates |
michael@0 | 178 | // the kind of paren that opened the frame. Some |
michael@0 | 179 | // need special handling on close. |
michael@0 | 180 | |
michael@0 | 181 | |
michael@0 | 182 | int32_t fMatchOpenParen; // The position in the compiled pattern |
michael@0 | 183 | // of the slot reserved for a state save |
michael@0 | 184 | // at the start of the most recently processed |
michael@0 | 185 | // parenthesized block. |
michael@0 | 186 | int32_t fMatchCloseParen; // The position in the pattern of the first |
michael@0 | 187 | // location after the most recently processed |
michael@0 | 188 | // parenthesized block. |
michael@0 | 189 | |
michael@0 | 190 | int32_t fIntervalLow; // {lower, upper} interval quantifier values. |
michael@0 | 191 | int32_t fIntervalUpper; // Placed here temporarily, when pattern is |
michael@0 | 192 | // initially scanned. Each new interval |
michael@0 | 193 | // encountered overwrites these values. |
michael@0 | 194 | // -1 for the upper interval value means none |
michael@0 | 195 | // was specified (unlimited occurrences.) |
michael@0 | 196 | |
michael@0 | 197 | int64_t fNameStartPos; // Starting position of a \N{NAME} name in a |
michael@0 | 198 | // pattern, valid while remainder of name is |
michael@0 | 199 | // scanned. |
michael@0 | 200 | |
michael@0 | 201 | UStack fSetStack; // Stack of UnicodeSets, used while evaluating |
michael@0 | 202 | // (at compile time) set expressions within |
michael@0 | 203 | // the pattern. |
michael@0 | 204 | UStack fSetOpStack; // Stack of pending set operators (&&, --, union) |
michael@0 | 205 | |
michael@0 | 206 | UChar32 fLastSetLiteral; // The last single code point added to a set. |
michael@0 | 207 | // Needed when "-y" is scanned, and we need |
michael@0 | 208 | // to turn "x-y" into a range. |
michael@0 | 209 | }; |
michael@0 | 210 | |
michael@0 | 211 | // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]. |
michael@0 | 212 | // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. |
michael@0 | 213 | |
michael@0 | 214 | enum SetOperations { |
michael@0 | 215 | setStart = 0 << 16 | 1, |
michael@0 | 216 | setEnd = 1 << 16 | 2, |
michael@0 | 217 | setNegation = 2 << 16 | 3, |
michael@0 | 218 | setCaseClose = 2 << 16 | 9, |
michael@0 | 219 | setDifference2 = 3 << 16 | 4, // '--' set difference operator |
michael@0 | 220 | setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator |
michael@0 | 221 | setUnion = 4 << 16 | 6, // implicit union of adjacent items |
michael@0 | 222 | setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. |
michael@0 | 223 | setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. |
michael@0 | 224 | }; |
michael@0 | 225 | |
michael@0 | 226 | U_NAMESPACE_END |
michael@0 | 227 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
michael@0 | 228 | #endif // RBBISCAN_H |