intl/icu/source/i18n/regexcmp.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 //
michael@0 2 // regexcmp.h
michael@0 3 //
michael@0 4 // Copyright (C) 2002-2012, International Business Machines Corporation and others.
michael@0 5 // All Rights Reserved.
michael@0 6 //
michael@0 7 // This file contains declarations for the class RegexCompile
michael@0 8 //
michael@0 9 // This class is internal to the regular expression implementation.
michael@0 10 // For the public Regular Expression API, see the file "unicode/regex.h"
michael@0 11 //
michael@0 12
michael@0 13
michael@0 14 #ifndef RBBISCAN_H
michael@0 15 #define RBBISCAN_H
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 19
michael@0 20 #include "unicode/uobject.h"
michael@0 21 #include "unicode/uniset.h"
michael@0 22 #include "unicode/parseerr.h"
michael@0 23 #include "uhash.h"
michael@0 24 #include "uvector.h"
michael@0 25
michael@0 26
michael@0 27
michael@0 28 U_NAMESPACE_BEGIN
michael@0 29
michael@0 30
michael@0 31 //--------------------------------------------------------------------------------
michael@0 32 //
michael@0 33 // class RegexCompile Contains the regular expression compiler.
michael@0 34 //
michael@0 35 //--------------------------------------------------------------------------------
michael@0 36 struct RegexTableEl; // Forward declaration; this header only uses it through a pointer (fStateTable).
michael@0 37 class RegexPattern; // Forward declaration; this header only uses it through pointers (constructor argument, fRXPat).
michael@0 38
michael@0 39
michael@0 40 class RegexCompile : public UMemory { // The regular expression compiler; internal to the regex implementation (the public API is in "unicode/regex.h").
michael@0 41 public:
michael@0 42
michael@0 43 enum {
michael@0 44 kStackSize = 100 // The size of the state stack for
michael@0 45 }; // pattern parsing. Corresponds roughly
michael@0 46 // to the depth of parentheses nesting
michael@0 47 // that is allowed in the rules.
michael@0 48
michael@0 49 struct RegexPatternChar { // One scanned pattern character; filled in by nextChar() and held in fC.
michael@0 50 UChar32 fChar; // The character itself.
michael@0 51 UBool fQuoted; // NOTE(review): presumably set when the char was escaped or quoted; confirm in regexcmp.cpp.
michael@0 52 };
michael@0 53
michael@0 54 RegexCompile(RegexPattern *rp, UErrorCode &e); // NOTE(review): rp is presumably the pattern object that receives the compiled result; confirm.
michael@0 55
michael@0 56 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); // Compile a pattern supplied as a UnicodeString.
michael@0 57 void compile(UText *pat, UParseError &pp, UErrorCode &e); // Overload taking the pattern as a UText.
michael@0 58
michael@0 59
michael@0 60 virtual ~RegexCompile();
michael@0 61
michael@0 62 void nextChar(RegexPatternChar &c); // Get the next char from the input stream.
michael@0 63
michael@0 64 static void cleanup(); // Memory cleanup
michael@0 65
michael@0 66
michael@0 67
michael@0 68 // Categories of parentheses in pattern.
michael@0 69 // The category is saved in the compile-time parentheses stack frame, and
michael@0 70 // determines the code to be generated when the matching close ) is encountered.
michael@0 71 enum EParenClass { // All values are negative; on fParenStack a negative entry marks the kind of paren that opened a frame.
michael@0 72 plain = -1, // No special handling
michael@0 73 capturing = -2,
michael@0 74 atomic = -3,
michael@0 75 lookAhead = -4,
michael@0 76 negLookAhead = -5,
michael@0 77 flags = -6,
michael@0 78 lookBehind = -7,
michael@0 79 lookBehindN = -8
michael@0 80 };
michael@0 81
michael@0 82 private:
michael@0 83
michael@0 84
michael@0 85 UBool doParseActions(int32_t a); // NOTE(review): presumably performs parse action number 'a' for the state machine; the meaning of the return value is not visible here.
michael@0 86 void error(UErrorCode e); // error reporting convenience function.
michael@0 87
michael@0 88 UChar32 nextCharLL(); // NOTE(review): presumably the low-level fetch underneath nextChar(); confirm.
michael@0 89 UChar32 peekCharLL(); // NOTE(review): presumably looks ahead without consuming (see fPeekChar); confirm.
michael@0 90 UnicodeSet *scanProp();
michael@0 91 UnicodeSet *scanPosixProp();
michael@0 92 void handleCloseParen(); // NOTE(review): presumably generates the code for a closing ')' according to its EParenClass; confirm.
michael@0 93 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
michael@0 94 // at the top of the just completed block
michael@0 95 // or operation, and optionally ensure that
michael@0 96 // there is space to add an opcode there.
michael@0 97 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for
michael@0 98 // a reference to a UnicodeSet.
michael@0 99 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier.
michael@0 100 int32_t LoopOp);
michael@0 101 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier
michael@0 102 void literalChar(UChar32 c); // Compile a literal char
michael@0 103 void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters.
michael@0 104 void insertOp(int32_t where); // Open up a slot for a new op in the
michael@0 105 // generated code at the specified location.
michael@0 106 int32_t minMatchLength(int32_t start, // NOTE(review): presumably the minimum match length of the compiled ops from start to end; confirm.
michael@0 107 int32_t end);
michael@0 108 int32_t maxMatchLength(int32_t start, // NOTE(review): presumably the maximum match length of the compiled ops from start to end; confirm.
michael@0 109 int32_t end);
michael@0 110 void matchStartType();
michael@0 111 void stripNOPs();
michael@0 112
michael@0 113 void setEval(int32_t op);
michael@0 114 void setPushOp(int32_t op);
michael@0 115 UChar32 scanNamedChar(); // NOTE(review): presumably scans a \N{NAME} named character (see fNameStartPos); confirm.
michael@0 116 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
michael@0 117
michael@0 118
michael@0 119 UErrorCode *fStatus;
michael@0 120 RegexPattern *fRXPat;
michael@0 121 UParseError *fParseErr;
michael@0 122
michael@0 123 //
michael@0 124 // Data associated with low level character scanning
michael@0 125 //
michael@0 126 int64_t fScanIndex; // Index of current character being processed
michael@0 127 // in the rule input string.
michael@0 128 UBool fQuoteMode; // Scan is in a \Q...\E quoted region
michael@0 129 UBool fInBackslashQuote; // Scan is between a '\' and the following char.
michael@0 130 UBool fEOLComments; // When scan is just after '(?', inhibit #... to
michael@0 131 // end of line comments, in favor of (?#...) comments.
michael@0 132 int64_t fLineNum; // Line number in input file.
michael@0 133 int64_t fCharNum; // Char position within the line.
michael@0 134 UChar32 fLastChar; // Previous char, needed to count CR-LF
michael@0 135 // as a single line, not two.
michael@0 136 UChar32 fPeekChar; // Saved char, if we've scanned ahead.
michael@0 137
michael@0 138
michael@0 139 RegexPatternChar fC; // Current char for parse state machine
michael@0 140 // processing.
michael@0 141
michael@0 142 //
michael@0 143 // Data for the state machine that parses the regular expression.
michael@0 144 //
michael@0 145 RegexTableEl **fStateTable; // State Transition Table for regex Rule
michael@0 146 // parsing. index by p[state][char-class]
michael@0 147
michael@0 148 uint16_t fStack[kStackSize]; // State stack, holds state pushes
michael@0 149 int32_t fStackPtr; // and pops as specified in the state
michael@0 150 // transition rules.
michael@0 151
michael@0 152 //
michael@0 153 // Data associated with the generation of the pcode for the match engine
michael@0 154 //
michael@0 155 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.)
michael@0 156 // Always has high bit (31) set so that flag values
michael@0 157 // on the paren stack are distinguished from relocatable
michael@0 158 // pcode addresses.
michael@0 159 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state
michael@0 160 // until last flag is scanned.
michael@0 161 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx
michael@0 162
michael@0 163 UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here.
michael@0 164 // Once completed, meaning that some non-literal pattern
michael@0 165 // construct is encountered, the appropriate opcodes
michael@0 166 // to match the literal will be generated, and this
michael@0 167 // string will be cleared.
michael@0 168
michael@0 169 int64_t fPatternLength; // Length of the input pattern string.
michael@0 170
michael@0 171 UVector32 fParenStack; // parentheses stack. Each frame consists of
michael@0 172 // the positions of compiled pattern operations
michael@0 173 // needing fixup, followed by negative value. The
michael@0 174 // first entry in each frame is the position of the
michael@0 175 // spot reserved for use when a quantifier
michael@0 176 // needs to add a SAVE at the start of a (block)
michael@0 177 // The negative value (-1, -2,...) indicates
michael@0 178 // the kind of paren that opened the frame. Some
michael@0 179 // need special handling on close.
michael@0 180
michael@0 181
michael@0 182 int32_t fMatchOpenParen; // The position in the compiled pattern
michael@0 183 // of the slot reserved for a state save
michael@0 184 // at the start of the most recently processed
michael@0 185 // parenthesized block.
michael@0 186 int32_t fMatchCloseParen; // The position in the pattern of the first
michael@0 187 // location after the most recently processed
michael@0 188 // parenthesized block.
michael@0 189
michael@0 190 int32_t fIntervalLow; // {lower, upper} interval quantifier values.
michael@0 191 int32_t fIntervalUpper; // Placed here temporarily, when pattern is
michael@0 192 // initially scanned. Each new interval
michael@0 193 // encountered overwrites these values.
michael@0 194 // -1 for the upper interval value means none
michael@0 195 // was specified (unlimited occurrences.)
michael@0 196
michael@0 197 int64_t fNameStartPos; // Starting position of a \N{NAME} name in a
michael@0 198 // pattern, valid while remainder of name is
michael@0 199 // scanned.
michael@0 200
michael@0 201 UStack fSetStack; // Stack of UnicodeSets, used while evaluating
michael@0 202 // (at compile time) set expressions within
michael@0 203 // the pattern.
michael@0 204 UStack fSetOpStack; // Stack of pending set operators (&&, --, union)
michael@0 205
michael@0 206 UChar32 fLastSetLiteral; // The last single code point added to a set.
michael@0 207 // needed when "-y" is scanned, and we need
michael@0 208 // to turn "x-y" into a range.
michael@0 209 };
michael@0 210
michael@0 211 // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
michael@0 212 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
michael@0 213
michael@0 214 enum SetOperations { // Each value is (precedence << 16) | operation code.
michael@0 215 setStart = 0 << 16 | 1, // precedence 0, op 1
michael@0 216 setEnd = 1 << 16 | 2, // precedence 1, op 2
michael@0 217 setNegation = 2 << 16 | 3, // precedence 2, op 3
michael@0 218 setCaseClose = 2 << 16 | 9, // precedence 2, op 9 (same precedence as setNegation)
michael@0 219 setDifference2 = 3 << 16 | 4, // '--' set difference operator (precedence 3, op 4)
michael@0 220 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator (precedence 3, op 5)
michael@0 221 setUnion = 4 << 16 | 6, // implicit union of adjacent items (precedence 4, op 6)
michael@0 222 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. (precedence 4, op 7)
michael@0 223 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. (precedence 4, op 8)
michael@0 224 };
michael@0 225
michael@0 226 U_NAMESPACE_END
michael@0 227 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
michael@0 228 #endif // RBBISCAN_H

mercurial