//
//  regexcmp.h
//
//  Copyright (C) 2002-2012, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains declarations for the class RegexCompile
//
//  This class is internal to the regular expression implementation.
//  For the public Regular Expression API, see the file "unicode/regex.h"
//
|
12 |
|
13 |
|
14 #ifndef RBBISCAN_H |
|
15 #define RBBISCAN_H |
|
16 |
|
17 #include "unicode/utypes.h" |
|
18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
|
19 |
|
20 #include "unicode/uobject.h" |
|
21 #include "unicode/uniset.h" |
|
22 #include "unicode/parseerr.h" |
|
23 #include "uhash.h" |
|
24 #include "uvector.h" |
|
25 |
|
26 |
|
27 |
|
28 U_NAMESPACE_BEGIN |
|
29 |
|
30 |
|
31 //-------------------------------------------------------------------------------- |
|
32 // |
|
33 // class RegexCompile Contains the regular expression compiler. |
|
34 // |
|
35 //-------------------------------------------------------------------------------- |
|
36 struct RegexTableEl; |
|
37 class RegexPattern; |
|
38 |
|
39 |
|
40 class RegexCompile : public UMemory { |
|
41 public: |
|
42 |
|
43 enum { |
|
44 kStackSize = 100 // The size of the state stack for |
|
45 }; // pattern parsing. Corresponds roughly |
|
46 // to the depth of parentheses nesting |
|
47 // that is allowed in the rules. |
|
48 |
|
49 struct RegexPatternChar { |
|
50 UChar32 fChar; |
|
51 UBool fQuoted; |
|
52 }; |
|
53 |
|
54 RegexCompile(RegexPattern *rp, UErrorCode &e); |
|
55 |
|
56 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); |
|
57 void compile(UText *pat, UParseError &pp, UErrorCode &e); |
|
58 |
|
59 |
|
60 virtual ~RegexCompile(); |
|
61 |
|
62 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. |
|
63 |
|
64 static void cleanup(); // Memory cleanup |
|
65 |
|
66 |
|
67 |
|
68 // Categories of parentheses in pattern. |
|
69 // The category is saved in the compile-time parentheses stack frame, and |
|
70 // determines the code to be generated when the matching close ) is encountered. |
|
71 enum EParenClass { |
|
72 plain = -1, // No special handling |
|
73 capturing = -2, |
|
74 atomic = -3, |
|
75 lookAhead = -4, |
|
76 negLookAhead = -5, |
|
77 flags = -6, |
|
78 lookBehind = -7, |
|
79 lookBehindN = -8 |
|
80 }; |
|
81 |
|
82 private: |
|
83 |
|
84 |
|
85 UBool doParseActions(int32_t a); |
|
86 void error(UErrorCode e); // error reporting convenience function. |
|
87 |
|
88 UChar32 nextCharLL(); |
|
89 UChar32 peekCharLL(); |
|
90 UnicodeSet *scanProp(); |
|
91 UnicodeSet *scanPosixProp(); |
|
92 void handleCloseParen(); |
|
93 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern |
|
94 // at the top of the just completed block |
|
95 // or operation, and optionally ensure that |
|
96 // there is space to add an opcode there. |
|
97 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for |
|
98 // a reference to a UnicodeSet. |
|
99 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. |
|
100 int32_t LoopOp); |
|
101 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier |
|
102 void literalChar(UChar32 c); // Compile a literal char |
|
103 void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters. |
|
104 void insertOp(int32_t where); // Open up a slot for a new op in the |
|
105 // generated code at the specified location. |
|
106 int32_t minMatchLength(int32_t start, |
|
107 int32_t end); |
|
108 int32_t maxMatchLength(int32_t start, |
|
109 int32_t end); |
|
110 void matchStartType(); |
|
111 void stripNOPs(); |
|
112 |
|
113 void setEval(int32_t op); |
|
114 void setPushOp(int32_t op); |
|
115 UChar32 scanNamedChar(); |
|
116 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); |
|
117 |
|
118 |
|
119 UErrorCode *fStatus; |
|
120 RegexPattern *fRXPat; |
|
121 UParseError *fParseErr; |
|
122 |
|
123 // |
|
124 // Data associated with low level character scanning |
|
125 // |
|
126 int64_t fScanIndex; // Index of current character being processed |
|
127 // in the rule input string. |
|
128 UBool fQuoteMode; // Scan is in a \Q...\E quoted region |
|
129 UBool fInBackslashQuote; // Scan is between a '\' and the following char. |
|
130 UBool fEOLComments; // When scan is just after '(?', inhibit #... to |
|
131 // end of line comments, in favor of (?#...) comments. |
|
132 int64_t fLineNum; // Line number in input file. |
|
133 int64_t fCharNum; // Char position within the line. |
|
134 UChar32 fLastChar; // Previous char, needed to count CR-LF |
|
135 // as a single line, not two. |
|
136 UChar32 fPeekChar; // Saved char, if we've scanned ahead. |
|
137 |
|
138 |
|
139 RegexPatternChar fC; // Current char for parse state machine |
|
140 // processing. |
|
141 |
|
142 // |
|
143 // Data for the state machine that parses the regular expression. |
|
144 // |
|
145 RegexTableEl **fStateTable; // State Transition Table for regex Rule |
|
146 // parsing. index by p[state][char-class] |
|
147 |
|
148 uint16_t fStack[kStackSize]; // State stack, holds state pushes |
|
149 int32_t fStackPtr; // and pops as specified in the state |
|
150 // transition rules. |
|
151 |
|
152 // |
|
153 // Data associated with the generation of the pcode for the match engine |
|
154 // |
|
155 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) |
|
156 // Always has high bit (31) set so that flag values |
|
157 // on the paren stack are distinguished from relocatable |
|
158 // pcode addresses. |
|
159 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state |
|
160 // until last flag is scanned. |
|
161 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx |
|
162 |
|
163 UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. |
|
164 // Once completed, meaning that some non-literal pattern |
|
165 // construct is encountered, the appropriate opcodes |
|
166 // to match the literal will be generated, and this |
|
167 // string will be cleared. |
|
168 |
|
169 int64_t fPatternLength; // Length of the input pattern string. |
|
170 |
|
171 UVector32 fParenStack; // parentheses stack. Each frame consists of |
|
172 // the positions of compiled pattern operations |
|
173 // needing fixup, followed by negative value. The |
|
174 // first entry in each frame is the position of the |
|
175 // spot reserved for use when a quantifier |
|
176 // needs to add a SAVE at the start of a (block) |
|
177 // The negative value (-1, -2,...) indicates |
|
178 // the kind of paren that opened the frame. Some |
|
179 // need special handling on close. |
|
180 |
|
181 |
|
182 int32_t fMatchOpenParen; // The position in the compiled pattern |
|
183 // of the slot reserved for a state save |
|
184 // at the start of the most recently processed |
|
185 // parenthesized block. |
|
186 int32_t fMatchCloseParen; // The position in the pattern of the first |
|
187 // location after the most recently processed |
|
188 // parenthesized block. |
|
189 |
|
190 int32_t fIntervalLow; // {lower, upper} interval quantifier values. |
|
191 int32_t fIntervalUpper; // Placed here temporarily, when pattern is |
|
192 // initially scanned. Each new interval |
|
193 // encountered overwrites these values. |
|
194 // -1 for the upper interval value means none |
|
195 // was specified (unlimited occurences.) |
|
196 |
|
197 int64_t fNameStartPos; // Starting position of a \N{NAME} name in a |
|
198 // pattern, valid while remainder of name is |
|
199 // scanned. |
|
200 |
|
201 UStack fSetStack; // Stack of UnicodeSets, used while evaluating |
|
202 // (at compile time) set expressions within |
|
203 // the pattern. |
|
204 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) |
|
205 |
|
206 UChar32 fLastSetLiteral; // The last single code point added to a set. |
|
207 // needed when "-y" is scanned, and we need |
|
208 // to turn "x-y" into a range. |
|
209 }; |
|
210 |
|
// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions].
//   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
//   Entries that share the same high 16 bits therefore have equal precedence.

enum SetOperations {
    setStart         = (0 << 16) | 1,
    setEnd           = (1 << 16) | 2,
    setNegation      = (2 << 16) | 3,
    setCaseClose     = (2 << 16) | 9,
    setDifference2   = (3 << 16) | 4,    // '--' set difference operator
    setIntersection2 = (3 << 16) | 5,    // '&&' set intersection operator
    setUnion         = (4 << 16) | 6,    // implicit union of adjacent items
    setDifference1   = (4 << 16) | 7,    // '-', single dash difference op, for compatibility with old UnicodeSet.
    setIntersection1 = (4 << 16) | 8     // '&', single amp intersection op, for compatibility with old UnicodeSet.
};
|
225 |
|
226 U_NAMESPACE_END |
|
227 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
|
228 #endif // RBBISCAN_H |