1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/regexcmp.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,228 @@ 1.4 +// 1.5 +// regexcmp.h 1.6 +// 1.7 +// Copyright (C) 2002-2012, International Business Machines Corporation and others. 1.8 +// All Rights Reserved. 1.9 +// 1.10 +// This file contains declarations for the class RegexCompile 1.11 +// 1.12 +// This class is internal to the regular expression implementation. 1.13 +// For the public Regular Expression API, see the file "unicode/regex.h" 1.14 +// 1.15 + 1.16 + 1.17 +#ifndef RBBISCAN_H 1.18 +#define RBBISCAN_H 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS 1.22 + 1.23 +#include "unicode/uobject.h" 1.24 +#include "unicode/uniset.h" 1.25 +#include "unicode/parseerr.h" 1.26 +#include "uhash.h" 1.27 +#include "uvector.h" 1.28 + 1.29 + 1.30 + 1.31 +U_NAMESPACE_BEGIN 1.32 + 1.33 + 1.34 +//-------------------------------------------------------------------------------- 1.35 +// 1.36 +// class RegexCompile Contains the regular expression compiler. 1.37 +// 1.38 +//-------------------------------------------------------------------------------- 1.39 +struct RegexTableEl; 1.40 +class RegexPattern; 1.41 + 1.42 + 1.43 +class RegexCompile : public UMemory { 1.44 +public: 1.45 + 1.46 + enum { 1.47 + kStackSize = 100 // The size of the state stack for 1.48 + }; // pattern parsing. Corresponds roughly 1.49 + // to the depth of parentheses nesting 1.50 + // that is allowed in the rules. 1.51 + 1.52 + struct RegexPatternChar { 1.53 + UChar32 fChar; 1.54 + UBool fQuoted; 1.55 + }; 1.56 + 1.57 + RegexCompile(RegexPattern *rp, UErrorCode &e); 1.58 + 1.59 + void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 1.60 + void compile(UText *pat, UParseError &pp, UErrorCode &e); 1.61 + 1.62 + 1.63 + virtual ~RegexCompile(); 1.64 + 1.65 + void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 1.66 + 1.67 + static void cleanup(); // Memory cleanup 1.68 + 1.69 + 1.70 + 1.71 + // Categories of parentheses in pattern. 1.72 + // The category is saved in the compile-time parentheses stack frame, and 1.73 + // determines the code to be generated when the matching close ) is encountered. 1.74 + enum EParenClass { 1.75 + plain = -1, // No special handling 1.76 + capturing = -2, 1.77 + atomic = -3, 1.78 + lookAhead = -4, 1.79 + negLookAhead = -5, 1.80 + flags = -6, 1.81 + lookBehind = -7, 1.82 + lookBehindN = -8 1.83 + }; 1.84 + 1.85 +private: 1.86 + 1.87 + 1.88 + UBool doParseActions(int32_t a); 1.89 + void error(UErrorCode e); // error reporting convenience function. 1.90 + 1.91 + UChar32 nextCharLL(); 1.92 + UChar32 peekCharLL(); 1.93 + UnicodeSet *scanProp(); 1.94 + UnicodeSet *scanPosixProp(); 1.95 + void handleCloseParen(); 1.96 + int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 1.97 + // at the top of the just completed block 1.98 + // or operation, and optionally ensure that 1.99 + // there is space to add an opcode there. 1.100 + void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 1.101 + // a reference to a UnicodeSet. 1.102 + void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 1.103 + int32_t LoopOp); 1.104 + UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 1.105 + void literalChar(UChar32 c); // Compile a literal char 1.106 + void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters. 1.107 + void insertOp(int32_t where); // Open up a slot for a new op in the 1.108 + // generated code at the specified location. 1.109 + int32_t minMatchLength(int32_t start, 1.110 + int32_t end); 1.111 + int32_t maxMatchLength(int32_t start, 1.112 + int32_t end); 1.113 + void matchStartType(); 1.114 + void stripNOPs(); 1.115 + 1.116 + void setEval(int32_t op); 1.117 + void setPushOp(int32_t op); 1.118 + UChar32 scanNamedChar(); 1.119 + UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 1.120 + 1.121 + 1.122 + UErrorCode *fStatus; 1.123 + RegexPattern *fRXPat; 1.124 + UParseError *fParseErr; 1.125 + 1.126 + // 1.127 + // Data associated with low level character scanning 1.128 + // 1.129 + int64_t fScanIndex; // Index of current character being processed 1.130 + // in the rule input string. 1.131 + UBool fQuoteMode; // Scan is in a \Q...\E quoted region 1.132 + UBool fInBackslashQuote; // Scan is between a '\' and the following char. 1.133 + UBool fEOLComments; // When scan is just after '(?', inhibit #... to 1.134 + // end of line comments, in favor of (?#...) comments. 1.135 + int64_t fLineNum; // Line number in input file. 1.136 + int64_t fCharNum; // Char position within the line. 1.137 + UChar32 fLastChar; // Previous char, needed to count CR-LF 1.138 + // as a single line, not two. 1.139 + UChar32 fPeekChar; // Saved char, if we've scanned ahead. 1.140 + 1.141 + 1.142 + RegexPatternChar fC; // Current char for parse state machine 1.143 + // processing. 1.144 + 1.145 + // 1.146 + // Data for the state machine that parses the regular expression. 1.147 + // 1.148 + RegexTableEl **fStateTable; // State Transition Table for regex Rule 1.149 + // parsing. index by p[state][char-class] 1.150 + 1.151 + uint16_t fStack[kStackSize]; // State stack, holds state pushes 1.152 + int32_t fStackPtr; // and pops as specified in the state 1.153 + // transition rules. 1.154 + 1.155 + // 1.156 + // Data associated with the generation of the pcode for the match engine 1.157 + // 1.158 + int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 1.159 + // Always has high bit (31) set so that flag values 1.160 + // on the paren stack are distinguished from relocatable 1.161 + // pcode addresses. 1.162 + int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 1.163 + // until last flag is scanned. 1.164 + UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 1.165 + 1.166 + UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. 1.167 + // Once completed, meaning that some non-literal pattern 1.168 + // construct is encountered, the appropriate opcodes 1.169 + // to match the literal will be generated, and this 1.170 + // string will be cleared. 1.171 + 1.172 + int64_t fPatternLength; // Length of the input pattern string. 1.173 + 1.174 + UVector32 fParenStack; // parentheses stack. Each frame consists of 1.175 + // the positions of compiled pattern operations 1.176 + // needing fixup, followed by negative value. The 1.177 + // first entry in each frame is the position of the 1.178 + // spot reserved for use when a quantifier 1.179 + // needs to add a SAVE at the start of a (block) 1.180 + // The negative value (-1, -2,...) indicates 1.181 + // the kind of paren that opened the frame. Some 1.182 + // need special handling on close. 1.183 + 1.184 + 1.185 + int32_t fMatchOpenParen; // The position in the compiled pattern 1.186 + // of the slot reserved for a state save 1.187 + // at the start of the most recently processed 1.188 + // parenthesized block. 1.189 + int32_t fMatchCloseParen; // The position in the pattern of the first 1.190 + // location after the most recently processed 1.191 + // parenthesized block. 1.192 + 1.193 + int32_t fIntervalLow; // {lower, upper} interval quantifier values. 1.194 + int32_t fIntervalUpper; // Placed here temporarily, when pattern is 1.195 + // initially scanned. Each new interval 1.196 + // encountered overwrites these values. 1.197 + // -1 for the upper interval value means none 1.198 + // was specified (unlimited occurences.) 1.199 + 1.200 + int64_t fNameStartPos; // Starting position of a \N{NAME} name in a 1.201 + // pattern, valid while remainder of name is 1.202 + // scanned. 1.203 + 1.204 + UStack fSetStack; // Stack of UnicodeSets, used while evaluating 1.205 + // (at compile time) set expressions within 1.206 + // the pattern. 1.207 + UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 1.208 + 1.209 + UChar32 fLastSetLiteral; // The last single code point added to a set. 1.210 + // needed when "-y" is scanned, and we need 1.211 + // to turn "x-y" into a range. 1.212 +}; 1.213 + 1.214 +// Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] 1.215 +// The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 1.216 + 1.217 +enum SetOperations { 1.218 + setStart = 0 << 16 | 1, 1.219 + setEnd = 1 << 16 | 2, 1.220 + setNegation = 2 << 16 | 3, 1.221 + setCaseClose = 2 << 16 | 9, 1.222 + setDifference2 = 3 << 16 | 4, // '--' set difference operator 1.223 + setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 1.224 + setUnion = 4 << 16 | 6, // implicit union of adjacent items 1.225 + setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 1.226 + setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 1.227 + }; 1.228 + 1.229 +U_NAMESPACE_END 1.230 +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1.231 +#endif // RBBISCAN_H