intl/icu/source/i18n/regexcmp.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/regexcmp.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,228 @@
     1.4 +//
     1.5 +//  regexcmp.h
     1.6 +//
     1.7 +//  Copyright (C) 2002-2012, International Business Machines Corporation and others.
     1.8 +//  All Rights Reserved.
     1.9 +//
    1.10 +//  This file contains declarations for the class RegexCompile
    1.11 +//
    1.12 +//  This class is internal to the regular expression implementation.
    1.13 +//  For the public Regular Expression API, see the file "unicode/regex.h"
    1.14 +//
    1.15 +
    1.16 +
    1.17 +#ifndef RBBISCAN_H   // NOTE(review): guard name does not match this file (regexcmp.h); it would clash with any other header guarded by RBBISCAN_H - consider renaming.
    1.18 +#define RBBISCAN_H
    1.19 +
    1.20 +#include "unicode/utypes.h"
    1.21 +#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    1.22 +
    1.23 +#include "unicode/uobject.h"
    1.24 +#include "unicode/uniset.h"
    1.25 +#include "unicode/parseerr.h"
    1.26 +#include "uhash.h"
    1.27 +#include "uvector.h"
    1.28 +
    1.29 +
    1.30 +
    1.31 +U_NAMESPACE_BEGIN
    1.32 +
    1.33 +
    1.34 +//--------------------------------------------------------------------------------
    1.35 +//
    1.36 +//  class RegexCompile    Contains the regular expression compiler.
    1.37 +//
    1.38 +//--------------------------------------------------------------------------------
    1.39 +struct  RegexTableEl;
    1.40 +class   RegexPattern;
    1.41 +
    1.42 +
     1.43 +class RegexCompile : public UMemory {
     1.44 +public:
     1.45 +
     1.46 +    enum {
     1.47 +        kStackSize = 100            // The size of the state stack for
     1.48 +    };                              //   pattern parsing.  Corresponds roughly
     1.49 +                                    //   to the depth of parentheses nesting
     1.50 +                                    //   that is allowed in the rules.  Dimensions fStack.
     1.51 +
     1.52 +    struct RegexPatternChar {               // Type of fC and of the argument to nextChar().
     1.53 +        UChar32             fChar;          // The character value.
     1.54 +        UBool               fQuoted;        // NOTE(review): presumably set when the char was quoted/escaped - confirm.
     1.55 +    };
     1.56 +
     1.57 +    RegexCompile(RegexPattern *rp, UErrorCode &e);   // NOTE(review): rp is presumably the pattern being built (see fRXPat) - confirm.
     1.58 +
     1.59 +    void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);   // Compile pattern text supplied as a UnicodeString.
     1.60 +    void       compile(UText *pat, UParseError &pp, UErrorCode &e);                 // Overload taking the pattern text as a UText.
     1.61 +    
     1.62 +
     1.63 +    virtual    ~RegexCompile();
     1.64 +
     1.65 +    void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream.
     1.66 +
     1.67 +    static void cleanup();                       // Memory cleanup
     1.68 +
     1.69 +
     1.70 +
     1.71 +    // Categories of parentheses in pattern.
     1.72 +    //   The category is saved in the compile-time parentheses stack frame, and
     1.73 +    //   determines the code to be generated when the matching close ) is encountered.
     1.74 +    enum EParenClass {                   // All values are negative; see the fParenStack comment below.
     1.75 +        plain        = -1,               // No special handling
     1.76 +        capturing    = -2,
     1.77 +        atomic       = -3,
     1.78 +        lookAhead    = -4,
     1.79 +        negLookAhead = -5,
     1.80 +        flags        = -6,
     1.81 +        lookBehind   = -7,
     1.82 +        lookBehindN  = -8
     1.83 +    };
     1.84 +
     1.85 +private:
     1.86 +
     1.87 +
     1.88 +    UBool       doParseActions(int32_t a);             // NOTE(review): presumably performs parse action 'a'; meaning of the result is not visible here.
     1.89 +    void        error(UErrorCode e);                   // error reporting convenience function.
     1.90 +
     1.91 +    UChar32     nextCharLL();                        // NOTE(review): "LL" presumably means low level char fetch - confirm.
     1.92 +    UChar32     peekCharLL();                        // NOTE(review): presumably the look-ahead counterpart (see fPeekChar) - confirm.
     1.93 +    UnicodeSet  *scanProp();
     1.94 +    UnicodeSet  *scanPosixProp();
     1.95 +    void        handleCloseParen();                  // NOTE(review): presumably generates the code for a ')' per the frame's EParenClass - confirm.
     1.96 +    int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
     1.97 +                                                     //  at the top of the just completed block
     1.98 +                                                     //  or operation, and optionally ensure that
     1.99 +                                                     //  there is space to add an opcode there.
    1.100 +    void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
    1.101 +                                                     //   a reference to a UnicodeSet.
    1.102 +    void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier.
    1.103 +                               int32_t LoopOp);
    1.104 +    UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier
    1.105 +    void        literalChar(UChar32 c);              // Compile a literal char
    1.106 +    void        fixLiterals(UBool split=FALSE);      // Generate code for pending literal characters.
    1.107 +    void        insertOp(int32_t where);             // Open up a slot for a new op in the
    1.108 +                                                     //   generated code at the specified location.
    1.109 +    int32_t     minMatchLength(int32_t start,        // NOTE(review): presumably the minimum match length of the
    1.110 +                               int32_t end);         //   compiled code between start and end - confirm.
    1.111 +    int32_t     maxMatchLength(int32_t start,        // NOTE(review): presumably the maximum match length of the
    1.112 +                               int32_t end);         //   compiled code between start and end - confirm.
    1.113 +    void        matchStartType();
    1.114 +    void        stripNOPs();
    1.115 +
    1.116 +    void        setEval(int32_t op);
    1.117 +    void        setPushOp(int32_t op);
    1.118 +    UChar32     scanNamedChar();
    1.119 +    UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
    1.120 +
    1.121 +
    1.122 +    UErrorCode                    *fStatus;          // NOTE(review): presumably the caller's UErrorCode from the constructor - confirm.
    1.123 +    RegexPattern                  *fRXPat;           // NOTE(review): presumably the RegexPattern passed to the constructor - confirm.
    1.124 +    UParseError                   *fParseErr;        // NOTE(review): presumably the UParseError passed to compile() - confirm.
    1.125 +
    1.126 +    //
    1.127 +    //  Data associated with low level character scanning
    1.128 +    //
    1.129 +    int64_t                       fScanIndex;        // Index of current character being processed
    1.130 +                                                     //   in the rule input string.
    1.131 +    UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
    1.132 +    UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
    1.133 +    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
    1.134 +                                                     //   end of line comments, in favor of (?#...) comments.
    1.135 +    int64_t                       fLineNum;          // Line number in input file.
    1.136 +    int64_t                       fCharNum;          // Char position within the line.
    1.137 +    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
    1.138 +                                                     //   as a single line, not two.
    1.139 +    UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.
    1.140 +
    1.141 +
    1.142 +    RegexPatternChar              fC;                // Current char for parse state machine
    1.143 +                                                     //   processing.
    1.144 +
    1.145 +    //
    1.146 +    //   Data for the state machine that parses the regular expression.
    1.147 +    //
    1.148 +    RegexTableEl                  **fStateTable;     // State Transition Table for regex Rule
    1.149 +                                                     //   parsing.  index by p[state][char-class]
    1.150 +
    1.151 +    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
    1.152 +    int32_t                       fStackPtr;           //  and pops as specified in the state
    1.153 +                                                       //  transition rules.
    1.154 +
    1.155 +    //
    1.156 +    //  Data associated with the generation of the pcode for the match engine
    1.157 +    //
    1.158 +    int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.)
    1.159 +                                                     //   Always has high bit (31) set so that flag values
    1.160 +                                                     //   on the paren stack are distinguished from relocatable
    1.161 +                                                     //   pcode addresses.
    1.162 +    int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state
    1.163 +                                                     //   until last flag is scanned.
    1.164 +    UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx
    1.165 +
    1.166 +    UnicodeString                 fLiteralChars;     // Literal chars or strings from the pattern are accumulated here.
    1.167 +                                                     //   Once completed, meaning that some non-literal pattern
    1.168 +                                                     //   construct is encountered, the appropriate opcodes
    1.169 +                                                     //   to match the literal will be generated, and this
    1.170 +                                                     //   string will be cleared.
    1.171 +
    1.172 +    int64_t                       fPatternLength;    // Length of the input pattern string.
    1.173 +    
    1.174 +    UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
    1.175 +                                                     //   the positions of compiled pattern operations
    1.176 +                                                     //   needing fixup, followed by a negative value.  The
    1.177 +                                                     //   first entry in each frame is the position of the
    1.178 +                                                     //   spot reserved for use when a quantifier
    1.179 +                                                     //   needs to add a SAVE at the start of a (block)
    1.180 +                                                     //   The negative value (-1, -2,...) indicates
    1.181 +                                                     //   the kind of paren that opened the frame.  Some
    1.182 +                                                     //   need special handling on close.
    1.183 +
    1.184 +
    1.185 +    int32_t                       fMatchOpenParen;   // The position in the compiled pattern
    1.186 +                                                     //   of the slot reserved for a state save
    1.187 +                                                     //   at the start of the most recently processed
    1.188 +                                                     //   parenthesized block.
    1.189 +    int32_t                       fMatchCloseParen;  // The position in the pattern of the first
    1.190 +                                                     //   location after the most recently processed
    1.191 +                                                     //   parenthesized block.
    1.192 +
    1.193 +    int32_t                       fIntervalLow;      // {lower, upper} interval quantifier values.
    1.194 +    int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is
    1.195 +                                                     //   initially scanned.  Each new interval
    1.196 +                                                     //   encountered overwrites these values.
    1.197 +                                                     //   -1 for the upper interval value means none
    1.198 +                                                     //   was specified (unlimited occurrences.)
    1.199 +
    1.200 +    int64_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
    1.201 +                                                     //   pattern, valid while remainder of name is
    1.202 +                                                     //   scanned.
    1.203 +
    1.204 +    UStack                        fSetStack;         // Stack of UnicodeSets, used while evaluating
    1.205 +                                                     //   (at compile time) set expressions within
    1.206 +                                                     //   the pattern.
    1.207 +    UStack                        fSetOpStack;       // Stack of pending set operators (&&, --, union)
    1.208 +
    1.209 +    UChar32                       fLastSetLiteral;   // The last single code point added to a set.
    1.210 +                                                     //   needed when "-y" is scanned, and we need
    1.211 +                                                     //   to turn "x-y" into a range.
    1.212 +};
   1.213 +
    1.214 +// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
    1.215 +//   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
    1.216 +
    1.217 +enum SetOperations {
    1.218 +    setStart         = 0 << 16 | 1,    // precedence 0, op code 1
    1.219 +    setEnd           = 1 << 16 | 2,    // precedence 1, op code 2
    1.220 +    setNegation      = 2 << 16 | 3,    // precedence 2, op code 3
    1.221 +    setCaseClose     = 2 << 16 | 9,    // precedence 2, op code 9
    1.222 +    setDifference2   = 3 << 16 | 4,    // '--' set difference operator
    1.223 +    setIntersection2 = 3 << 16 | 5,    // '&&' set intersection operator
    1.224 +    setUnion         = 4 << 16 | 6,    // implicit union of adjacent items
    1.225 +    setDifference1   = 4 << 16 | 7,    // '-', single dash difference op, for compatibility with old UnicodeSet.
    1.226 +    setIntersection1 = 4 << 16 | 8     // '&', single amp intersection op, for compatibility with old UnicodeSet.
    1.227 +    };
   1.228 +
   1.229 +U_NAMESPACE_END
   1.230 +#endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1.231 +#endif   // RBBISCAN_H

mercurial