intl/icu/source/i18n/regexcmp.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 //
     2 //  regexcmp.h
     3 //
     4 //  Copyright (C) 2002-2012, International Business Machines Corporation and others.
     5 //  All Rights Reserved.
     6 //
     7 //  This file contains declarations for the class RegexCompile
     8 //
     9 //  This class is internal to the regular expression implementation.
    10 //  For the public Regular Expression API, see the file "unicode/regex.h"
    11 //
    14 #ifndef RBBISCAN_H
    15 #define RBBISCAN_H
       // NOTE(review): the include-guard macro is RBBISCAN_H although this file is
       //   regexcmp.h; the name looks carried over from another header.  If any other
       //   header in the build uses the same guard, whichever is included second is
       //   silently skipped.  Confirm, and consider renaming the guard (here and on the
       //   matching #endif comment at the end of this file) to something file-specific.
    17 #include "unicode/utypes.h"
    18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    20 #include "unicode/uobject.h"
    21 #include "unicode/uniset.h"
    22 #include "unicode/parseerr.h"
    23 #include "uhash.h"
    24 #include "uvector.h"
    28 U_NAMESPACE_BEGIN
    31 //--------------------------------------------------------------------------------
    32 //
    33 //  class RegexCompile    Contains the regular expression compiler.
    34 //
    35 //--------------------------------------------------------------------------------
    36 struct  RegexTableEl;      // Forward declaration: only used through pointers (fStateTable) in this header.
    37 class   RegexPattern;      // Forward declaration: only used through pointers (constructor arg, fRXPat) in this header.
    40 class RegexCompile : public UMemory {
       //
       //  Declaration of the regular expression compiler.  A RegexCompile is constructed
       //  with a RegexPattern and a UErrorCode, and is then given the pattern text through
       //  one of the two compile() overloads (UnicodeString or UText).
       //  NOTE(review): presumably the compiled output is stored into the RegexPattern
       //  passed to the constructor - confirm against regexcmp.cpp.
       //
    41 public:
    43     enum {
    44         kStackSize = 100            // The size of the state stack for
    45     };                              //   pattern parsing.  Corresponds roughly
    46                                     //   to the depth of parentheses nesting
    47                                     //   that is allowed in the rules.
           // One scanned pattern character: the code point plus a flag.
           //   NOTE(review): fQuoted presumably marks characters that came from a quoted or
           //   escaped context (see fQuoteMode / fInBackslashQuote below) - confirm.
    49     struct RegexPatternChar {
    50         UChar32             fChar;
    51         UBool               fQuoted;
    52     };
           // rp: the RegexPattern this compiler works for.   e: error status.
    54     RegexCompile(RegexPattern *rp, UErrorCode &e);
           // Compile a pattern.  The two overloads differ only in how the pattern text is
           //   supplied (UnicodeString vs. UText).  pp receives parse error information and
           //   e receives the error status.
    56     void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
    57     void       compile(UText *pat, UParseError &pp, UErrorCode &e);
    60     virtual    ~RegexCompile();
    62     void        nextChar(RegexPatternChar &c);      // Get the next char from the input stream.
    64     static void cleanup();                       // Memory cleanup
    68     // Categories of parentheses in pattern.
    69     //   The category is saved in the compile-time parentheses stack frame, and
    70     //   determines the code to be generated when the matching close ) is encountered.
           //   All values are negative: on fParenStack a frame holds positions of compiled
           //   pattern operations followed by one of these negative markers, so the sign
           //   distinguishes the marker from the positions.
    71     enum EParenClass {
    72         plain        = -1,               // No special handling
    73         capturing    = -2,
    74         atomic       = -3,
    75         lookAhead    = -4,
    76         negLookAhead = -5,
    77         flags        = -6,
    78         lookBehind   = -7,
    79         lookBehindN  = -8
    80     };
    82 private:
           // NOTE(review): the undocumented helpers below are declared here only; their
           //   behavior is defined in the implementation file and is not described in
           //   this header.
    85     UBool       doParseActions(int32_t a);
    86     void        error(UErrorCode e);                   // error reporting convenience function.
    88     UChar32     nextCharLL();
    89     UChar32     peekCharLL();
    90     UnicodeSet  *scanProp();
    91     UnicodeSet  *scanPosixProp();
    92     void        handleCloseParen();
    93     int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
    94                                                      //  at the top of the just completed block
    95                                                      //  or operation, and optionally ensure that
    96                                                      //  there is space to add an opcode there.
    97     void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
    98                                                      //   a reference to a UnicodeSet.
    99     void        compileInterval(int32_t InitOp,      // Generate the code for a {min,max} quantifier.
   100                                int32_t LoopOp);
   101     UBool       compileInlineInterval();             // Generate inline code for a {min,max} quantifier
   102     void        literalChar(UChar32 c);              // Compile a literal char
   103     void        fixLiterals(UBool split=FALSE);      // Generate code for pending literal characters.
   104     void        insertOp(int32_t where);             // Open up a slot for a new op in the
   105                                                      //   generated code at the specified location.
   106     int32_t     minMatchLength(int32_t start,
   107                                int32_t end);
   108     int32_t     maxMatchLength(int32_t start,
   109                                int32_t end);
   110     void        matchStartType();
   111     void        stripNOPs();
   113     void        setEval(int32_t op);
   114     void        setPushOp(int32_t op);
   115     UChar32     scanNamedChar();                     // NOTE(review): presumably scans a \N{NAME} escape
                                                            //   (cf. fNameStartPos) - confirm.
   116     UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
           // NOTE(review): these three presumably retain the error status, pattern and
           //   parse-error objects supplied to the constructor / compile() - confirm.
   119     UErrorCode                    *fStatus;
   120     RegexPattern                  *fRXPat;
   121     UParseError                   *fParseErr;
   123     //
   124     //  Data associated with low level character scanning
   125     //
   126     int64_t                       fScanIndex;        // Index of current character being processed
   127                                                      //   in the rule input string.
   128     UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
   129     UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
   130     UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
   131                                                      //   end of line comments, in favor of (?#...) comments.
   132     int64_t                       fLineNum;          // Line number in input file.
   133     int64_t                       fCharNum;          // Char position within the line.
   134     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
   135                                                      //   as a single line, not two.
   136     UChar32                       fPeekChar;         // Saved char, if we've scanned ahead.
   139     RegexPatternChar              fC;                // Current char for parse state machine
   140                                                      //   processing.
   142     //
   143     //   Data for the state machine that parses the regular expression.
   144     //
   145     RegexTableEl                  **fStateTable;     // State Transition Table for regex Rule
   146                                                      //   parsing.  index by p[state][char-class]
   148     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
   149     int32_t                       fStackPtr;           //  and pops as specified in the state
   150                                                        //  transition rules.
                                                              //  Capacity is fixed at kStackSize entries.
   152     //
   153     //  Data associated with the generation of the pcode for the match engine
   154     //
   155     int32_t                       fModeFlags;        // Match Flags.  (Case Insensitive, etc.)
   156                                                      //   Always has high bit (31) set so that flag values
   157                                                      //   on the paren stack are distinguished from relocatable
   158                                                      //   pcode addresses.
   159     int32_t                       fNewModeFlags;     // New flags, while compiling (?i, holds state
   160                                                      //   until last flag is scanned.
   161     UBool                         fSetModeFlag;      // true for (?ismx, false for (?-ismx
   163     UnicodeString                 fLiteralChars;     // Literal chars or strings from the pattern are accumulated here.
   164                                                      //   Once completed, meaning that some non-literal pattern
   165                                                      //   construct is encountered, the appropriate opcodes
   166                                                      //   to match the literal will be generated, and this
   167                                                      //   string will be cleared.
   169     int64_t                       fPatternLength;    // Length of the input pattern string.
   171     UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
   172                                                      //   the positions of compiled pattern operations
   173                                                      //   needing fixup, followed by negative value.  The
   174                                                      //   first entry in each frame is the position of the
   175                                                      //   spot reserved for use when a quantifier
   176                                                      //   needs to add a SAVE at the start of a (block)
   177                                                      //   The negative value (-1, -2,...) indicates
   178                                                      //   the kind of paren that opened the frame.  Some
   179                                                      //   need special handling on close.
                                                            //   (The negative values are the EParenClass constants.)
   182     int32_t                       fMatchOpenParen;   // The position in the compiled pattern
   183                                                      //   of the slot reserved for a state save
   184                                                      //   at the start of the most recently processed
   185                                                      //   parenthesized block.
   186     int32_t                       fMatchCloseParen;  // The position in the pattern of the first
   187                                                      //   location after the most recently processed
   188                                                      //   parenthesized block.
   190     int32_t                       fIntervalLow;      // {lower, upper} interval quantifier values.
   191     int32_t                       fIntervalUpper;    // Placed here temporarily, when pattern is
   192                                                      //   initially scanned.  Each new interval
   193                                                      //   encountered overwrites these values.
   194                                                      //   -1 for the upper interval value means none
   195                                                      //   was specified (unlimited occurrences.)
   197     int64_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
   198                                                      //   pattern, valid while remainder of name is
   199                                                      //   scanned.
   201     UStack                        fSetStack;         // Stack of UnicodeSets, used while evaluating
   202                                                      //   (at compile time) set expressions within
   203                                                      //   the pattern.
   204     UStack                        fSetOpStack;       // Stack of pending set operators (&&, --, union)
   206     UChar32                       fLastSetLiteral;   // The last single code point added to a set.
   207                                                      //   needed when "-y" is scanned, and we need
   208                                                      //   to turn "x-y" into a range.
   209 };
   211 // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
   212 //   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
   214 enum SetOperations {
       // Every enumerator is written as  level << 16 | code.  Because '<<' binds more
       //   tightly than '|', each initializer evaluates as (level << 16) | code without
       //   needing parentheses.  The low-16-bit codes are all distinct (1 through 9);
       //   the high-16-bit levels range from 0 through 4 and are shared by several
       //   operators.
   215     setStart         = 0 << 16 | 1,    // level 0, code 1
   216     setEnd           = 1 << 16 | 2,    // level 1, code 2
   217     setNegation      = 2 << 16 | 3,    // level 2, code 3
   218     setCaseClose     = 2 << 16 | 9,    // level 2, code 9
   219     setDifference2   = 3 << 16 | 4,    // '--' set difference operator
   220     setIntersection2 = 3 << 16 | 5,    // '&&' set intersection operator
   221     setUnion         = 4 << 16 | 6,    // implicit union of adjacent items
   222     setDifference1   = 4 << 16 | 7,    // '-', single dash difference op, for compatibility with old UnicodeSet.
   223     setIntersection1 = 4 << 16 | 8     // '&', single amp intersection op, for compatibility with old UnicodeSet.
   224     };
   226 U_NAMESPACE_END
   227 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   228 #endif   // RBBISCAN_H

mercurial