1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbiscan.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,162 @@ 1.4 +// 1.5 +// rbbiscan.h 1.6 +// 1.7 +// Copyright (C) 2002-2008, International Business Machines Corporation and others. 1.8 +// All Rights Reserved. 1.9 +// 1.10 +// This file contains declarations for class RBBIRuleScanner 1.11 +// 1.12 + 1.13 + 1.14 +#ifndef RBBISCAN_H 1.15 +#define RBBISCAN_H 1.16 + 1.17 +#include "unicode/utypes.h" 1.18 +#include "unicode/uobject.h" 1.19 +#include "unicode/rbbi.h" 1.20 +#include "unicode/uniset.h" 1.21 +#include "unicode/parseerr.h" 1.22 +#include "uhash.h" 1.23 +#include "uvector.h" 1.24 +#include "unicode/symtable.h"// For UnicodeSet parsing; this is the interface that 1.25 + // looks up references to $variables within a set. 1.26 +#include "rbbinode.h" 1.27 +//#include "rbbitblb.h" 1.28 + 1.29 + 1.30 + 1.31 +U_NAMESPACE_BEGIN 1.32 + 1.33 +class RBBIRuleBuilder; 1.34 +class RBBISymbolTable; 1.35 + 1.36 + 1.37 +//-------------------------------------------------------------------------------- 1.38 +// 1.39 +// class RBBIRuleScanner does the lowest level, character-at-a-time 1.40 +// scanning of break iterator rules. 1.41 +// 1.42 +// The output of the scanner is parse trees for 1.43 +// the rule expressions and a list of all Unicode Sets 1.44 +// encountered. 1.45 +// 1.46 +//-------------------------------------------------------------------------------- 1.47 + 1.48 +class RBBIRuleScanner : public UMemory { 1.49 +public: 1.50 + 1.51 + enum { 1.52 + kStackSize = 100 // The size of the state stack for 1.53 + }; // rules parsing. Corresponds roughly 1.54 + // to the depth of parentheses nesting 1.55 + // that is allowed in the rules. 
1.56 + 1.57 + struct RBBIRuleChar { 1.58 + UChar32 fChar; 1.59 + UBool fEscaped; 1.60 + }; 1.61 + 1.62 + RBBIRuleScanner(RBBIRuleBuilder *rb); 1.63 + 1.64 + 1.65 + virtual ~RBBIRuleScanner(); 1.66 + 1.67 + void nextChar(RBBIRuleChar &c); // Get the next char from the input stream into c. 1.68 + // Returns void. NOTE(review): how end of input is reported through c is not visible here - confirm. 1.69 + 1.70 + UBool push(const RBBIRuleChar &c); // Push (unget) one character. 1.71 + // Only a single character may be pushed. 1.72 + 1.73 + void parse(); // Parse the rules, generating two parse 1.74 + // trees, one each for the forward and 1.75 + // reverse rules, 1.76 + // and a list of UnicodeSets encountered. 1.77 + 1.78 + /** 1.79 + * Return a rules string without unnecessary 1.80 + * characters. 1.81 + */ 1.82 + static UnicodeString stripRules(const UnicodeString &rules); 1.83 +private: 1.84 + 1.85 + UBool doParseActions(int32_t a); 1.86 + void error(UErrorCode e); // error reporting convenience function. 1.87 + void fixOpStack(RBBINode::OpPrecedence p); 1.88 + // NOTE(review): orphaned comment fragment "a character." - the declaration it described is not present here. 1.89 + void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); 1.90 + 1.91 + UChar32 nextCharLL(); 1.92 +#ifdef RBBI_DEBUG 1.93 + void printNodeStack(const char *title); 1.94 +#endif 1.95 + RBBINode *pushNewNode(RBBINode::NodeType t); 1.96 + void scanSet(); 1.97 + 1.98 + 1.99 + RBBIRuleBuilder *fRB; // The rule builder that we are part of. 1.100 + 1.101 + int32_t fScanIndex; // Index of current character being processed 1.102 + // in the rule input string. 1.103 + int32_t fNextIndex; // Index of the next character, which 1.104 + // is the first character not yet scanned. 1.105 + UBool fQuoteMode; // Scan is in a 'quoted region' 1.106 + int32_t fLineNum; // Line number in input file. 1.107 + int32_t fCharNum; // Char position within the line. 1.108 + UChar32 fLastChar; // Previous char, needed to count CR-LF 1.109 + // as a single line, not two. 1.110 + 1.111 + RBBIRuleChar fC; // Current char for parse state machine 1.112 + // processing. 
1.113 + UnicodeString fVarName; // $variableName, valid when we've just 1.114 + // scanned one. 1.115 + 1.116 + RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule 1.117 + // parsing. indexed by p[state][char-class] 1.118 + 1.119 + uint16_t fStack[kStackSize]; // State stack, holds state pushes 1.120 + int32_t fStackPtr; // and pops as specified in the state 1.121 + // transition rules. 1.122 + 1.123 + RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created 1.124 + // during the parse of a rule 1.125 + int32_t fNodeStackPtr; 1.126 + 1.127 + 1.128 + UBool fReverseRule; // True if the rule currently being scanned 1.129 + // is a reverse direction rule (if it 1.130 + // starts with a '!') 1.131 + 1.132 + UBool fLookAheadRule; // True if the rule includes a '/' 1.133 + // somewhere within it. 1.134 + 1.135 + RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of 1.136 + // $variable symbols. 1.137 + 1.138 + UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to 1.139 + // the sets created while parsing rules. 1.140 + // The key is the string used for creating 1.141 + // the set. 1.142 + 1.143 + UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during 1.144 + // the scanning of RBBI rules. The 1.145 + // indices for these are assigned by the 1.146 + // perl script that builds the state tables. 1.147 + // See rbbirpt.h. 1.148 + 1.149 + int32_t fRuleNum; // Counts each rule as it is scanned. 1.150 + 1.151 + int32_t fOptionStart; // Input index of start of a !!option 1.152 + // keyword, while being scanned. 
1.153 + 1.154 + UnicodeSet *gRuleSet_rule_char; 1.155 + UnicodeSet *gRuleSet_white_space; 1.156 + UnicodeSet *gRuleSet_name_char; 1.157 + UnicodeSet *gRuleSet_name_start_char; 1.158 + 1.159 + RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class (declared private) 1.160 + RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class 1.161 +}; 1.162 + 1.163 +U_NAMESPACE_END 1.164 + 1.165 +#endif