Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | // |
michael@0 | 2 | // rbbiscan.h |
michael@0 | 3 | // |
michael@0 | 4 | // Copyright (C) 2002-2008, International Business Machines Corporation and others. |
michael@0 | 5 | // All Rights Reserved. |
michael@0 | 6 | // |
michael@0 | 7 | // This file contains declarations for class RBBIRuleScanner |
michael@0 | 8 | // |
michael@0 | 9 | |
michael@0 | 10 | |
michael@0 | 11 | #ifndef RBBISCAN_H |
michael@0 | 12 | #define RBBISCAN_H |
michael@0 | 13 | |
michael@0 | 14 | #include "unicode/utypes.h" |
michael@0 | 15 | #include "unicode/uobject.h" |
michael@0 | 16 | #include "unicode/rbbi.h" |
michael@0 | 17 | #include "unicode/uniset.h" |
michael@0 | 18 | #include "unicode/parseerr.h" |
michael@0 | 19 | #include "uhash.h" |
michael@0 | 20 | #include "uvector.h" |
michael@0 | 21 | #include "unicode/symtable.h"// For UnicodeSet parsing; this is the interface that |
michael@0 | 22 | // looks up references to $variables within a set. |
michael@0 | 23 | #include "rbbinode.h" |
michael@0 | 24 | //#include "rbbitblb.h" |
michael@0 | 25 | |
michael@0 | 26 | |
michael@0 | 27 | |
michael@0 | 28 | U_NAMESPACE_BEGIN |
michael@0 | 29 | |
michael@0 | 30 | class RBBIRuleBuilder; |
michael@0 | 31 | class RBBISymbolTable; |
michael@0 | 32 | |
michael@0 | 33 | |
michael@0 | 34 | //-------------------------------------------------------------------------------- |
michael@0 | 35 | // |
michael@0 | 36 | // class RBBIRuleScanner does the lowest level, character-at-a-time |
michael@0 | 37 | // scanning of break iterator rules. |
michael@0 | 38 | // |
michael@0 | 39 | // The output of the scanner is parse trees for |
michael@0 | 40 | // the rule expressions and a list of all Unicode Sets |
michael@0 | 41 | // encountered. |
michael@0 | 42 | // |
michael@0 | 43 | //-------------------------------------------------------------------------------- |
michael@0 | 44 | |
michael@0 | 45 | class RBBIRuleScanner : public UMemory { |
michael@0 | 46 | public: |
michael@0 | 47 | |
michael@0 | 48 | enum { |
michael@0 | 49 | kStackSize = 100 // The size of the state stack for |
michael@0 | 50 | }; // rules parsing. Corresponds roughly |
michael@0 | 51 | // to the depth of parentheses nesting |
michael@0 | 52 | // that is allowed in the rules. |
michael@0 | 53 | |
michael@0 | 54 | struct RBBIRuleChar { |
michael@0 | 55 | UChar32 fChar; |
michael@0 | 56 | UBool fEscaped; |
michael@0 | 57 | }; |
michael@0 | 58 | |
michael@0 | 59 | RBBIRuleScanner(RBBIRuleBuilder *rb); |
michael@0 | 60 | |
michael@0 | 61 | |
michael@0 | 62 | virtual ~RBBIRuleScanner(); |
michael@0 | 63 | |
michael@0 | 64 | void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. |
michael@0 | 65 | // Declared void, so nothing is returned; end of input is presumably reported through c (TODO confirm). |
michael@0 | 66 | |
michael@0 | 67 | UBool push(const RBBIRuleChar &c); // Push (unget) one character. |
michael@0 | 68 | // Only a single character may be pushed. |
michael@0 | 69 | |
michael@0 | 70 | void parse(); // Parse the rules, generating two parse |
michael@0 | 71 | // trees, one each for the forward and |
michael@0 | 72 | // reverse rules, |
michael@0 | 73 | // and a list of UnicodeSets encountered. |
michael@0 | 74 | |
michael@0 | 75 | /** |
michael@0 | 76 | * Return a rules string without unnecessary |
michael@0 | 77 | * characters. |
michael@0 | 78 | */ |
michael@0 | 79 | static UnicodeString stripRules(const UnicodeString &rules); |
michael@0 | 80 | private: |
michael@0 | 81 | |
michael@0 | 82 | UBool doParseActions(int32_t a); |
michael@0 | 83 | void error(UErrorCode e); // error reporting convenience function. |
michael@0 | 84 | void fixOpStack(RBBINode::OpPrecedence p); |
michael@0 | 85 | // NOTE(review): stray fragment ("a character."); the sentence it belonged to is missing. |
michael@0 | 86 | void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); |
michael@0 | 87 | |
michael@0 | 88 | UChar32 nextCharLL(); |
michael@0 | 89 | #ifdef RBBI_DEBUG |
michael@0 | 90 | void printNodeStack(const char *title); |
michael@0 | 91 | #endif |
michael@0 | 92 | RBBINode *pushNewNode(RBBINode::NodeType t); |
michael@0 | 93 | void scanSet(); |
michael@0 | 94 | |
michael@0 | 95 | |
michael@0 | 96 | RBBIRuleBuilder *fRB; // The rule builder that we are part of. |
michael@0 | 97 | |
michael@0 | 98 | int32_t fScanIndex; // Index of current character being processed |
michael@0 | 99 | // in the rule input string. |
michael@0 | 100 | int32_t fNextIndex; // Index of the next character, which |
michael@0 | 101 | // is the first character not yet scanned. |
michael@0 | 102 | UBool fQuoteMode; // Scan is in a 'quoted region' |
michael@0 | 103 | int32_t fLineNum; // Line number in input file. |
michael@0 | 104 | int32_t fCharNum; // Char position within the line. |
michael@0 | 105 | UChar32 fLastChar; // Previous char, needed to count CR-LF |
michael@0 | 106 | // as a single line, not two. |
michael@0 | 107 | |
michael@0 | 108 | RBBIRuleChar fC; // Current char for parse state machine |
michael@0 | 109 | // processing. |
michael@0 | 110 | UnicodeString fVarName; // $variableName, valid when we've just |
michael@0 | 111 | // scanned one. |
michael@0 | 112 | |
michael@0 | 113 | RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule |
michael@0 | 114 | // parsing. Indexed as p[state][char-class] |
michael@0 | 115 | |
michael@0 | 116 | uint16_t fStack[kStackSize]; // State stack, holds state pushes |
michael@0 | 117 | int32_t fStackPtr; // and pops as specified in the state |
michael@0 | 118 | // transition rules. |
michael@0 | 119 | |
michael@0 | 120 | RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created |
michael@0 | 121 | // during the parse of a rule |
michael@0 | 122 | int32_t fNodeStackPtr; |
michael@0 | 123 | |
michael@0 | 124 | |
michael@0 | 125 | UBool fReverseRule; // True if the rule currently being scanned |
michael@0 | 126 | // is a reverse direction rule (if it |
michael@0 | 127 | // starts with a '!') |
michael@0 | 128 | |
michael@0 | 129 | UBool fLookAheadRule; // True if the rule includes a '/' |
michael@0 | 130 | // somewhere within it. |
michael@0 | 131 | |
michael@0 | 132 | RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of |
michael@0 | 133 | // $variable symbols. |
michael@0 | 134 | |
michael@0 | 135 | UHashtable *fSetTable; // UnicodeSet hash table, holds indexes to |
michael@0 | 136 | // the sets created while parsing rules. |
michael@0 | 137 | // The key is the string used for creating |
michael@0 | 138 | // the set. |
michael@0 | 139 | |
michael@0 | 140 | UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during |
michael@0 | 141 | // the scanning of RBBI rules. The |
michael@0 | 142 | // indices for these are assigned by the |
michael@0 | 143 | // perl script that builds the state tables. |
michael@0 | 144 | // See rbbirpt.h. |
michael@0 | 145 | |
michael@0 | 146 | int32_t fRuleNum; // Counts each rule as it is scanned. |
michael@0 | 147 | |
michael@0 | 148 | int32_t fOptionStart; // Input index of start of a !!option |
michael@0 | 149 | // keyword, while being scanned. |
michael@0 | 150 | |
michael@0 | 151 | UnicodeSet *gRuleSet_rule_char; |
michael@0 | 152 | UnicodeSet *gRuleSet_white_space; |
michael@0 | 153 | UnicodeSet *gRuleSet_name_char; |
michael@0 | 154 | UnicodeSet *gRuleSet_name_start_char; |
michael@0 | 155 | |
michael@0 | 156 | RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class |
michael@0 | 157 | RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class |
michael@0 | 158 | }; |
michael@0 | 159 | |
michael@0 | 160 | U_NAMESPACE_END |
michael@0 | 161 | |
michael@0 | 162 | #endif |