|
1 // |
|
2 // rbbiscan.h |
|
3 // |
|
4 // Copyright (C) 2002-2008, International Business Machines Corporation and others. |
|
5 // All Rights Reserved. |
|
6 // |
|
7 // This file contains declarations for class RBBIRuleScanner |
|
8 // |
|
9 |
|
10 |
|
11 #ifndef RBBISCAN_H |
|
12 #define RBBISCAN_H |
|
13 |
|
14 #include "unicode/utypes.h" |
|
15 #include "unicode/uobject.h" |
|
16 #include "unicode/rbbi.h" |
|
17 #include "unicode/uniset.h" |
|
18 #include "unicode/parseerr.h" |
|
19 #include "uhash.h" |
|
20 #include "uvector.h" |
|
21 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
|
22 // looks up references to $variables within a set. |
|
23 #include "rbbinode.h" |
|
24 //#include "rbbitblb.h" |
|
25 |
|
26 |
|
27 |
|
28 U_NAMESPACE_BEGIN |
|
29 |
|
// Forward declarations.  Each of these types is used in this header only
// through pointers, so complete definitions are not needed here.
class   RBBIRuleBuilder;    // Type of RBBIRuleScanner::fRB and of the constructor argument.
class   RBBISymbolTable;    // Type of RBBIRuleScanner::fSymbolTable.
struct  RBBIRuleTableEl;    // Element type of RBBIRuleScanner::fStateTable.  Declared here so
                            //   that this header does not depend on include order.
                            //   NOTE(review): assumed to be defined as a struct alongside the
                            //   generated state tables (rbbirpt.h) - confirm the class-key.
|
32 |
|
33 |
|
34 //-------------------------------------------------------------------------------- |
|
35 // |
|
36 // class RBBIRuleScanner does the lowest level, character-at-a-time |
|
37 // scanning of break iterator rules. |
|
38 // |
|
39 // The output of the scanner is parse trees for |
|
40 // the rule expressions and a list of all Unicode Sets |
|
41 // encountered. |
|
42 // |
|
43 //-------------------------------------------------------------------------------- |
|
44 |
|
45 class RBBIRuleScanner : public UMemory { |
|
46 public: |
|
47 |
|
48 enum { |
|
49 kStackSize = 100 // The size of the state stack for |
|
50 }; // rules parsing. Corresponds roughly |
|
51 // to the depth of parentheses nesting |
|
52 // that is allowed in the rules. |
|
53 |
|
54 struct RBBIRuleChar { |
|
55 UChar32 fChar; |
|
56 UBool fEscaped; |
|
57 }; |
|
58 |
|
59 RBBIRuleScanner(RBBIRuleBuilder *rb); |
|
60 |
|
61 |
|
62 virtual ~RBBIRuleScanner(); |
|
63 |
|
64 void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. |
|
65 // Return false if at end. |
|
66 |
|
67 UBool push(const RBBIRuleChar &c); // Push (unget) one character. |
|
68 // Only a single character may be pushed. |
|
69 |
|
70 void parse(); // Parse the rules, generating two parse |
|
71 // trees, one each for the forward and |
|
72 // reverse rules, |
|
73 // and a list of UnicodeSets encountered. |
|
74 |
|
75 /** |
|
76 * Return a rules string without unnecessary |
|
77 * characters. |
|
78 */ |
|
79 static UnicodeString stripRules(const UnicodeString &rules); |
|
80 private: |
|
81 |
|
82 UBool doParseActions(int32_t a); |
|
83 void error(UErrorCode e); // error reporting convenience function. |
|
84 void fixOpStack(RBBINode::OpPrecedence p); |
|
85 // a character. |
|
86 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); |
|
87 |
|
88 UChar32 nextCharLL(); |
|
89 #ifdef RBBI_DEBUG |
|
90 void printNodeStack(const char *title); |
|
91 #endif |
|
92 RBBINode *pushNewNode(RBBINode::NodeType t); |
|
93 void scanSet(); |
|
94 |
|
95 |
|
96 RBBIRuleBuilder *fRB; // The rule builder that we are part of. |
|
97 |
|
98 int32_t fScanIndex; // Index of current character being processed |
|
99 // in the rule input string. |
|
100 int32_t fNextIndex; // Index of the next character, which |
|
101 // is the first character not yet scanned. |
|
102 UBool fQuoteMode; // Scan is in a 'quoted region' |
|
103 int32_t fLineNum; // Line number in input file. |
|
104 int32_t fCharNum; // Char position within the line. |
|
105 UChar32 fLastChar; // Previous char, needed to count CR-LF |
|
106 // as a single line, not two. |
|
107 |
|
108 RBBIRuleChar fC; // Current char for parse state machine |
|
109 // processing. |
|
110 UnicodeString fVarName; // $variableName, valid when we've just |
|
111 // scanned one. |
|
112 |
|
113 RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule |
|
114 // parsing. index by p[state][char-class] |
|
115 |
|
116 uint16_t fStack[kStackSize]; // State stack, holds state pushes |
|
117 int32_t fStackPtr; // and pops as specified in the state |
|
118 // transition rules. |
|
119 |
|
120 RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created |
|
121 // during the parse of a rule |
|
122 int32_t fNodeStackPtr; |
|
123 |
|
124 |
|
125 UBool fReverseRule; // True if the rule currently being scanned |
|
126 // is a reverse direction rule (if it |
|
127 // starts with a '!') |
|
128 |
|
129 UBool fLookAheadRule; // True if the rule includes a '/' |
|
130 // somewhere within it. |
|
131 |
|
132 RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of |
|
133 // $variable symbols. |
|
134 |
|
135 UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to |
|
136 // the sets created while parsing rules. |
|
137 // The key is the string used for creating |
|
138 // the set. |
|
139 |
|
140 UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during |
|
141 // the scanning of RBBI rules. The |
|
142 // indicies for these are assigned by the |
|
143 // perl script that builds the state tables. |
|
144 // See rbbirpt.h. |
|
145 |
|
146 int32_t fRuleNum; // Counts each rule as it is scanned. |
|
147 |
|
148 int32_t fOptionStart; // Input index of start of a !!option |
|
149 // keyword, while being scanned. |
|
150 |
|
151 UnicodeSet *gRuleSet_rule_char; |
|
152 UnicodeSet *gRuleSet_white_space; |
|
153 UnicodeSet *gRuleSet_name_char; |
|
154 UnicodeSet *gRuleSet_name_start_char; |
|
155 |
|
156 RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class |
|
157 RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class |
|
158 }; |
|
159 |
|
160 U_NAMESPACE_END |
|
161 |
|
162 #endif |