|
1 // |
|
2 // file: rbbirb.cpp |
|
3 // |
|
4 // Copyright (C) 2002-2011, International Business Machines Corporation and others. |
|
5 // All Rights Reserved. |
|
6 // |
|
7 // This file contains the RBBIRuleBuilder class implementation. This is the main class for |
|
8 // building (compiling) break rules into the tables required by the runtime |
|
9 // RBBI engine. |
|
10 // |
|
11 |
|
12 #include "unicode/utypes.h" |
|
13 |
|
14 #if !UCONFIG_NO_BREAK_ITERATION |
|
15 |
|
16 #include "unicode/brkiter.h" |
|
17 #include "unicode/rbbi.h" |
|
18 #include "unicode/ubrk.h" |
|
19 #include "unicode/unistr.h" |
|
20 #include "unicode/uniset.h" |
|
21 #include "unicode/uchar.h" |
|
22 #include "unicode/uchriter.h" |
|
23 #include "unicode/parsepos.h" |
|
24 #include "unicode/parseerr.h" |
|
25 #include "cmemory.h" |
|
26 #include "cstring.h" |
|
27 |
|
28 #include "rbbirb.h" |
|
29 #include "rbbinode.h" |
|
30 |
|
31 #include "rbbiscan.h" |
|
32 #include "rbbisetb.h" |
|
33 #include "rbbitblb.h" |
|
34 #include "rbbidata.h" |
|
35 |
|
36 |
|
37 U_NAMESPACE_BEGIN |
|
38 |
|
39 |
|
40 //---------------------------------------------------------------------------------------- |
|
41 // |
|
42 // Constructor. |
|
43 // |
|
44 //---------------------------------------------------------------------------------------- |
|
45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, |
|
46 UParseError *parseErr, |
|
47 UErrorCode &status) |
|
48 : fRules(rules) |
|
49 { |
|
50 fStatus = &status; // status is checked below |
|
51 fParseError = parseErr; |
|
52 fDebugEnv = NULL; |
|
53 #ifdef RBBI_DEBUG |
|
54 fDebugEnv = getenv("U_RBBIDEBUG"); |
|
55 #endif |
|
56 |
|
57 |
|
58 fForwardTree = NULL; |
|
59 fReverseTree = NULL; |
|
60 fSafeFwdTree = NULL; |
|
61 fSafeRevTree = NULL; |
|
62 fDefaultTree = &fForwardTree; |
|
63 fForwardTables = NULL; |
|
64 fReverseTables = NULL; |
|
65 fSafeFwdTables = NULL; |
|
66 fSafeRevTables = NULL; |
|
67 fRuleStatusVals = NULL; |
|
68 fChainRules = FALSE; |
|
69 fLBCMNoChain = FALSE; |
|
70 fLookAheadHardBreak = FALSE; |
|
71 fUSetNodes = NULL; |
|
72 fRuleStatusVals = NULL; |
|
73 fScanner = NULL; |
|
74 fSetBuilder = NULL; |
|
75 if (parseErr) { |
|
76 uprv_memset(parseErr, 0, sizeof(UParseError)); |
|
77 } |
|
78 |
|
79 if (U_FAILURE(status)) { |
|
80 return; |
|
81 } |
|
82 |
|
83 fUSetNodes = new UVector(status); // bcos status gets overwritten here |
|
84 fRuleStatusVals = new UVector(status); |
|
85 fScanner = new RBBIRuleScanner(this); |
|
86 fSetBuilder = new RBBISetBuilder(this); |
|
87 if (U_FAILURE(status)) { |
|
88 return; |
|
89 } |
|
90 if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { |
|
91 status = U_MEMORY_ALLOCATION_ERROR; |
|
92 } |
|
93 } |
|
94 |
|
95 |
|
96 |
|
97 //---------------------------------------------------------------------------------------- |
|
98 // |
|
99 // Destructor |
|
100 // |
|
101 //---------------------------------------------------------------------------------------- |
|
102 RBBIRuleBuilder::~RBBIRuleBuilder() { |
|
103 |
|
104 int i; |
|
105 for (i=0; ; i++) { |
|
106 RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); |
|
107 if (n==NULL) { |
|
108 break; |
|
109 } |
|
110 delete n; |
|
111 } |
|
112 |
|
113 delete fUSetNodes; |
|
114 delete fSetBuilder; |
|
115 delete fForwardTables; |
|
116 delete fReverseTables; |
|
117 delete fSafeFwdTables; |
|
118 delete fSafeRevTables; |
|
119 |
|
120 delete fForwardTree; |
|
121 delete fReverseTree; |
|
122 delete fSafeFwdTree; |
|
123 delete fSafeRevTree; |
|
124 delete fScanner; |
|
125 delete fRuleStatusVals; |
|
126 } |
|
127 |
|
128 |
|
129 |
|
130 |
|
131 |
|
132 //---------------------------------------------------------------------------------------- |
|
133 // |
|
134 // flattenData() - Collect up the compiled RBBI rule data and put it into |
|
135 // the format for saving in ICU data files, |
|
136 // which is also the format needed by the RBBI runtime engine. |
|
137 // |
|
138 //---------------------------------------------------------------------------------------- |
|
// Round i up to the nearest multiple of 8; a value that is already a multiple
// of 8 is returned unchanged.
static int32_t align8(int32_t i) {
    const int32_t bumped = i + 7;
    return bumped & ~7;     // clear the three low-order bits
}
|
140 |
|
141 RBBIDataHeader *RBBIRuleBuilder::flattenData() { |
|
142 int32_t i; |
|
143 |
|
144 if (U_FAILURE(*fStatus)) { |
|
145 return NULL; |
|
146 } |
|
147 |
|
148 // Remove comments and whitespace from the rules to make it smaller. |
|
149 UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules)); |
|
150 |
|
151 // Calculate the size of each section in the data. |
|
152 // Sizes here are padded up to a multiple of 8 for better memory alignment. |
|
153 // Sections sizes actually stored in the header are for the actual data |
|
154 // without the padding. |
|
155 // |
|
156 int32_t headerSize = align8(sizeof(RBBIDataHeader)); |
|
157 int32_t forwardTableSize = align8(fForwardTables->getTableSize()); |
|
158 int32_t reverseTableSize = align8(fReverseTables->getTableSize()); |
|
159 int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); |
|
160 int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); |
|
161 int32_t trieSize = align8(fSetBuilder->getTrieSize()); |
|
162 int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); |
|
163 int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); |
|
164 |
|
165 int32_t totalSize = headerSize + forwardTableSize + reverseTableSize |
|
166 + safeFwdTableSize + safeRevTableSize |
|
167 + statusTableSize + trieSize + rulesSize; |
|
168 |
|
169 RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); |
|
170 if (data == NULL) { |
|
171 *fStatus = U_MEMORY_ALLOCATION_ERROR; |
|
172 return NULL; |
|
173 } |
|
174 uprv_memset(data, 0, totalSize); |
|
175 |
|
176 |
|
177 data->fMagic = 0xb1a0; |
|
178 data->fFormatVersion[0] = 3; |
|
179 data->fFormatVersion[1] = 1; |
|
180 data->fFormatVersion[2] = 0; |
|
181 data->fFormatVersion[3] = 0; |
|
182 data->fLength = totalSize; |
|
183 data->fCatCount = fSetBuilder->getNumCharCategories(); |
|
184 |
|
185 data->fFTable = headerSize; |
|
186 data->fFTableLen = forwardTableSize; |
|
187 data->fRTable = data->fFTable + forwardTableSize; |
|
188 data->fRTableLen = reverseTableSize; |
|
189 data->fSFTable = data->fRTable + reverseTableSize; |
|
190 data->fSFTableLen = safeFwdTableSize; |
|
191 data->fSRTable = data->fSFTable + safeFwdTableSize; |
|
192 data->fSRTableLen = safeRevTableSize; |
|
193 |
|
194 data->fTrie = data->fSRTable + safeRevTableSize; |
|
195 data->fTrieLen = fSetBuilder->getTrieSize(); |
|
196 data->fStatusTable = data->fTrie + trieSize; |
|
197 data->fStatusTableLen= statusTableSize; |
|
198 data->fRuleSource = data->fStatusTable + statusTableSize; |
|
199 data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); |
|
200 |
|
201 uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); |
|
202 |
|
203 fForwardTables->exportTable((uint8_t *)data + data->fFTable); |
|
204 fReverseTables->exportTable((uint8_t *)data + data->fRTable); |
|
205 fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); |
|
206 fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); |
|
207 fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); |
|
208 |
|
209 int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); |
|
210 for (i=0; i<fRuleStatusVals->size(); i++) { |
|
211 ruleStatusTable[i] = fRuleStatusVals->elementAti(i); |
|
212 } |
|
213 |
|
214 strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); |
|
215 |
|
216 return data; |
|
217 } |
|
218 |
|
219 |
|
220 |
|
221 |
|
222 |
|
223 |
|
224 //---------------------------------------------------------------------------------------- |
|
225 // |
|
226 // createRuleBasedBreakIterator construct from source rules that are passed in |
|
227 // in a UnicodeString |
|
228 // |
|
229 //---------------------------------------------------------------------------------------- |
|
230 BreakIterator * |
|
231 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, |
|
232 UParseError *parseError, |
|
233 UErrorCode &status) |
|
234 { |
|
235 // status checked below |
|
236 |
|
237 // |
|
238 // Read the input rules, generate a parse tree, symbol table, |
|
239 // and list of all Unicode Sets referenced by the rules. |
|
240 // |
|
241 RBBIRuleBuilder builder(rules, parseError, status); |
|
242 if (U_FAILURE(status)) { // status checked here bcos build below doesn't |
|
243 return NULL; |
|
244 } |
|
245 builder.fScanner->parse(); |
|
246 |
|
247 // |
|
248 // UnicodeSet processing. |
|
249 // Munge the Unicode Sets to create a set of character categories. |
|
250 // Generate the mapping tables (TRIE) from input 32-bit characters to |
|
251 // the character categories. |
|
252 // |
|
253 builder.fSetBuilder->build(); |
|
254 |
|
255 |
|
256 // |
|
257 // Generate the DFA state transition table. |
|
258 // |
|
259 builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); |
|
260 builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); |
|
261 builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); |
|
262 builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); |
|
263 if (builder.fForwardTables == NULL || builder.fReverseTables == NULL || |
|
264 builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL) |
|
265 { |
|
266 status = U_MEMORY_ALLOCATION_ERROR; |
|
267 delete builder.fForwardTables; builder.fForwardTables = NULL; |
|
268 delete builder.fReverseTables; builder.fReverseTables = NULL; |
|
269 delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL; |
|
270 delete builder.fSafeRevTables; builder.fSafeRevTables = NULL; |
|
271 return NULL; |
|
272 } |
|
273 |
|
274 builder.fForwardTables->build(); |
|
275 builder.fReverseTables->build(); |
|
276 builder.fSafeFwdTables->build(); |
|
277 builder.fSafeRevTables->build(); |
|
278 |
|
279 #ifdef RBBI_DEBUG |
|
280 if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { |
|
281 builder.fForwardTables->printRuleStatusTable(); |
|
282 } |
|
283 #endif |
|
284 |
|
285 // |
|
286 // Package up the compiled data into a memory image |
|
287 // in the run-time format. |
|
288 // |
|
289 RBBIDataHeader *data = builder.flattenData(); // returns NULL if error |
|
290 if (U_FAILURE(*builder.fStatus)) { |
|
291 return NULL; |
|
292 } |
|
293 |
|
294 |
|
295 // |
|
296 // Clean up the compiler related stuff |
|
297 // |
|
298 |
|
299 |
|
300 // |
|
301 // Create a break iterator from the compiled rules. |
|
302 // (Identical to creation from stored pre-compiled rules) |
|
303 // |
|
304 // status is checked after init in construction. |
|
305 RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); |
|
306 if (U_FAILURE(status)) { |
|
307 delete This; |
|
308 This = NULL; |
|
309 } |
|
310 else if(This == NULL) { // test for NULL |
|
311 status = U_MEMORY_ALLOCATION_ERROR; |
|
312 } |
|
313 return This; |
|
314 } |
|
315 |
|
316 U_NAMESPACE_END |
|
317 |
|
318 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |