1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbiscan.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1213 @@ 1.4 + 1.5 +// 1.6 +// file: rbbiscan.cpp 1.7 +// 1.8 +// Copyright (C) 2002-2012, International Business Machines Corporation and others. 1.9 +// All Rights Reserved. 1.10 +// 1.11 +// This file contains the Rule Based Break Iterator Rule Builder functions for 1.12 +// scanning the rules and assembling a parse tree. This is the first phase 1.13 +// of compiling the rules. 1.14 +// 1.15 +// The overall of the rules is managed by class RBBIRuleBuilder, which will 1.16 +// create and use an instance of this class as part of the process. 1.17 +// 1.18 + 1.19 +#include "unicode/utypes.h" 1.20 + 1.21 +#if !UCONFIG_NO_BREAK_ITERATION 1.22 + 1.23 +#include "unicode/unistr.h" 1.24 +#include "unicode/uniset.h" 1.25 +#include "unicode/uchar.h" 1.26 +#include "unicode/uchriter.h" 1.27 +#include "unicode/parsepos.h" 1.28 +#include "unicode/parseerr.h" 1.29 +#include "cmemory.h" 1.30 +#include "cstring.h" 1.31 + 1.32 +#include "rbbirpt.h" // Contains state table for the rbbi rules parser. 1.33 + // generated by a Perl script. 1.34 +#include "rbbirb.h" 1.35 +#include "rbbinode.h" 1.36 +#include "rbbiscan.h" 1.37 +#include "rbbitblb.h" 1.38 + 1.39 +#include "uassert.h" 1.40 + 1.41 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.42 + 1.43 +//------------------------------------------------------------------------------ 1.44 +// 1.45 +// Unicode Set init strings for each of the character classes needed for parsing a rule file. 1.46 +// (Initialized with hex values for portability to EBCDIC based machines. 1.47 +// Really ugly, but there's no good way to avoid it.) 1.48 +// 1.49 +// The sets are referred to by name in the rbbirpt.txt, which is the 1.50 +// source form of the state transition table for the RBBI rule parser. 1.51 +// 1.52 +//------------------------------------------------------------------------------ 1.53 +static const UChar gRuleSet_rule_char_pattern[] = { 1.54 + // [ ^ [ \ p { Z } \ u 0 0 2 0 1.55 + 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, 1.56 + // - \ u 0 0 7 f ] - [ \ p 1.57 + 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 1.58 + // { L } ] - [ \ p { N } ] ] 1.59 + 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; 1.60 + 1.61 +static const UChar gRuleSet_name_char_pattern[] = { 1.62 +// [ _ \ p { L } \ p { N } ] 1.63 + 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; 1.64 + 1.65 +static const UChar gRuleSet_digit_char_pattern[] = { 1.66 +// [ 0 - 9 ] 1.67 + 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; 1.68 + 1.69 +static const UChar gRuleSet_name_start_char_pattern[] = { 1.70 +// [ _ \ p { L } ] 1.71 + 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; 1.72 + 1.73 +static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" 1.74 + 1.75 + 1.76 +U_CDECL_BEGIN 1.77 +static void U_CALLCONV RBBISetTable_deleter(void *p) { 1.78 + icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p; 1.79 + delete px->key; 1.80 + // Note: px->val is owned by the linked list "fSetsListHead" in scanner. 1.81 + // Don't delete the value nodes here. 1.82 + uprv_free(px); 1.83 +} 1.84 +U_CDECL_END 1.85 + 1.86 +U_NAMESPACE_BEGIN 1.87 + 1.88 +//------------------------------------------------------------------------------ 1.89 +// 1.90 +// Constructor. 1.91 +// 1.92 +//------------------------------------------------------------------------------ 1.93 +RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) 1.94 +{ 1.95 + fRB = rb; 1.96 + fStackPtr = 0; 1.97 + fStack[fStackPtr] = 0; 1.98 + fNodeStackPtr = 0; 1.99 + fRuleNum = 0; 1.100 + fNodeStack[0] = NULL; 1.101 + 1.102 + fSymbolTable = NULL; 1.103 + fSetTable = NULL; 1.104 + 1.105 + fScanIndex = 0; 1.106 + fNextIndex = 0; 1.107 + 1.108 + fReverseRule = FALSE; 1.109 + fLookAheadRule = FALSE; 1.110 + 1.111 + fLineNum = 1; 1.112 + fCharNum = 0; 1.113 + fQuoteMode = FALSE; 1.114 + 1.115 + // Do not check status until after all critical fields are sufficiently initialized 1.116 + // that the destructor can run cleanly. 1.117 + if (U_FAILURE(*rb->fStatus)) { 1.118 + return; 1.119 + } 1.120 + 1.121 + // 1.122 + // Set up the constant Unicode Sets. 1.123 + // Note: These could be made static, lazily initialized, and shared among 1.124 + // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, 1.125 + // and the time to build these few sets should be small compared to a 1.126 + // full break iterator build. 1.127 + fRuleSets[kRuleSet_rule_char-128] 1.128 + = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus); 1.129 + // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:] 1.130 + fRuleSets[kRuleSet_white_space-128]. 1.131 + add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); 1.132 + fRuleSets[kRuleSet_name_char-128] 1.133 + = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus); 1.134 + fRuleSets[kRuleSet_name_start_char-128] 1.135 + = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus); 1.136 + fRuleSets[kRuleSet_digit_char-128] 1.137 + = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus); 1.138 + if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { 1.139 + // This case happens if ICU's data is missing. UnicodeSet tries to look up property 1.140 + // names from the init string, can't find them, and claims an illegal argument. 1.141 + // Change the error so that the actual problem will be clearer to users. 1.142 + *rb->fStatus = U_BRK_INIT_ERROR; 1.143 + } 1.144 + if (U_FAILURE(*rb->fStatus)) { 1.145 + return; 1.146 + } 1.147 + 1.148 + fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); 1.149 + if (fSymbolTable == NULL) { 1.150 + *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; 1.151 + return; 1.152 + } 1.153 + fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); 1.154 + if (U_FAILURE(*rb->fStatus)) { 1.155 + return; 1.156 + } 1.157 + uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); 1.158 +} 1.159 + 1.160 + 1.161 + 1.162 +//------------------------------------------------------------------------------ 1.163 +// 1.164 +// Destructor 1.165 +// 1.166 +//------------------------------------------------------------------------------ 1.167 +RBBIRuleScanner::~RBBIRuleScanner() { 1.168 + delete fSymbolTable; 1.169 + if (fSetTable != NULL) { 1.170 + uhash_close(fSetTable); 1.171 + fSetTable = NULL; 1.172 + 1.173 + } 1.174 + 1.175 + 1.176 + // Node Stack. 1.177 + // Normally has one entry, which is the entire parse tree for the rules. 1.178 + // If errors occured, there may be additional subtrees left on the stack. 1.179 + while (fNodeStackPtr > 0) { 1.180 + delete fNodeStack[fNodeStackPtr]; 1.181 + fNodeStackPtr--; 1.182 + } 1.183 + 1.184 +} 1.185 + 1.186 +//------------------------------------------------------------------------------ 1.187 +// 1.188 +// doParseAction Do some action during rule parsing. 1.189 +// Called by the parse state machine. 1.190 +// Actions build the parse tree and Unicode Sets, 1.191 +// and maintain the parse stack for nested expressions. 1.192 +// 1.193 +// TODO: unify EParseAction and RBBI_RuleParseAction enum types. 1.194 +// They represent exactly the same thing. They're separate 1.195 +// only to work around enum forward declaration restrictions 1.196 +// in some compilers, while at the same time avoiding multiple 1.197 +// definitions problems. I'm sure that there's a better way. 1.198 +// 1.199 +//------------------------------------------------------------------------------ 1.200 +UBool RBBIRuleScanner::doParseActions(int32_t action) 1.201 +{ 1.202 + RBBINode *n = NULL; 1.203 + 1.204 + UBool returnVal = TRUE; 1.205 + 1.206 + switch (action) { 1.207 + 1.208 + case doExprStart: 1.209 + pushNewNode(RBBINode::opStart); 1.210 + fRuleNum++; 1.211 + break; 1.212 + 1.213 + 1.214 + case doExprOrOperator: 1.215 + { 1.216 + fixOpStack(RBBINode::precOpCat); 1.217 + RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 1.218 + RBBINode *orNode = pushNewNode(RBBINode::opOr); 1.219 + orNode->fLeftChild = operandNode; 1.220 + operandNode->fParent = orNode; 1.221 + } 1.222 + break; 1.223 + 1.224 + case doExprCatOperator: 1.225 + // concatenation operator. 1.226 + // For the implicit concatenation of adjacent terms in an expression that are 1.227 + // not separated by any other operator. Action is invoked between the 1.228 + // actions for the two terms. 1.229 + { 1.230 + fixOpStack(RBBINode::precOpCat); 1.231 + RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 1.232 + RBBINode *catNode = pushNewNode(RBBINode::opCat); 1.233 + catNode->fLeftChild = operandNode; 1.234 + operandNode->fParent = catNode; 1.235 + } 1.236 + break; 1.237 + 1.238 + case doLParen: 1.239 + // Open Paren. 1.240 + // The openParen node is a dummy operation type with a low precedence, 1.241 + // which has the affect of ensuring that any real binary op that 1.242 + // follows within the parens binds more tightly to the operands than 1.243 + // stuff outside of the parens. 1.244 + pushNewNode(RBBINode::opLParen); 1.245 + break; 1.246 + 1.247 + case doExprRParen: 1.248 + fixOpStack(RBBINode::precLParen); 1.249 + break; 1.250 + 1.251 + case doNOP: 1.252 + break; 1.253 + 1.254 + case doStartAssign: 1.255 + // We've just scanned "$variable = " 1.256 + // The top of the node stack has the $variable ref node. 1.257 + 1.258 + // Save the start position of the RHS text in the StartExpression node 1.259 + // that precedes the $variableReference node on the stack. 1.260 + // This will eventually be used when saving the full $variable replacement 1.261 + // text as a string. 1.262 + n = fNodeStack[fNodeStackPtr-1]; 1.263 + n->fFirstPos = fNextIndex; // move past the '=' 1.264 + 1.265 + // Push a new start-of-expression node; needed to keep parse of the 1.266 + // RHS expression happy. 1.267 + pushNewNode(RBBINode::opStart); 1.268 + break; 1.269 + 1.270 + 1.271 + 1.272 + 1.273 + case doEndAssign: 1.274 + { 1.275 + // We have reached the end of an assignement statement. 1.276 + // Current scan char is the ';' that terminates the assignment. 1.277 + 1.278 + // Terminate expression, leaves expression parse tree rooted in TOS node. 1.279 + fixOpStack(RBBINode::precStart); 1.280 + 1.281 + RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; 1.282 + RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; 1.283 + RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; 1.284 + 1.285 + // Save original text of right side of assignment, excluding the terminating ';' 1.286 + // in the root of the node for the right-hand-side expression. 1.287 + RHSExprNode->fFirstPos = startExprNode->fFirstPos; 1.288 + RHSExprNode->fLastPos = fScanIndex; 1.289 + fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); 1.290 + 1.291 + // Expression parse tree becomes l. child of the $variable reference node. 1.292 + varRefNode->fLeftChild = RHSExprNode; 1.293 + RHSExprNode->fParent = varRefNode; 1.294 + 1.295 + // Make a symbol table entry for the $variableRef node. 1.296 + fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); 1.297 + if (U_FAILURE(*fRB->fStatus)) { 1.298 + // This is a round-about way to get the parse position set 1.299 + // so that duplicate symbols error messages include a line number. 1.300 + UErrorCode t = *fRB->fStatus; 1.301 + *fRB->fStatus = U_ZERO_ERROR; 1.302 + error(t); 1.303 + } 1.304 + 1.305 + // Clean up the stack. 1.306 + delete startExprNode; 1.307 + fNodeStackPtr-=3; 1.308 + break; 1.309 + } 1.310 + 1.311 + case doEndOfRule: 1.312 + { 1.313 + fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression 1.314 + if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. 1.315 + break; 1.316 + } 1.317 +#ifdef RBBI_DEBUG 1.318 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} 1.319 +#endif 1.320 + U_ASSERT(fNodeStackPtr == 1); 1.321 + 1.322 + // If this rule includes a look-ahead '/', add a endMark node to the 1.323 + // expression tree. 1.324 + if (fLookAheadRule) { 1.325 + RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 1.326 + RBBINode *endNode = pushNewNode(RBBINode::endMark); 1.327 + RBBINode *catNode = pushNewNode(RBBINode::opCat); 1.328 + fNodeStackPtr -= 2; 1.329 + catNode->fLeftChild = thisRule; 1.330 + catNode->fRightChild = endNode; 1.331 + fNodeStack[fNodeStackPtr] = catNode; 1.332 + endNode->fVal = fRuleNum; 1.333 + endNode->fLookAheadEnd = TRUE; 1.334 + } 1.335 + 1.336 + // All rule expressions are ORed together. 1.337 + // The ';' that terminates an expression really just functions as a '|' with 1.338 + // a low operator prededence. 1.339 + // 1.340 + // Each of the four sets of rules are collected separately. 1.341 + // (forward, reverse, safe_forward, safe_reverse) 1.342 + // OR this rule into the appropriate group of them. 1.343 + // 1.344 + RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree); 1.345 + 1.346 + if (*destRules != NULL) { 1.347 + // This is not the first rule encounted. 1.348 + // OR previous stuff (from *destRules) 1.349 + // with the current rule expression (on the Node Stack) 1.350 + // with the resulting OR expression going to *destRules 1.351 + // 1.352 + RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 1.353 + RBBINode *prevRules = *destRules; 1.354 + RBBINode *orNode = pushNewNode(RBBINode::opOr); 1.355 + orNode->fLeftChild = prevRules; 1.356 + prevRules->fParent = orNode; 1.357 + orNode->fRightChild = thisRule; 1.358 + thisRule->fParent = orNode; 1.359 + *destRules = orNode; 1.360 + } 1.361 + else 1.362 + { 1.363 + // This is the first rule encountered (for this direction). 1.364 + // Just move its parse tree from the stack to *destRules. 1.365 + *destRules = fNodeStack[fNodeStackPtr]; 1.366 + } 1.367 + fReverseRule = FALSE; // in preparation for the next rule. 1.368 + fLookAheadRule = FALSE; 1.369 + fNodeStackPtr = 0; 1.370 + } 1.371 + break; 1.372 + 1.373 + 1.374 + case doRuleError: 1.375 + error(U_BRK_RULE_SYNTAX); 1.376 + returnVal = FALSE; 1.377 + break; 1.378 + 1.379 + 1.380 + case doVariableNameExpectedErr: 1.381 + error(U_BRK_RULE_SYNTAX); 1.382 + break; 1.383 + 1.384 + 1.385 + // 1.386 + // Unary operands + ? * 1.387 + // These all appear after the operand to which they apply. 1.388 + // When we hit one, the operand (may be a whole sub expression) 1.389 + // will be on the top of the stack. 1.390 + // Unary Operator becomes TOS, with the old TOS as its one child. 1.391 + case doUnaryOpPlus: 1.392 + { 1.393 + RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 1.394 + RBBINode *plusNode = pushNewNode(RBBINode::opPlus); 1.395 + plusNode->fLeftChild = operandNode; 1.396 + operandNode->fParent = plusNode; 1.397 + } 1.398 + break; 1.399 + 1.400 + case doUnaryOpQuestion: 1.401 + { 1.402 + RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 1.403 + RBBINode *qNode = pushNewNode(RBBINode::opQuestion); 1.404 + qNode->fLeftChild = operandNode; 1.405 + operandNode->fParent = qNode; 1.406 + } 1.407 + break; 1.408 + 1.409 + case doUnaryOpStar: 1.410 + { 1.411 + RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 1.412 + RBBINode *starNode = pushNewNode(RBBINode::opStar); 1.413 + starNode->fLeftChild = operandNode; 1.414 + operandNode->fParent = starNode; 1.415 + } 1.416 + break; 1.417 + 1.418 + case doRuleChar: 1.419 + // A "Rule Character" is any single character that is a literal part 1.420 + // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]" 1.421 + // These are pretty uncommon in break rules; the terms are more commonly 1.422 + // sets. To keep things uniform, treat these characters like as 1.423 + // sets that just happen to contain only one character. 1.424 + { 1.425 + n = pushNewNode(RBBINode::setRef); 1.426 + findSetFor(UnicodeString(fC.fChar), n); 1.427 + n->fFirstPos = fScanIndex; 1.428 + n->fLastPos = fNextIndex; 1.429 + fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1.430 + break; 1.431 + } 1.432 + 1.433 + case doDotAny: 1.434 + // scanned a ".", meaning match any single character. 1.435 + { 1.436 + n = pushNewNode(RBBINode::setRef); 1.437 + findSetFor(UnicodeString(TRUE, kAny, 3), n); 1.438 + n->fFirstPos = fScanIndex; 1.439 + n->fLastPos = fNextIndex; 1.440 + fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1.441 + break; 1.442 + } 1.443 + 1.444 + case doSlash: 1.445 + // Scanned a '/', which identifies a look-ahead break position in a rule. 1.446 + n = pushNewNode(RBBINode::lookAhead); 1.447 + n->fVal = fRuleNum; 1.448 + n->fFirstPos = fScanIndex; 1.449 + n->fLastPos = fNextIndex; 1.450 + fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1.451 + fLookAheadRule = TRUE; 1.452 + break; 1.453 + 1.454 + 1.455 + case doStartTagValue: 1.456 + // Scanned a '{', the opening delimiter for a tag value within a rule. 1.457 + n = pushNewNode(RBBINode::tag); 1.458 + n->fVal = 0; 1.459 + n->fFirstPos = fScanIndex; 1.460 + n->fLastPos = fNextIndex; 1.461 + break; 1.462 + 1.463 + case doTagDigit: 1.464 + // Just scanned a decimal digit that's part of a tag value 1.465 + { 1.466 + n = fNodeStack[fNodeStackPtr]; 1.467 + uint32_t v = u_charDigitValue(fC.fChar); 1.468 + U_ASSERT(v < 10); 1.469 + n->fVal = n->fVal*10 + v; 1.470 + break; 1.471 + } 1.472 + 1.473 + case doTagValue: 1.474 + n = fNodeStack[fNodeStackPtr]; 1.475 + n->fLastPos = fNextIndex; 1.476 + fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1.477 + break; 1.478 + 1.479 + case doTagExpectedError: 1.480 + error(U_BRK_MALFORMED_RULE_TAG); 1.481 + returnVal = FALSE; 1.482 + break; 1.483 + 1.484 + case doOptionStart: 1.485 + // Scanning a !!option. At the start of string. 1.486 + fOptionStart = fScanIndex; 1.487 + break; 1.488 + 1.489 + case doOptionEnd: 1.490 + { 1.491 + UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); 1.492 + if (opt == UNICODE_STRING("chain", 5)) { 1.493 + fRB->fChainRules = TRUE; 1.494 + } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { 1.495 + fRB->fLBCMNoChain = TRUE; 1.496 + } else if (opt == UNICODE_STRING("forward", 7)) { 1.497 + fRB->fDefaultTree = &fRB->fForwardTree; 1.498 + } else if (opt == UNICODE_STRING("reverse", 7)) { 1.499 + fRB->fDefaultTree = &fRB->fReverseTree; 1.500 + } else if (opt == UNICODE_STRING("safe_forward", 12)) { 1.501 + fRB->fDefaultTree = &fRB->fSafeFwdTree; 1.502 + } else if (opt == UNICODE_STRING("safe_reverse", 12)) { 1.503 + fRB->fDefaultTree = &fRB->fSafeRevTree; 1.504 + } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { 1.505 + fRB->fLookAheadHardBreak = TRUE; 1.506 + } else { 1.507 + error(U_BRK_UNRECOGNIZED_OPTION); 1.508 + } 1.509 + } 1.510 + break; 1.511 + 1.512 + case doReverseDir: 1.513 + fReverseRule = TRUE; 1.514 + break; 1.515 + 1.516 + case doStartVariableName: 1.517 + n = pushNewNode(RBBINode::varRef); 1.518 + if (U_FAILURE(*fRB->fStatus)) { 1.519 + break; 1.520 + } 1.521 + n->fFirstPos = fScanIndex; 1.522 + break; 1.523 + 1.524 + case doEndVariableName: 1.525 + n = fNodeStack[fNodeStackPtr]; 1.526 + if (n==NULL || n->fType != RBBINode::varRef) { 1.527 + error(U_BRK_INTERNAL_ERROR); 1.528 + break; 1.529 + } 1.530 + n->fLastPos = fScanIndex; 1.531 + fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); 1.532 + // Look the newly scanned name up in the symbol table 1.533 + // If there's an entry, set the l. child of the var ref to the replacement expression. 1.534 + // (We also pass through here when scanning assignments, but no harm is done, other 1.535 + // than a slight wasted effort that seems hard to avoid. Lookup will be null) 1.536 + n->fLeftChild = fSymbolTable->lookupNode(n->fText); 1.537 + break; 1.538 + 1.539 + case doCheckVarDef: 1.540 + n = fNodeStack[fNodeStackPtr]; 1.541 + if (n->fLeftChild == NULL) { 1.542 + error(U_BRK_UNDEFINED_VARIABLE); 1.543 + returnVal = FALSE; 1.544 + } 1.545 + break; 1.546 + 1.547 + case doExprFinished: 1.548 + break; 1.549 + 1.550 + case doRuleErrorAssignExpr: 1.551 + error(U_BRK_ASSIGN_ERROR); 1.552 + returnVal = FALSE; 1.553 + break; 1.554 + 1.555 + case doExit: 1.556 + returnVal = FALSE; 1.557 + break; 1.558 + 1.559 + case doScanUnicodeSet: 1.560 + scanSet(); 1.561 + break; 1.562 + 1.563 + default: 1.564 + error(U_BRK_INTERNAL_ERROR); 1.565 + returnVal = FALSE; 1.566 + break; 1.567 + } 1.568 + return returnVal; 1.569 +} 1.570 + 1.571 + 1.572 + 1.573 + 1.574 +//------------------------------------------------------------------------------ 1.575 +// 1.576 +// Error Report a rule parse error. 1.577 +// Only report it if no previous error has been recorded. 1.578 +// 1.579 +//------------------------------------------------------------------------------ 1.580 +void RBBIRuleScanner::error(UErrorCode e) { 1.581 + if (U_SUCCESS(*fRB->fStatus)) { 1.582 + *fRB->fStatus = e; 1.583 + if (fRB->fParseError) { 1.584 + fRB->fParseError->line = fLineNum; 1.585 + fRB->fParseError->offset = fCharNum; 1.586 + fRB->fParseError->preContext[0] = 0; 1.587 + fRB->fParseError->preContext[0] = 0; 1.588 + } 1.589 + } 1.590 +} 1.591 + 1.592 + 1.593 + 1.594 + 1.595 +//------------------------------------------------------------------------------ 1.596 +// 1.597 +// fixOpStack The parse stack holds partially assembled chunks of the parse tree. 1.598 +// An entry on the stack may be as small as a single setRef node, 1.599 +// or as large as the parse tree 1.600 +// for an entire expression (this will be the one item left on the stack 1.601 +// when the parsing of an RBBI rule completes. 1.602 +// 1.603 +// This function is called when a binary operator is encountered. 1.604 +// It looks back up the stack for operators that are not yet associated 1.605 +// with a right operand, and if the precedence of the stacked operator >= 1.606 +// the precedence of the current operator, binds the operand left, 1.607 +// to the previously encountered operator. 1.608 +// 1.609 +//------------------------------------------------------------------------------ 1.610 +void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { 1.611 + RBBINode *n; 1.612 + // printNodeStack("entering fixOpStack()"); 1.613 + for (;;) { 1.614 + n = fNodeStack[fNodeStackPtr-1]; // an operator node 1.615 + if (n->fPrecedence == 0) { 1.616 + RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); 1.617 + error(U_BRK_INTERNAL_ERROR); 1.618 + return; 1.619 + } 1.620 + 1.621 + if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { 1.622 + // The most recent operand goes with the current operator, 1.623 + // not with the previously stacked one. 1.624 + break; 1.625 + } 1.626 + // Stack operator is a binary op ( '|' or concatenation) 1.627 + // TOS operand becomes right child of this operator. 1.628 + // Resulting subexpression becomes the TOS operand. 1.629 + n->fRightChild = fNodeStack[fNodeStackPtr]; 1.630 + fNodeStack[fNodeStackPtr]->fParent = n; 1.631 + fNodeStackPtr--; 1.632 + // printNodeStack("looping in fixOpStack() "); 1.633 + } 1.634 + 1.635 + if (p <= RBBINode::precLParen) { 1.636 + // Scan is at a right paren or end of expression. 1.637 + // The scanned item must match the stack, or else there was an error. 1.638 + // Discard the left paren (or start expr) node from the stack, 1.639 + // leaving the completed (sub)expression as TOS. 1.640 + if (n->fPrecedence != p) { 1.641 + // Right paren encountered matched start of expression node, or 1.642 + // end of expression matched with a left paren node. 1.643 + error(U_BRK_MISMATCHED_PAREN); 1.644 + } 1.645 + fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; 1.646 + fNodeStackPtr--; 1.647 + // Delete the now-discarded LParen or Start node. 1.648 + delete n; 1.649 + } 1.650 + // printNodeStack("leaving fixOpStack()"); 1.651 +} 1.652 + 1.653 + 1.654 + 1.655 + 1.656 +//------------------------------------------------------------------------------ 1.657 +// 1.658 +// findSetFor given a UnicodeString, 1.659 +// - find the corresponding Unicode Set (uset node) 1.660 +// (create one if necessary) 1.661 +// - Set fLeftChild of the caller's node (should be a setRef node) 1.662 +// to the uset node 1.663 +// Maintain a hash table of uset nodes, so the same one is always used 1.664 +// for the same string. 1.665 +// If a "to adopt" set is provided and we haven't seen this key before, 1.666 +// add the provided set to the hash table. 1.667 +// If the string is one (32 bit) char in length, the set contains 1.668 +// just one element which is the char in question. 1.669 +// If the string is "any", return a set containing all chars. 1.670 +// 1.671 +//------------------------------------------------------------------------------ 1.672 +void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { 1.673 + 1.674 + RBBISetTableEl *el; 1.675 + 1.676 + // First check whether we've already cached a set for this string. 1.677 + // If so, just use the cached set in the new node. 1.678 + // delete any set provided by the caller, since we own it. 1.679 + el = (RBBISetTableEl *)uhash_get(fSetTable, &s); 1.680 + if (el != NULL) { 1.681 + delete setToAdopt; 1.682 + node->fLeftChild = el->val; 1.683 + U_ASSERT(node->fLeftChild->fType == RBBINode::uset); 1.684 + return; 1.685 + } 1.686 + 1.687 + // Haven't seen this set before. 1.688 + // If the caller didn't provide us with a prebuilt set, 1.689 + // create a new UnicodeSet now. 1.690 + if (setToAdopt == NULL) { 1.691 + if (s.compare(kAny, -1) == 0) { 1.692 + setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 1.693 + } else { 1.694 + UChar32 c; 1.695 + c = s.char32At(0); 1.696 + setToAdopt = new UnicodeSet(c, c); 1.697 + } 1.698 + } 1.699 + 1.700 + // 1.701 + // Make a new uset node to refer to this UnicodeSet 1.702 + // This new uset node becomes the child of the caller's setReference node. 1.703 + // 1.704 + RBBINode *usetNode = new RBBINode(RBBINode::uset); 1.705 + if (usetNode == NULL) { 1.706 + error(U_MEMORY_ALLOCATION_ERROR); 1.707 + return; 1.708 + } 1.709 + usetNode->fInputSet = setToAdopt; 1.710 + usetNode->fParent = node; 1.711 + node->fLeftChild = usetNode; 1.712 + usetNode->fText = s; 1.713 + 1.714 + 1.715 + // 1.716 + // Add the new uset node to the list of all uset nodes. 1.717 + // 1.718 + fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); 1.719 + 1.720 + 1.721 + // 1.722 + // Add the new set to the set hash table. 1.723 + // 1.724 + el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); 1.725 + UnicodeString *tkey = new UnicodeString(s); 1.726 + if (tkey == NULL || el == NULL || setToAdopt == NULL) { 1.727 + // Delete to avoid memory leak 1.728 + delete tkey; 1.729 + tkey = NULL; 1.730 + uprv_free(el); 1.731 + el = NULL; 1.732 + delete setToAdopt; 1.733 + setToAdopt = NULL; 1.734 + 1.735 + error(U_MEMORY_ALLOCATION_ERROR); 1.736 + return; 1.737 + } 1.738 + el->key = tkey; 1.739 + el->val = usetNode; 1.740 + uhash_put(fSetTable, el->key, el, fRB->fStatus); 1.741 + 1.742 + return; 1.743 +} 1.744 + 1.745 + 1.746 + 1.747 +// 1.748 +// Assorted Unicode character constants. 1.749 +// Numeric because there is no portable way to enter them as literals. 1.750 +// (Think EBCDIC). 1.751 +// 1.752 +static const UChar chCR = 0x0d; // New lines, for terminating comments. 1.753 +static const UChar chLF = 0x0a; 1.754 +static const UChar chNEL = 0x85; // NEL newline variant 1.755 +static const UChar chLS = 0x2028; // Unicode Line Separator 1.756 +static const UChar chApos = 0x27; // single quote, for quoted chars. 1.757 +static const UChar chPound = 0x23; // '#', introduces a comment. 1.758 +static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 1.759 +static const UChar chLParen = 0x28; 1.760 +static const UChar chRParen = 0x29; 1.761 + 1.762 + 1.763 +//------------------------------------------------------------------------------ 1.764 +// 1.765 +// stripRules Return a rules string without unnecessary 1.766 +// characters. 1.767 +// 1.768 +//------------------------------------------------------------------------------ 1.769 +UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { 1.770 + UnicodeString strippedRules; 1.771 + int rulesLength = rules.length(); 1.772 + for (int idx = 0; idx < rulesLength; ) { 1.773 + UChar ch = rules[idx++]; 1.774 + if (ch == chPound) { 1.775 + while (idx < rulesLength 1.776 + && ch != chCR && ch != chLF && ch != chNEL) 1.777 + { 1.778 + ch = rules[idx++]; 1.779 + } 1.780 + } 1.781 + if (!u_isISOControl(ch)) { 1.782 + strippedRules.append(ch); 1.783 + } 1.784 + } 1.785 + // strippedRules = strippedRules.unescape(); 1.786 + return strippedRules; 1.787 +} 1.788 + 1.789 + 1.790 +//------------------------------------------------------------------------------ 1.791 +// 1.792 +// nextCharLL Low Level Next Char from rule input source. 1.793 +// Get a char from the input character iterator, 1.794 +// keep track of input position for error reporting. 1.795 +// 1.796 +//------------------------------------------------------------------------------ 1.797 +UChar32 RBBIRuleScanner::nextCharLL() { 1.798 + UChar32 ch; 1.799 + 1.800 + if (fNextIndex >= fRB->fRules.length()) { 1.801 + return (UChar32)-1; 1.802 + } 1.803 + ch = fRB->fRules.char32At(fNextIndex); 1.804 + fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); 1.805 + 1.806 + if (ch == chCR || 1.807 + ch == chNEL || 1.808 + ch == chLS || 1.809 + (ch == chLF && fLastChar != chCR)) { 1.810 + // Character is starting a new line. Bump up the line number, and 1.811 + // reset the column to 0. 1.812 + fLineNum++; 1.813 + fCharNum=0; 1.814 + if (fQuoteMode) { 1.815 + error(U_BRK_NEW_LINE_IN_QUOTED_STRING); 1.816 + fQuoteMode = FALSE; 1.817 + } 1.818 + } 1.819 + else { 1.820 + // Character is not starting a new line. Except in the case of a 1.821 + // LF following a CR, increment the column position. 1.822 + if (ch != chLF) { 1.823 + fCharNum++; 1.824 + } 1.825 + } 1.826 + fLastChar = ch; 1.827 + return ch; 1.828 +} 1.829 + 1.830 + 1.831 +//------------------------------------------------------------------------------ 1.832 +// 1.833 +// nextChar for rules scanning. At this level, we handle stripping 1.834 +// out comments and processing backslash character escapes. 1.835 +// The rest of the rules grammar is handled at the next level up. 1.836 +// 1.837 +//------------------------------------------------------------------------------ 1.838 +void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { 1.839 + 1.840 + // Unicode Character constants needed for the processing done by nextChar(), 1.841 + // in hex because literals wont work on EBCDIC machines. 1.842 + 1.843 + fScanIndex = fNextIndex; 1.844 + c.fChar = nextCharLL(); 1.845 + c.fEscaped = FALSE; 1.846 + 1.847 + // 1.848 + // check for '' sequence. 1.849 + // These are recognized in all contexts, whether in quoted text or not. 1.850 + // 1.851 + if (c.fChar == chApos) { 1.852 + if (fRB->fRules.char32At(fNextIndex) == chApos) { 1.853 + c.fChar = nextCharLL(); // get nextChar officially so character counts 1.854 + c.fEscaped = TRUE; // stay correct. 1.855 + } 1.856 + else 1.857 + { 1.858 + // Single quote, by itself. 1.859 + // Toggle quoting mode. 1.860 + // Return either '(' or ')', because quotes cause a grouping of the quoted text. 1.861 + fQuoteMode = !fQuoteMode; 1.862 + if (fQuoteMode == TRUE) { 1.863 + c.fChar = chLParen; 1.864 + } else { 1.865 + c.fChar = chRParen; 1.866 + } 1.867 + c.fEscaped = FALSE; // The paren that we return is not escaped. 1.868 + return; 1.869 + } 1.870 + } 1.871 + 1.872 + if (fQuoteMode) { 1.873 + c.fEscaped = TRUE; 1.874 + } 1.875 + else 1.876 + { 1.877 + // We are not in a 'quoted region' of the source. 1.878 + // 1.879 + if (c.fChar == chPound) { 1.880 + // Start of a comment. Consume the rest of it. 1.881 + // The new-line char that terminates the comment is always returned. 1.882 + // It will be treated as white-space, and serves to break up anything 1.883 + // that might otherwise incorrectly clump together with a comment in 1.884 + // the middle (a variable name, for example.) 1.885 + for (;;) { 1.886 + c.fChar = nextCharLL(); 1.887 + if (c.fChar == (UChar32)-1 || // EOF 1.888 + c.fChar == chCR || 1.889 + c.fChar == chLF || 1.890 + c.fChar == chNEL || 1.891 + c.fChar == chLS) {break;} 1.892 + } 1.893 + } 1.894 + if (c.fChar == (UChar32)-1) { 1.895 + return; 1.896 + } 1.897 + 1.898 + // 1.899 + // check for backslash escaped characters. 1.900 + // Use UnicodeString::unescapeAt() to handle them. 1.901 + // 1.902 + if (c.fChar == chBackSlash) { 1.903 + c.fEscaped = TRUE; 1.904 + int32_t startX = fNextIndex; 1.905 + c.fChar = fRB->fRules.unescapeAt(fNextIndex); 1.906 + if (fNextIndex == startX) { 1.907 + error(U_BRK_HEX_DIGITS_EXPECTED); 1.908 + } 1.909 + fCharNum += fNextIndex-startX; 1.910 + } 1.911 + } 1.912 + // putc(c.fChar, stdout); 1.913 +} 1.914 + 1.915 +//------------------------------------------------------------------------------ 1.916 +// 1.917 +// Parse RBBI rules. The state machine for rules parsing is here. 1.918 +// The state tables are hand-written in the file rbbirpt.txt, 1.919 +// and converted to the form used here by a perl 1.920 +// script rbbicst.pl 1.921 +// 1.922 +//------------------------------------------------------------------------------ 1.923 +void RBBIRuleScanner::parse() { 1.924 + uint16_t state; 1.925 + const RBBIRuleTableEl *tableEl; 1.926 + 1.927 + if (U_FAILURE(*fRB->fStatus)) { 1.928 + return; 1.929 + } 1.930 + 1.931 + state = 1; 1.932 + nextChar(fC); 1.933 + // 1.934 + // Main loop for the rule parsing state machine. 1.935 + // Runs once per state transition. 1.936 + // Each time through optionally performs, depending on the state table, 1.937 + // - an advance to the the next input char 1.938 + // - an action to be performed. 1.939 + // - pushing or popping a state to/from the local state return stack. 1.940 + // 1.941 + for (;;) { 1.942 + // Bail out if anything has gone wrong. 1.943 + // RBBI rule file parsing stops on the first error encountered. 1.944 + if (U_FAILURE(*fRB->fStatus)) { 1.945 + break; 1.946 + } 1.947 + 1.948 + // Quit if state == 0. This is the normal way to exit the state machine. 1.949 + // 1.950 + if (state == 0) { 1.951 + break; 1.952 + } 1.953 + 1.954 + // Find the state table element that matches the input char from the rule, or the 1.955 + // class of the input character. Start with the first table row for this 1.956 + // state, then linearly scan forward until we find a row that matches the 1.957 + // character. The last row for each state always matches all characters, so 1.958 + // the search will stop there, if not before. 1.959 + // 1.960 + tableEl = &gRuleParseStateTable[state]; 1.961 + #ifdef RBBI_DEBUG 1.962 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { 1.963 + RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", 1.964 + fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); 1.965 + } 1.966 + #endif 1.967 + 1.968 + for (;;) { 1.969 + #ifdef RBBI_DEBUG 1.970 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");} 1.971 + #endif 1.972 + if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) { 1.973 + // Table row specified an individual character, not a set, and 1.974 + // the input character is not escaped, and 1.975 + // the input character matched it. 1.976 + break; 1.977 + } 1.978 + if (tableEl->fCharClass == 255) { 1.979 + // Table row specified default, match anything character class. 1.980 + break; 1.981 + } 1.982 + if (tableEl->fCharClass == 254 && fC.fEscaped) { 1.983 + // Table row specified "escaped" and the char was escaped. 1.984 + break; 1.985 + } 1.986 + if (tableEl->fCharClass == 253 && fC.fEscaped && 1.987 + (fC.fChar == 0x50 || fC.fChar == 0x70 )) { 1.988 + // Table row specified "escaped P" and the char is either 'p' or 'P'. 1.989 + break; 1.990 + } 1.991 + if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { 1.992 + // Table row specified eof and we hit eof on the input. 1.993 + break; 1.994 + } 1.995 + 1.996 + if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 1.997 + fC.fEscaped == FALSE && // char is not escaped && 1.998 + fC.fChar != (UChar32)-1) { // char is not EOF 1.999 + U_ASSERT((tableEl->fCharClass-128) < LENGTHOF(fRuleSets)); 1.1000 + if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 1.1001 + // Table row specified a character class, or set of characters, 1.1002 + // and the current char matches it. 1.1003 + break; 1.1004 + } 1.1005 + } 1.1006 + 1.1007 + // No match on this row, advance to the next row for this state, 1.1008 + tableEl++; 1.1009 + } 1.1010 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} 1.1011 + 1.1012 + // 1.1013 + // We've found the row of the state table that matches the current input 1.1014 + // character from the rules string. 1.1015 + // Perform any action specified by this row in the state table. 1.1016 + if (doParseActions((int32_t)tableEl->fAction) == FALSE) { 1.1017 + // Break out of the state machine loop if the 1.1018 + // the action signalled some kind of error, or 1.1019 + // the action was to exit, occurs on normal end-of-rules-input. 1.1020 + break; 1.1021 + } 1.1022 + 1.1023 + if (tableEl->fPushState != 0) { 1.1024 + fStackPtr++; 1.1025 + if (fStackPtr >= kStackSize) { 1.1026 + error(U_BRK_INTERNAL_ERROR); 1.1027 + RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); 1.1028 + fStackPtr--; 1.1029 + } 1.1030 + fStack[fStackPtr] = tableEl->fPushState; 1.1031 + } 1.1032 + 1.1033 + if (tableEl->fNextChar) { 1.1034 + nextChar(fC); 1.1035 + } 1.1036 + 1.1037 + // Get the next state from the table entry, or from the 1.1038 + // state stack if the next state was specified as "pop". 1.1039 + if (tableEl->fNextState != 255) { 1.1040 + state = tableEl->fNextState; 1.1041 + } else { 1.1042 + state = fStack[fStackPtr]; 1.1043 + fStackPtr--; 1.1044 + if (fStackPtr < 0) { 1.1045 + error(U_BRK_INTERNAL_ERROR); 1.1046 + RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); 1.1047 + fStackPtr++; 1.1048 + } 1.1049 + } 1.1050 + 1.1051 + } 1.1052 + 1.1053 + // 1.1054 + // If there were NO user specified reverse rules, set up the equivalent of ".*;" 1.1055 + // 1.1056 + if (fRB->fReverseTree == NULL) { 1.1057 + fRB->fReverseTree = pushNewNode(RBBINode::opStar); 1.1058 + RBBINode *operand = pushNewNode(RBBINode::setRef); 1.1059 + findSetFor(UnicodeString(TRUE, kAny, 3), operand); 1.1060 + fRB->fReverseTree->fLeftChild = operand; 1.1061 + operand->fParent = fRB->fReverseTree; 1.1062 + fNodeStackPtr -= 2; 1.1063 + } 1.1064 + 1.1065 + 1.1066 + // 1.1067 + // Parsing of the input RBBI rules is complete. 1.1068 + // We now have a parse tree for the rule expressions 1.1069 + // and a list of all UnicodeSets that are referenced. 1.1070 + // 1.1071 +#ifdef RBBI_DEBUG 1.1072 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} 1.1073 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) 1.1074 + { 1.1075 + RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); 1.1076 + fRB->fForwardTree->printTree(TRUE); 1.1077 + RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); 1.1078 + fRB->fReverseTree->printTree(TRUE); 1.1079 + RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); 1.1080 + fRB->fSafeFwdTree->printTree(TRUE); 1.1081 + RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); 1.1082 + fRB->fSafeRevTree->printTree(TRUE); 1.1083 + } 1.1084 +#endif 1.1085 +} 1.1086 + 1.1087 + 1.1088 +//------------------------------------------------------------------------------ 1.1089 +// 1.1090 +// printNodeStack for debugging... 1.1091 +// 1.1092 +//------------------------------------------------------------------------------ 1.1093 +#ifdef RBBI_DEBUG 1.1094 +void RBBIRuleScanner::printNodeStack(const char *title) { 1.1095 + int i; 1.1096 + RBBIDebugPrintf("%s. Dumping node stack...\n", title); 1.1097 + for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);} 1.1098 +} 1.1099 +#endif 1.1100 + 1.1101 + 1.1102 + 1.1103 + 1.1104 +//------------------------------------------------------------------------------ 1.1105 +// 1.1106 +// pushNewNode create a new RBBINode of the specified type and push it 1.1107 +// onto the stack of nodes. 1.1108 +// 1.1109 +//------------------------------------------------------------------------------ 1.1110 +RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { 1.1111 + fNodeStackPtr++; 1.1112 + if (fNodeStackPtr >= kStackSize) { 1.1113 + error(U_BRK_INTERNAL_ERROR); 1.1114 + RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); 1.1115 + *fRB->fStatus = U_BRK_INTERNAL_ERROR; 1.1116 + return NULL; 1.1117 + } 1.1118 + fNodeStack[fNodeStackPtr] = new RBBINode(t); 1.1119 + if (fNodeStack[fNodeStackPtr] == NULL) { 1.1120 + *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; 1.1121 + } 1.1122 + return fNodeStack[fNodeStackPtr]; 1.1123 +} 1.1124 + 1.1125 + 1.1126 + 1.1127 +//------------------------------------------------------------------------------ 1.1128 +// 1.1129 +// scanSet Construct a UnicodeSet from the text at the current scan 1.1130 +// position. Advance the scan position to the first character 1.1131 +// after the set. 1.1132 +// 1.1133 +// A new RBBI setref node referring to the set is pushed onto the node 1.1134 +// stack. 1.1135 +// 1.1136 +// The scan position is normally under the control of the state machine 1.1137 +// that controls rule parsing. UnicodeSets, however, are parsed by 1.1138 +// the UnicodeSet constructor, not by the RBBI rule parser. 1.1139 +// 1.1140 +//------------------------------------------------------------------------------ 1.1141 +void RBBIRuleScanner::scanSet() { 1.1142 + UnicodeSet *uset; 1.1143 + ParsePosition pos; 1.1144 + int startPos; 1.1145 + int i; 1.1146 + 1.1147 + if (U_FAILURE(*fRB->fStatus)) { 1.1148 + return; 1.1149 + } 1.1150 + 1.1151 + pos.setIndex(fScanIndex); 1.1152 + startPos = fScanIndex; 1.1153 + UErrorCode localStatus = U_ZERO_ERROR; 1.1154 + uset = new UnicodeSet(); 1.1155 + if (uset == NULL) { 1.1156 + localStatus = U_MEMORY_ALLOCATION_ERROR; 1.1157 + } else { 1.1158 + uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); 1.1159 + } 1.1160 + if (U_FAILURE(localStatus)) { 1.1161 + // TODO: Get more accurate position of the error from UnicodeSet's return info. 1.1162 + // UnicodeSet appears to not be reporting correctly at this time. 1.1163 + #ifdef RBBI_DEBUG 1.1164 + RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); 1.1165 + #endif 1.1166 + error(localStatus); 1.1167 + delete uset; 1.1168 + return; 1.1169 + } 1.1170 + 1.1171 + // Verify that the set contains at least one code point. 1.1172 + // 1.1173 + U_ASSERT(uset!=NULL); 1.1174 + if (uset->isEmpty()) { 1.1175 + // This set is empty. 1.1176 + // Make it an error, because it almost certainly is not what the user wanted. 1.1177 + // Also, avoids having to think about corner cases in the tree manipulation code 1.1178 + // that occurs later on. 1.1179 + error(U_BRK_RULE_EMPTY_SET); 1.1180 + delete uset; 1.1181 + return; 1.1182 + } 1.1183 + 1.1184 + 1.1185 + // Advance the RBBI parse postion over the UnicodeSet pattern. 1.1186 + // Don't just set fScanIndex because the line/char positions maintained 1.1187 + // for error reporting would be thrown off. 1.1188 + i = pos.getIndex(); 1.1189 + for (;;) { 1.1190 + if (fNextIndex >= i) { 1.1191 + break; 1.1192 + } 1.1193 + nextCharLL(); 1.1194 + } 1.1195 + 1.1196 + if (U_SUCCESS(*fRB->fStatus)) { 1.1197 + RBBINode *n; 1.1198 + 1.1199 + n = pushNewNode(RBBINode::setRef); 1.1200 + n->fFirstPos = startPos; 1.1201 + n->fLastPos = fNextIndex; 1.1202 + fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1.1203 + // findSetFor() serves several purposes here: 1.1204 + // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1.1205 + // - Mantains collection of all sets in use, needed later for establishing 1.1206 + // character categories for run time engine. 1.1207 + // - Eliminates mulitiple instances of the same set. 1.1208 + // - Creates a new uset node if necessary (if this isn't a duplicate.) 1.1209 + findSetFor(n->fText, n, uset); 1.1210 + } 1.1211 + 1.1212 +} 1.1213 + 1.1214 +U_NAMESPACE_END 1.1215 + 1.1216 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */