intl/icu/source/common/rbbiscan.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbiscan.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1213 @@
     1.4 +
     1.5 +//
     1.6 +//  file:  rbbiscan.cpp
     1.7 +//
     1.8 +//  Copyright (C) 2002-2012, International Business Machines Corporation and others.
     1.9 +//  All Rights Reserved.
    1.10 +//
    1.11 +//  This file contains the Rule Based Break Iterator Rule Builder functions for
    1.12 +//   scanning the rules and assembling a parse tree.  This is the first phase
    1.13 +//   of compiling the rules.
    1.14 +//
    1.15 +//  The overall of the rules is managed by class RBBIRuleBuilder, which will
    1.16 +//  create and use an instance of this class as part of the process.
    1.17 +//
    1.18 +
    1.19 +#include "unicode/utypes.h"
    1.20 +
    1.21 +#if !UCONFIG_NO_BREAK_ITERATION
    1.22 +
    1.23 +#include "unicode/unistr.h"
    1.24 +#include "unicode/uniset.h"
    1.25 +#include "unicode/uchar.h"
    1.26 +#include "unicode/uchriter.h"
    1.27 +#include "unicode/parsepos.h"
    1.28 +#include "unicode/parseerr.h"
    1.29 +#include "cmemory.h"
    1.30 +#include "cstring.h"
    1.31 +
    1.32 +#include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
    1.33 +                       //   generated by a Perl script.
    1.34 +#include "rbbirb.h"
    1.35 +#include "rbbinode.h"
    1.36 +#include "rbbiscan.h"
    1.37 +#include "rbbitblb.h"
    1.38 +
    1.39 +#include "uassert.h"
    1.40 +
    1.41 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    1.42 +
    1.43 +//------------------------------------------------------------------------------
    1.44 +//
    1.45 +// Unicode Set init strings for each of the character classes needed for parsing a rule file.
    1.46 +//               (Initialized with hex values for portability to EBCDIC based machines.
    1.47 +//                Really ugly, but there's no good way to avoid it.)
    1.48 +//
    1.49 +//              The sets are referred to by name in the rbbirpt.txt, which is the
    1.50 +//              source form of the state transition table for the RBBI rule parser.
    1.51 +//
    1.52 +//------------------------------------------------------------------------------
    1.53 +static const UChar gRuleSet_rule_char_pattern[]       = {
    1.54 + //   [    ^      [    \     p     {      Z     }     \     u    0      0    2      0
    1.55 +    0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,
    1.56 + //   -    \      u    0     0     7      f     ]     -     [    \      p
    1.57 +    0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70,
    1.58 + //   {     L     }    ]     -     [      \     p     {     N    }      ]     ]
    1.59 +    0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0};
    1.60 +
    1.61 +static const UChar gRuleSet_name_char_pattern[]       = {
    1.62 +//    [    _      \    p     {     L      }     \     p     {    N      }     ]
    1.63 +    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
    1.64 +
    1.65 +static const UChar gRuleSet_digit_char_pattern[] = {
    1.66 +//    [    0      -    9     ]
    1.67 +    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
    1.68 +
    1.69 +static const UChar gRuleSet_name_start_char_pattern[] = {
    1.70 +//    [    _      \    p     {     L      }     ]
    1.71 +    0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
    1.72 +
    1.73 +static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00};  // "any"
    1.74 +
    1.75 +
    1.76 +U_CDECL_BEGIN
    1.77 +static void U_CALLCONV RBBISetTable_deleter(void *p) {
    1.78 +    icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p;
    1.79 +    delete px->key;
    1.80 +    // Note:  px->val is owned by the linked list "fSetsListHead" in scanner.
    1.81 +    //        Don't delete the value nodes here.
    1.82 +    uprv_free(px);
    1.83 +}
    1.84 +U_CDECL_END
    1.85 +
    1.86 +U_NAMESPACE_BEGIN
    1.87 +
    1.88 +//------------------------------------------------------------------------------
    1.89 +//
    1.90 +//  Constructor.
    1.91 +//
    1.92 +//------------------------------------------------------------------------------
    1.93 +RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
    1.94 +{
    1.95 +    fRB                 = rb;
    1.96 +    fStackPtr           = 0;
    1.97 +    fStack[fStackPtr]   = 0;
    1.98 +    fNodeStackPtr       = 0;
    1.99 +    fRuleNum            = 0;
   1.100 +    fNodeStack[0]       = NULL;
   1.101 +
   1.102 +    fSymbolTable                            = NULL;
   1.103 +    fSetTable                               = NULL;
   1.104 +
   1.105 +    fScanIndex = 0;
   1.106 +    fNextIndex = 0;
   1.107 +
   1.108 +    fReverseRule        = FALSE;
   1.109 +    fLookAheadRule      = FALSE;
   1.110 +
   1.111 +    fLineNum    = 1;
   1.112 +    fCharNum    = 0;
   1.113 +    fQuoteMode  = FALSE;
   1.114 +
   1.115 +    // Do not check status until after all critical fields are sufficiently initialized
   1.116 +    //   that the destructor can run cleanly.
   1.117 +    if (U_FAILURE(*rb->fStatus)) {
   1.118 +        return;
   1.119 +    }
   1.120 +
   1.121 +    //
   1.122 +    //  Set up the constant Unicode Sets.
   1.123 +    //     Note:  These could be made static, lazily initialized, and shared among
   1.124 +    //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler,
   1.125 +    //            and the time to build these few sets should be small compared to a
   1.126 +    //            full break iterator build.
   1.127 +    fRuleSets[kRuleSet_rule_char-128]
   1.128 +        = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern),       *rb->fStatus);
   1.129 +    // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
   1.130 +    fRuleSets[kRuleSet_white_space-128].
   1.131 +        add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029);
   1.132 +    fRuleSets[kRuleSet_name_char-128]
   1.133 +        = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern),       *rb->fStatus);
   1.134 +    fRuleSets[kRuleSet_name_start_char-128]
   1.135 +        = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus);
   1.136 +    fRuleSets[kRuleSet_digit_char-128]
   1.137 +        = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern),      *rb->fStatus);
   1.138 +    if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) {
   1.139 +        // This case happens if ICU's data is missing.  UnicodeSet tries to look up property
   1.140 +        //   names from the init string, can't find them, and claims an illegal argument.
   1.141 +        //   Change the error so that the actual problem will be clearer to users.
   1.142 +        *rb->fStatus = U_BRK_INIT_ERROR;
   1.143 +    }
   1.144 +    if (U_FAILURE(*rb->fStatus)) {
   1.145 +        return;
   1.146 +    }
   1.147 +
   1.148 +    fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus);
   1.149 +    if (fSymbolTable == NULL) {
   1.150 +        *rb->fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.151 +        return;
   1.152 +    }
   1.153 +    fSetTable    = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus);
   1.154 +    if (U_FAILURE(*rb->fStatus)) {
   1.155 +        return;
   1.156 +    }
   1.157 +    uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
   1.158 +}
   1.159 +
   1.160 +
   1.161 +
   1.162 +//------------------------------------------------------------------------------
   1.163 +//
   1.164 +//  Destructor
   1.165 +//
   1.166 +//------------------------------------------------------------------------------
   1.167 +RBBIRuleScanner::~RBBIRuleScanner() {
   1.168 +    delete fSymbolTable;
   1.169 +    if (fSetTable != NULL) {
   1.170 +         uhash_close(fSetTable);
   1.171 +         fSetTable = NULL;
   1.172 +
   1.173 +    }
   1.174 +
   1.175 +
   1.176 +    // Node Stack.
   1.177 +    //   Normally has one entry, which is the entire parse tree for the rules.
   1.178 +    //   If errors occured, there may be additional subtrees left on the stack.
   1.179 +    while (fNodeStackPtr > 0) {
   1.180 +        delete fNodeStack[fNodeStackPtr];
   1.181 +        fNodeStackPtr--;
   1.182 +    }
   1.183 +
   1.184 +}
   1.185 +
   1.186 +//------------------------------------------------------------------------------
   1.187 +//
   1.188 +//  doParseAction        Do some action during rule parsing.
   1.189 +//                       Called by the parse state machine.
   1.190 +//                       Actions build the parse tree and Unicode Sets,
   1.191 +//                       and maintain the parse stack for nested expressions.
   1.192 +//
   1.193 +//                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
   1.194 +//                              They represent exactly the same thing.  They're separate
   1.195 +//                              only to work around enum forward declaration restrictions
   1.196 +//                              in some compilers, while at the same time avoiding multiple
   1.197 +//                              definitions problems.  I'm sure that there's a better way.
   1.198 +//
   1.199 +//------------------------------------------------------------------------------
   1.200 +UBool RBBIRuleScanner::doParseActions(int32_t action)
   1.201 +{
   1.202 +    RBBINode *n       = NULL;
   1.203 +
   1.204 +    UBool   returnVal = TRUE;
   1.205 +
   1.206 +    switch (action) {
   1.207 +
   1.208 +    case doExprStart:
   1.209 +        pushNewNode(RBBINode::opStart);
   1.210 +        fRuleNum++;
   1.211 +        break;
   1.212 +
   1.213 +
   1.214 +    case doExprOrOperator:
   1.215 +        {
   1.216 +            fixOpStack(RBBINode::precOpCat);
   1.217 +            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
   1.218 +            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
   1.219 +            orNode->fLeftChild     = operandNode;
   1.220 +            operandNode->fParent   = orNode;
   1.221 +        }
   1.222 +        break;
   1.223 +
   1.224 +    case doExprCatOperator:
   1.225 +        // concatenation operator.
   1.226 +        // For the implicit concatenation of adjacent terms in an expression that are
   1.227 +        //   not separated by any other operator.  Action is invoked between the
   1.228 +        //   actions for the two terms.
   1.229 +        {
   1.230 +            fixOpStack(RBBINode::precOpCat);
   1.231 +            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
   1.232 +            RBBINode  *catNode     = pushNewNode(RBBINode::opCat);
   1.233 +            catNode->fLeftChild    = operandNode;
   1.234 +            operandNode->fParent   = catNode;
   1.235 +        }
   1.236 +        break;
   1.237 +
   1.238 +    case doLParen:
   1.239 +        // Open Paren.
   1.240 +        //   The openParen node is a dummy operation type with a low precedence,
   1.241 +        //     which has the affect of ensuring that any real binary op that
   1.242 +        //     follows within the parens binds more tightly to the operands than
   1.243 +        //     stuff outside of the parens.
   1.244 +        pushNewNode(RBBINode::opLParen);
   1.245 +        break;
   1.246 +
   1.247 +    case doExprRParen:
   1.248 +        fixOpStack(RBBINode::precLParen);
   1.249 +        break;
   1.250 +
   1.251 +    case doNOP:
   1.252 +        break;
   1.253 +
   1.254 +    case doStartAssign:
   1.255 +        // We've just scanned "$variable = "
   1.256 +        // The top of the node stack has the $variable ref node.
   1.257 +
   1.258 +        // Save the start position of the RHS text in the StartExpression node
   1.259 +        //   that precedes the $variableReference node on the stack.
   1.260 +        //   This will eventually be used when saving the full $variable replacement
   1.261 +        //   text as a string.
   1.262 +        n = fNodeStack[fNodeStackPtr-1];
   1.263 +        n->fFirstPos = fNextIndex;              // move past the '='
   1.264 +
   1.265 +        // Push a new start-of-expression node; needed to keep parse of the
   1.266 +        //   RHS expression happy.
   1.267 +        pushNewNode(RBBINode::opStart);
   1.268 +        break;
   1.269 +
   1.270 +
   1.271 +
   1.272 +
   1.273 +    case doEndAssign:
   1.274 +        {
   1.275 +            // We have reached the end of an assignement statement.
   1.276 +            //   Current scan char is the ';' that terminates the assignment.
   1.277 +
   1.278 +            // Terminate expression, leaves expression parse tree rooted in TOS node.
   1.279 +            fixOpStack(RBBINode::precStart);
   1.280 +
   1.281 +            RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];
   1.282 +            RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];
   1.283 +            RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];
   1.284 +
   1.285 +            // Save original text of right side of assignment, excluding the terminating ';'
   1.286 +            //  in the root of the node for the right-hand-side expression.
   1.287 +            RHSExprNode->fFirstPos = startExprNode->fFirstPos;
   1.288 +            RHSExprNode->fLastPos  = fScanIndex;
   1.289 +            fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);
   1.290 +
   1.291 +            // Expression parse tree becomes l. child of the $variable reference node.
   1.292 +            varRefNode->fLeftChild = RHSExprNode;
   1.293 +            RHSExprNode->fParent   = varRefNode;
   1.294 +
   1.295 +            // Make a symbol table entry for the $variableRef node.
   1.296 +            fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
   1.297 +            if (U_FAILURE(*fRB->fStatus)) {
   1.298 +                // This is a round-about way to get the parse position set
   1.299 +                //  so that duplicate symbols error messages include a line number.
   1.300 +                UErrorCode t = *fRB->fStatus;
   1.301 +                *fRB->fStatus = U_ZERO_ERROR;
   1.302 +                error(t);
   1.303 +            }
   1.304 +
   1.305 +            // Clean up the stack.
   1.306 +            delete startExprNode;
   1.307 +            fNodeStackPtr-=3;
   1.308 +            break;
   1.309 +        }
   1.310 +
   1.311 +    case doEndOfRule:
   1.312 +        {
   1.313 +        fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression
   1.314 +        if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.
   1.315 +            break;
   1.316 +        }
   1.317 +#ifdef RBBI_DEBUG
   1.318 +        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
   1.319 +#endif
   1.320 +        U_ASSERT(fNodeStackPtr == 1);
   1.321 +
   1.322 +        // If this rule includes a look-ahead '/', add a endMark node to the
   1.323 +        //   expression tree.
   1.324 +        if (fLookAheadRule) {
   1.325 +            RBBINode  *thisRule       = fNodeStack[fNodeStackPtr];
   1.326 +            RBBINode  *endNode        = pushNewNode(RBBINode::endMark);
   1.327 +            RBBINode  *catNode        = pushNewNode(RBBINode::opCat);
   1.328 +            fNodeStackPtr -= 2;
   1.329 +            catNode->fLeftChild       = thisRule;
   1.330 +            catNode->fRightChild      = endNode;
   1.331 +            fNodeStack[fNodeStackPtr] = catNode;
   1.332 +            endNode->fVal             = fRuleNum;
   1.333 +            endNode->fLookAheadEnd    = TRUE;
   1.334 +        }
   1.335 +
   1.336 +        // All rule expressions are ORed together.
   1.337 +        // The ';' that terminates an expression really just functions as a '|' with
   1.338 +        //   a low operator prededence.
   1.339 +        //
   1.340 +        // Each of the four sets of rules are collected separately.
   1.341 +        //  (forward, reverse, safe_forward, safe_reverse)
   1.342 +        //  OR this rule into the appropriate group of them.
   1.343 +        //
   1.344 +        RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
   1.345 +
   1.346 +        if (*destRules != NULL) {
   1.347 +            // This is not the first rule encounted.
   1.348 +            // OR previous stuff  (from *destRules)
   1.349 +            // with the current rule expression (on the Node Stack)
   1.350 +            //  with the resulting OR expression going to *destRules
   1.351 +            //
   1.352 +            RBBINode  *thisRule    = fNodeStack[fNodeStackPtr];
   1.353 +            RBBINode  *prevRules   = *destRules;
   1.354 +            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
   1.355 +            orNode->fLeftChild     = prevRules;
   1.356 +            prevRules->fParent     = orNode;
   1.357 +            orNode->fRightChild    = thisRule;
   1.358 +            thisRule->fParent      = orNode;
   1.359 +            *destRules             = orNode;
   1.360 +        }
   1.361 +        else
   1.362 +        {
   1.363 +            // This is the first rule encountered (for this direction).
   1.364 +            // Just move its parse tree from the stack to *destRules.
   1.365 +            *destRules = fNodeStack[fNodeStackPtr];
   1.366 +        }
   1.367 +        fReverseRule   = FALSE;   // in preparation for the next rule.
   1.368 +        fLookAheadRule = FALSE;
   1.369 +        fNodeStackPtr  = 0;
   1.370 +        }
   1.371 +        break;
   1.372 +
   1.373 +
   1.374 +    case doRuleError:
   1.375 +        error(U_BRK_RULE_SYNTAX);
   1.376 +        returnVal = FALSE;
   1.377 +        break;
   1.378 +
   1.379 +
   1.380 +    case doVariableNameExpectedErr:
   1.381 +        error(U_BRK_RULE_SYNTAX);
   1.382 +        break;
   1.383 +
   1.384 +
   1.385 +    //
   1.386 +    //  Unary operands  + ? *
   1.387 +    //    These all appear after the operand to which they apply.
   1.388 +    //    When we hit one, the operand (may be a whole sub expression)
   1.389 +    //    will be on the top of the stack.
   1.390 +    //    Unary Operator becomes TOS, with the old TOS as its one child.
   1.391 +    case doUnaryOpPlus:
   1.392 +        {
   1.393 +            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
   1.394 +            RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);
   1.395 +            plusNode->fLeftChild   = operandNode;
   1.396 +            operandNode->fParent   = plusNode;
   1.397 +        }
   1.398 +        break;
   1.399 +
   1.400 +    case doUnaryOpQuestion:
   1.401 +        {
   1.402 +            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
   1.403 +            RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);
   1.404 +            qNode->fLeftChild      = operandNode;
   1.405 +            operandNode->fParent   = qNode;
   1.406 +        }
   1.407 +        break;
   1.408 +
   1.409 +    case doUnaryOpStar:
   1.410 +        {
   1.411 +            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
   1.412 +            RBBINode  *starNode    = pushNewNode(RBBINode::opStar);
   1.413 +            starNode->fLeftChild   = operandNode;
   1.414 +            operandNode->fParent   = starNode;
   1.415 +        }
   1.416 +        break;
   1.417 +
   1.418 +    case doRuleChar:
   1.419 +        // A "Rule Character" is any single character that is a literal part
   1.420 +        // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
   1.421 +        // These are pretty uncommon in break rules; the terms are more commonly
   1.422 +        //  sets.  To keep things uniform, treat these characters like as
   1.423 +        // sets that just happen to contain only one character.
   1.424 +        {
   1.425 +            n = pushNewNode(RBBINode::setRef);
   1.426 +            findSetFor(UnicodeString(fC.fChar), n);
   1.427 +            n->fFirstPos = fScanIndex;
   1.428 +            n->fLastPos  = fNextIndex;
   1.429 +            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
   1.430 +            break;
   1.431 +        }
   1.432 +
   1.433 +    case doDotAny:
   1.434 +        // scanned a ".", meaning match any single character.
   1.435 +        {
   1.436 +            n = pushNewNode(RBBINode::setRef);
   1.437 +            findSetFor(UnicodeString(TRUE, kAny, 3), n);
   1.438 +            n->fFirstPos = fScanIndex;
   1.439 +            n->fLastPos  = fNextIndex;
   1.440 +            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
   1.441 +            break;
   1.442 +        }
   1.443 +
   1.444 +    case doSlash:
   1.445 +        // Scanned a '/', which identifies a look-ahead break position in a rule.
   1.446 +        n = pushNewNode(RBBINode::lookAhead);
   1.447 +        n->fVal      = fRuleNum;
   1.448 +        n->fFirstPos = fScanIndex;
   1.449 +        n->fLastPos  = fNextIndex;
   1.450 +        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
   1.451 +        fLookAheadRule = TRUE;
   1.452 +        break;
   1.453 +
   1.454 +
   1.455 +    case doStartTagValue:
   1.456 +        // Scanned a '{', the opening delimiter for a tag value within a rule.
   1.457 +        n = pushNewNode(RBBINode::tag);
   1.458 +        n->fVal      = 0;
   1.459 +        n->fFirstPos = fScanIndex;
   1.460 +        n->fLastPos  = fNextIndex;
   1.461 +        break;
   1.462 +
   1.463 +    case doTagDigit:
   1.464 +        // Just scanned a decimal digit that's part of a tag value
   1.465 +        {
   1.466 +            n = fNodeStack[fNodeStackPtr];
   1.467 +            uint32_t v = u_charDigitValue(fC.fChar);
   1.468 +            U_ASSERT(v < 10);
   1.469 +            n->fVal = n->fVal*10 + v;
   1.470 +            break;
   1.471 +        }
   1.472 +
   1.473 +    case doTagValue:
   1.474 +        n = fNodeStack[fNodeStackPtr];
   1.475 +        n->fLastPos = fNextIndex;
   1.476 +        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
   1.477 +        break;
   1.478 +
   1.479 +    case doTagExpectedError:
   1.480 +        error(U_BRK_MALFORMED_RULE_TAG);
   1.481 +        returnVal = FALSE;
   1.482 +        break;
   1.483 +
   1.484 +    case doOptionStart:
   1.485 +        // Scanning a !!option.   At the start of string.
   1.486 +        fOptionStart = fScanIndex;
   1.487 +        break;
   1.488 +
   1.489 +    case doOptionEnd:
   1.490 +        {
   1.491 +            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
   1.492 +            if (opt == UNICODE_STRING("chain", 5)) {
   1.493 +                fRB->fChainRules = TRUE;
   1.494 +            } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) {
   1.495 +                fRB->fLBCMNoChain = TRUE;
   1.496 +            } else if (opt == UNICODE_STRING("forward", 7)) {
   1.497 +                fRB->fDefaultTree   = &fRB->fForwardTree;
   1.498 +            } else if (opt == UNICODE_STRING("reverse", 7)) {
   1.499 +                fRB->fDefaultTree   = &fRB->fReverseTree;
   1.500 +            } else if (opt == UNICODE_STRING("safe_forward", 12)) {
   1.501 +                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
   1.502 +            } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
   1.503 +                fRB->fDefaultTree   = &fRB->fSafeRevTree;
   1.504 +            } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
   1.505 +                fRB->fLookAheadHardBreak = TRUE;
   1.506 +            } else {
   1.507 +                error(U_BRK_UNRECOGNIZED_OPTION);
   1.508 +            }
   1.509 +        }
   1.510 +        break;
   1.511 +
   1.512 +    case doReverseDir:
   1.513 +        fReverseRule = TRUE;
   1.514 +        break;
   1.515 +
   1.516 +    case doStartVariableName:
   1.517 +        n = pushNewNode(RBBINode::varRef);
   1.518 +        if (U_FAILURE(*fRB->fStatus)) {
   1.519 +            break;
   1.520 +        }
   1.521 +        n->fFirstPos = fScanIndex;
   1.522 +        break;
   1.523 +
   1.524 +    case doEndVariableName:
   1.525 +        n = fNodeStack[fNodeStackPtr];
   1.526 +        if (n==NULL || n->fType != RBBINode::varRef) {
   1.527 +            error(U_BRK_INTERNAL_ERROR);
   1.528 +            break;
   1.529 +        }
   1.530 +        n->fLastPos = fScanIndex;
   1.531 +        fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
   1.532 +        // Look the newly scanned name up in the symbol table
   1.533 +        //   If there's an entry, set the l. child of the var ref to the replacement expression.
   1.534 +        //   (We also pass through here when scanning assignments, but no harm is done, other
   1.535 +        //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
   1.536 +        n->fLeftChild = fSymbolTable->lookupNode(n->fText);
   1.537 +        break;
   1.538 +
   1.539 +    case doCheckVarDef:
   1.540 +        n = fNodeStack[fNodeStackPtr];
   1.541 +        if (n->fLeftChild == NULL) {
   1.542 +            error(U_BRK_UNDEFINED_VARIABLE);
   1.543 +            returnVal = FALSE;
   1.544 +        }
   1.545 +        break;
   1.546 +
   1.547 +    case doExprFinished:
   1.548 +        break;
   1.549 +
   1.550 +    case doRuleErrorAssignExpr:
   1.551 +        error(U_BRK_ASSIGN_ERROR);
   1.552 +        returnVal = FALSE;
   1.553 +        break;
   1.554 +
   1.555 +    case doExit:
   1.556 +        returnVal = FALSE;
   1.557 +        break;
   1.558 +
   1.559 +    case doScanUnicodeSet:
   1.560 +        scanSet();
   1.561 +        break;
   1.562 +
   1.563 +    default:
   1.564 +        error(U_BRK_INTERNAL_ERROR);
   1.565 +        returnVal = FALSE;
   1.566 +        break;
   1.567 +    }
   1.568 +    return returnVal;
   1.569 +}
   1.570 +
   1.571 +
   1.572 +
   1.573 +
   1.574 +//------------------------------------------------------------------------------
   1.575 +//
   1.576 +//  Error         Report a rule parse error.
   1.577 +//                Only report it if no previous error has been recorded.
   1.578 +//
   1.579 +//------------------------------------------------------------------------------
   1.580 +void RBBIRuleScanner::error(UErrorCode e) {
   1.581 +    if (U_SUCCESS(*fRB->fStatus)) {
   1.582 +        *fRB->fStatus = e;
   1.583 +        if (fRB->fParseError) {
   1.584 +            fRB->fParseError->line  = fLineNum;
   1.585 +            fRB->fParseError->offset = fCharNum;
   1.586 +            fRB->fParseError->preContext[0] = 0;
   1.587 +            fRB->fParseError->preContext[0] = 0;
   1.588 +        }
   1.589 +    }
   1.590 +}
   1.591 +
   1.592 +
   1.593 +
   1.594 +
   1.595 +//------------------------------------------------------------------------------
   1.596 +//
   1.597 +//  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
   1.598 +//               An entry on the stack may be as small as a single setRef node,
   1.599 +//               or as large as the parse tree
   1.600 +//               for an entire expression (this will be the one item left on the stack
   1.601 +//               when the parsing of an RBBI rule completes.
   1.602 +//
   1.603 +//               This function is called when a binary operator is encountered.
   1.604 +//               It looks back up the stack for operators that are not yet associated
   1.605 +//               with a right operand, and if the precedence of the stacked operator >=
   1.606 +//               the precedence of the current operator, binds the operand left,
   1.607 +//               to the previously encountered operator.
   1.608 +//
   1.609 +//------------------------------------------------------------------------------
   1.610 +void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
   1.611 +    RBBINode *n;
   1.612 +    // printNodeStack("entering fixOpStack()");
   1.613 +    for (;;) {
   1.614 +        n = fNodeStack[fNodeStackPtr-1];   // an operator node
   1.615 +        if (n->fPrecedence == 0) {
   1.616 +            RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
   1.617 +            error(U_BRK_INTERNAL_ERROR);
   1.618 +            return;
   1.619 +        }
   1.620 +
   1.621 +        if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) {
   1.622 +            // The most recent operand goes with the current operator,
   1.623 +            //   not with the previously stacked one.
   1.624 +            break;
   1.625 +        }
   1.626 +            // Stack operator is a binary op  ( '|' or concatenation)
   1.627 +            //   TOS operand becomes right child of this operator.
   1.628 +            //   Resulting subexpression becomes the TOS operand.
   1.629 +            n->fRightChild = fNodeStack[fNodeStackPtr];
   1.630 +            fNodeStack[fNodeStackPtr]->fParent = n;
   1.631 +            fNodeStackPtr--;
   1.632 +        // printNodeStack("looping in fixOpStack()   ");
   1.633 +    }
   1.634 +
   1.635 +    if (p <= RBBINode::precLParen) {
   1.636 +        // Scan is at a right paren or end of expression.
   1.637 +        //  The scanned item must match the stack, or else there was an error.
   1.638 +        //  Discard the left paren (or start expr) node from the stack,
   1.639 +            //  leaving the completed (sub)expression as TOS.
   1.640 +            if (n->fPrecedence != p) {
   1.641 +                // Right paren encountered matched start of expression node, or
   1.642 +                // end of expression matched with a left paren node.
   1.643 +                error(U_BRK_MISMATCHED_PAREN);
   1.644 +            }
   1.645 +            fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
   1.646 +            fNodeStackPtr--;
   1.647 +            // Delete the now-discarded LParen or Start node.
   1.648 +            delete n;
   1.649 +    }
   1.650 +    // printNodeStack("leaving fixOpStack()");
   1.651 +}
   1.652 +
   1.653 +
   1.654 +
   1.655 +
   1.656 +//------------------------------------------------------------------------------
   1.657 +//
   1.658 +//   findSetFor    given a UnicodeString,
   1.659 +//                  - find the corresponding Unicode Set  (uset node)
   1.660 +//                         (create one if necessary)
   1.661 +//                  - Set fLeftChild of the caller's node (should be a setRef node)
   1.662 +//                         to the uset node
   1.663 +//                 Maintain a hash table of uset nodes, so the same one is always used
   1.664 +//                    for the same string.
   1.665 +//                 If a "to adopt" set is provided and we haven't seen this key before,
   1.666 +//                    add the provided set to the hash table.
   1.667 +//                 If the string is one (32 bit) char in length, the set contains
   1.668 +//                    just one element which is the char in question.
   1.669 +//                 If the string is "any", return a set containing all chars.
   1.670 +//
   1.671 +//------------------------------------------------------------------------------
   1.672 +void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
   1.673 +
   1.674 +    RBBISetTableEl   *el;
   1.675 +
   1.676 +    // First check whether we've already cached a set for this string.
   1.677 +    // If so, just use the cached set in the new node.
   1.678 +    //   delete any set provided by the caller, since we own it.
   1.679 +    el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
   1.680 +    if (el != NULL) {
   1.681 +        delete setToAdopt;
   1.682 +        node->fLeftChild = el->val;
   1.683 +        U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
   1.684 +        return;
   1.685 +    }
   1.686 +
   1.687 +    // Haven't seen this set before.
   1.688 +    // If the caller didn't provide us with a prebuilt set,
   1.689 +    //   create a new UnicodeSet now.
   1.690 +    if (setToAdopt == NULL) {
   1.691 +        if (s.compare(kAny, -1) == 0) {
   1.692 +            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
   1.693 +        } else {
   1.694 +            UChar32 c;
   1.695 +            c = s.char32At(0);
   1.696 +            setToAdopt = new UnicodeSet(c, c);
   1.697 +        }
   1.698 +    }
   1.699 +
   1.700 +    //
   1.701 +    // Make a new uset node to refer to this UnicodeSet
   1.702 +    // This new uset node becomes the child of the caller's setReference node.
   1.703 +    //
   1.704 +    RBBINode *usetNode    = new RBBINode(RBBINode::uset);
   1.705 +    if (usetNode == NULL) {
   1.706 +        error(U_MEMORY_ALLOCATION_ERROR);
   1.707 +        return;
   1.708 +    }
   1.709 +    usetNode->fInputSet   = setToAdopt;
   1.710 +    usetNode->fParent     = node;
   1.711 +    node->fLeftChild      = usetNode;
   1.712 +    usetNode->fText = s;
   1.713 +
   1.714 +
   1.715 +    //
   1.716 +    // Add the new uset node to the list of all uset nodes.
   1.717 +    //
   1.718 +    fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
   1.719 +
   1.720 +
   1.721 +    //
   1.722 +    // Add the new set to the set hash table.
   1.723 +    //
   1.724 +    el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
   1.725 +    UnicodeString *tkey = new UnicodeString(s);
   1.726 +    if (tkey == NULL || el == NULL || setToAdopt == NULL) {
   1.727 +        // Delete to avoid memory leak
   1.728 +        delete tkey;
   1.729 +        tkey = NULL;
   1.730 +        uprv_free(el);
   1.731 +        el = NULL;
   1.732 +        delete setToAdopt;
   1.733 +        setToAdopt = NULL;
   1.734 +
   1.735 +        error(U_MEMORY_ALLOCATION_ERROR);
   1.736 +        return;
   1.737 +    }
   1.738 +    el->key = tkey;
   1.739 +    el->val = usetNode;
   1.740 +    uhash_put(fSetTable, el->key, el, fRB->fStatus);
   1.741 +
   1.742 +    return;
   1.743 +}
   1.744 +
   1.745 +
   1.746 +
   1.747 +//
   1.748 +//  Assorted Unicode character constants.
   1.749 +//     Numeric because there is no portable way to enter them as literals.
   1.750 +//     (Think EBCDIC).
   1.751 +//
   1.752 +static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
   1.753 +static const UChar      chLF        = 0x0a;
   1.754 +static const UChar      chNEL       = 0x85;      //    NEL newline variant
   1.755 +static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
   1.756 +static const UChar      chApos      = 0x27;      //  single quote, for quoted chars.
   1.757 +static const UChar      chPound     = 0x23;      // '#', introduces a comment.
   1.758 +static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
   1.759 +static const UChar      chLParen    = 0x28;
   1.760 +static const UChar      chRParen    = 0x29;
   1.761 +
   1.762 +
   1.763 +//------------------------------------------------------------------------------
   1.764 +//
   1.765 +//  stripRules    Return a rules string without unnecessary
   1.766 +//                characters.
   1.767 +//
   1.768 +//------------------------------------------------------------------------------
   1.769 +UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
   1.770 +    UnicodeString strippedRules;
   1.771 +    int rulesLength = rules.length();
   1.772 +    for (int idx = 0; idx < rulesLength; ) {
   1.773 +        UChar ch = rules[idx++];
   1.774 +        if (ch == chPound) {
   1.775 +            while (idx < rulesLength
   1.776 +                && ch != chCR && ch != chLF && ch != chNEL)
   1.777 +            {
   1.778 +                ch = rules[idx++];
   1.779 +            }
   1.780 +        }
   1.781 +        if (!u_isISOControl(ch)) {
   1.782 +            strippedRules.append(ch);
   1.783 +        }
   1.784 +    }
   1.785 +    // strippedRules = strippedRules.unescape();
   1.786 +    return strippedRules;
   1.787 +}
   1.788 +
   1.789 +
   1.790 +//------------------------------------------------------------------------------
   1.791 +//
   1.792 +//  nextCharLL    Low Level Next Char from rule input source.
   1.793 +//                Get a char from the input character iterator,
   1.794 +//                keep track of input position for error reporting.
   1.795 +//
   1.796 +//------------------------------------------------------------------------------
   1.797 +UChar32  RBBIRuleScanner::nextCharLL() {
   1.798 +    UChar32  ch;
   1.799 +
   1.800 +    if (fNextIndex >= fRB->fRules.length()) {
   1.801 +        return (UChar32)-1;
   1.802 +    }
   1.803 +    ch         = fRB->fRules.char32At(fNextIndex);
   1.804 +    fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
   1.805 +
   1.806 +    if (ch == chCR ||
   1.807 +        ch == chNEL ||
   1.808 +        ch == chLS   ||
   1.809 +        (ch == chLF && fLastChar != chCR)) {
   1.810 +        // Character is starting a new line.  Bump up the line number, and
   1.811 +        //  reset the column to 0.
   1.812 +        fLineNum++;
   1.813 +        fCharNum=0;
   1.814 +        if (fQuoteMode) {
   1.815 +            error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
   1.816 +            fQuoteMode = FALSE;
   1.817 +        }
   1.818 +    }
   1.819 +    else {
   1.820 +        // Character is not starting a new line.  Except in the case of a
   1.821 +        //   LF following a CR, increment the column position.
   1.822 +        if (ch != chLF) {
   1.823 +            fCharNum++;
   1.824 +        }
   1.825 +    }
   1.826 +    fLastChar = ch;
   1.827 +    return ch;
   1.828 +}
   1.829 +
   1.830 +
   1.831 +//------------------------------------------------------------------------------
   1.832 +//
   1.833 +//   nextChar     for rules scanning.  At this level, we handle stripping
   1.834 +//                out comments and processing backslash character escapes.
   1.835 +//                The rest of the rules grammar is handled at the next level up.
   1.836 +//
   1.837 +//------------------------------------------------------------------------------
   1.838 +void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
   1.839 +
   1.840 +    // Unicode Character constants needed for the processing done by nextChar(),
   1.841 +    //   in hex because literals wont work on EBCDIC machines.
   1.842 +
   1.843 +    fScanIndex = fNextIndex;
   1.844 +    c.fChar    = nextCharLL();
   1.845 +    c.fEscaped = FALSE;
   1.846 +
   1.847 +    //
   1.848 +    //  check for '' sequence.
   1.849 +    //  These are recognized in all contexts, whether in quoted text or not.
   1.850 +    //
   1.851 +    if (c.fChar == chApos) {
   1.852 +        if (fRB->fRules.char32At(fNextIndex) == chApos) {
   1.853 +            c.fChar    = nextCharLL();        // get nextChar officially so character counts
   1.854 +            c.fEscaped = TRUE;                //   stay correct.
   1.855 +        }
   1.856 +        else
   1.857 +        {
   1.858 +            // Single quote, by itself.
   1.859 +            //   Toggle quoting mode.
   1.860 +            //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
   1.861 +            fQuoteMode = !fQuoteMode;
   1.862 +            if (fQuoteMode == TRUE) {
   1.863 +                c.fChar = chLParen;
   1.864 +            } else {
   1.865 +                c.fChar = chRParen;
   1.866 +            }
   1.867 +            c.fEscaped = FALSE;      // The paren that we return is not escaped.
   1.868 +            return;
   1.869 +        }
   1.870 +    }
   1.871 +
   1.872 +    if (fQuoteMode) {
   1.873 +        c.fEscaped = TRUE;
   1.874 +    }
   1.875 +    else
   1.876 +    {
   1.877 +        // We are not in a 'quoted region' of the source.
   1.878 +        //
   1.879 +        if (c.fChar == chPound) {
   1.880 +            // Start of a comment.  Consume the rest of it.
   1.881 +            //  The new-line char that terminates the comment is always returned.
   1.882 +            //  It will be treated as white-space, and serves to break up anything
   1.883 +            //    that might otherwise incorrectly clump together with a comment in
   1.884 +            //    the middle (a variable name, for example.)
   1.885 +            for (;;) {
   1.886 +                c.fChar = nextCharLL();
   1.887 +                if (c.fChar == (UChar32)-1 ||  // EOF
   1.888 +                    c.fChar == chCR     ||
   1.889 +                    c.fChar == chLF     ||
   1.890 +                    c.fChar == chNEL    ||
   1.891 +                    c.fChar == chLS)       {break;}
   1.892 +            }
   1.893 +        }
   1.894 +        if (c.fChar == (UChar32)-1) {
   1.895 +            return;
   1.896 +        }
   1.897 +
   1.898 +        //
   1.899 +        //  check for backslash escaped characters.
   1.900 +        //  Use UnicodeString::unescapeAt() to handle them.
   1.901 +        //
   1.902 +        if (c.fChar == chBackSlash) {
   1.903 +            c.fEscaped = TRUE;
   1.904 +            int32_t startX = fNextIndex;
   1.905 +            c.fChar = fRB->fRules.unescapeAt(fNextIndex);
   1.906 +            if (fNextIndex == startX) {
   1.907 +                error(U_BRK_HEX_DIGITS_EXPECTED);
   1.908 +            }
   1.909 +            fCharNum += fNextIndex-startX;
   1.910 +        }
   1.911 +    }
   1.912 +    // putc(c.fChar, stdout);
   1.913 +}
   1.914 +
   1.915 +//------------------------------------------------------------------------------
   1.916 +//
   1.917 +//  Parse RBBI rules.   The state machine for rules parsing is here.
   1.918 +//                      The state tables are hand-written in the file rbbirpt.txt,
   1.919 +//                      and converted to the form used here by a perl
   1.920 +//                      script rbbicst.pl
   1.921 +//
   1.922 +//------------------------------------------------------------------------------
   1.923 +void RBBIRuleScanner::parse() {
   1.924 +    uint16_t                state;
   1.925 +    const RBBIRuleTableEl  *tableEl;
   1.926 +
   1.927 +    if (U_FAILURE(*fRB->fStatus)) {
   1.928 +        return;
   1.929 +    }
   1.930 +
   1.931 +    state = 1;
   1.932 +    nextChar(fC);
   1.933 +    //
   1.934 +    // Main loop for the rule parsing state machine.
   1.935 +    //   Runs once per state transition.
   1.936 +    //   Each time through optionally performs, depending on the state table,
   1.937 +    //      - an advance to the the next input char
   1.938 +    //      - an action to be performed.
   1.939 +    //      - pushing or popping a state to/from the local state return stack.
   1.940 +    //
   1.941 +    for (;;) {
   1.942 +        //  Bail out if anything has gone wrong.
   1.943 +        //  RBBI rule file parsing stops on the first error encountered.
   1.944 +        if (U_FAILURE(*fRB->fStatus)) {
   1.945 +            break;
   1.946 +        }
   1.947 +
   1.948 +        // Quit if state == 0.  This is the normal way to exit the state machine.
   1.949 +        //
   1.950 +        if (state == 0) {
   1.951 +            break;
   1.952 +        }
   1.953 +
   1.954 +        // Find the state table element that matches the input char from the rule, or the
   1.955 +        //    class of the input character.  Start with the first table row for this
   1.956 +        //    state, then linearly scan forward until we find a row that matches the
   1.957 +        //    character.  The last row for each state always matches all characters, so
   1.958 +        //    the search will stop there, if not before.
   1.959 +        //
   1.960 +        tableEl = &gRuleParseStateTable[state];
   1.961 +        #ifdef RBBI_DEBUG
   1.962 +            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
   1.963 +                RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
   1.964 +                    fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
   1.965 +            }
   1.966 +        #endif
   1.967 +
   1.968 +        for (;;) {
   1.969 +            #ifdef RBBI_DEBUG
   1.970 +                if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");}
   1.971 +            #endif
   1.972 +            if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) {
   1.973 +                // Table row specified an individual character, not a set, and
   1.974 +                //   the input character is not escaped, and
   1.975 +                //   the input character matched it.
   1.976 +                break;
   1.977 +            }
   1.978 +            if (tableEl->fCharClass == 255) {
   1.979 +                // Table row specified default, match anything character class.
   1.980 +                break;
   1.981 +            }
   1.982 +            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
   1.983 +                // Table row specified "escaped" and the char was escaped.
   1.984 +                break;
   1.985 +            }
   1.986 +            if (tableEl->fCharClass == 253 && fC.fEscaped &&
   1.987 +                (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
   1.988 +                // Table row specified "escaped P" and the char is either 'p' or 'P'.
   1.989 +                break;
   1.990 +            }
   1.991 +            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
   1.992 +                // Table row specified eof and we hit eof on the input.
   1.993 +                break;
   1.994 +            }
   1.995 +
   1.996 +            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
   1.997 +                fC.fEscaped == FALSE &&                                      //   char is not escaped &&
   1.998 +                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
   1.999 +                U_ASSERT((tableEl->fCharClass-128) < LENGTHOF(fRuleSets));
  1.1000 +                if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
  1.1001 +                    // Table row specified a character class, or set of characters,
  1.1002 +                    //   and the current char matches it.
  1.1003 +                    break;
  1.1004 +                }
  1.1005 +            }
  1.1006 +
  1.1007 +            // No match on this row, advance to the next  row for this state,
  1.1008 +            tableEl++;
  1.1009 +        }
  1.1010 +        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
  1.1011 +
  1.1012 +        //
  1.1013 +        // We've found the row of the state table that matches the current input
  1.1014 +        //   character from the rules string.
  1.1015 +        // Perform any action specified  by this row in the state table.
  1.1016 +        if (doParseActions((int32_t)tableEl->fAction) == FALSE) {
  1.1017 +            // Break out of the state machine loop if the
  1.1018 +            //   the action signalled some kind of error, or
  1.1019 +            //   the action was to exit, occurs on normal end-of-rules-input.
  1.1020 +            break;
  1.1021 +        }
  1.1022 +
  1.1023 +        if (tableEl->fPushState != 0) {
  1.1024 +            fStackPtr++;
  1.1025 +            if (fStackPtr >= kStackSize) {
  1.1026 +                error(U_BRK_INTERNAL_ERROR);
  1.1027 +                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
  1.1028 +                fStackPtr--;
  1.1029 +            }
  1.1030 +            fStack[fStackPtr] = tableEl->fPushState;
  1.1031 +        }
  1.1032 +
  1.1033 +        if (tableEl->fNextChar) {
  1.1034 +            nextChar(fC);
  1.1035 +        }
  1.1036 +
  1.1037 +        // Get the next state from the table entry, or from the
  1.1038 +        //   state stack if the next state was specified as "pop".
  1.1039 +        if (tableEl->fNextState != 255) {
  1.1040 +            state = tableEl->fNextState;
  1.1041 +        } else {
  1.1042 +            state = fStack[fStackPtr];
  1.1043 +            fStackPtr--;
  1.1044 +            if (fStackPtr < 0) {
  1.1045 +                error(U_BRK_INTERNAL_ERROR);
  1.1046 +                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
  1.1047 +                fStackPtr++;
  1.1048 +            }
  1.1049 +        }
  1.1050 +
  1.1051 +    }
  1.1052 +
  1.1053 +    //
  1.1054 +    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
  1.1055 +    //
  1.1056 +    if (fRB->fReverseTree == NULL) {
  1.1057 +        fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
  1.1058 +        RBBINode  *operand = pushNewNode(RBBINode::setRef);
  1.1059 +        findSetFor(UnicodeString(TRUE, kAny, 3), operand);
  1.1060 +        fRB->fReverseTree->fLeftChild = operand;
  1.1061 +        operand->fParent              = fRB->fReverseTree;
  1.1062 +        fNodeStackPtr -= 2;
  1.1063 +    }
  1.1064 +
  1.1065 +
  1.1066 +    //
  1.1067 +    // Parsing of the input RBBI rules is complete.
  1.1068 +    // We now have a parse tree for the rule expressions
  1.1069 +    // and a list of all UnicodeSets that are referenced.
  1.1070 +    //
  1.1071 +#ifdef RBBI_DEBUG
  1.1072 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
  1.1073 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
  1.1074 +    {
  1.1075 +        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
  1.1076 +        fRB->fForwardTree->printTree(TRUE);
  1.1077 +        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
  1.1078 +        fRB->fReverseTree->printTree(TRUE);
  1.1079 +        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
  1.1080 +        fRB->fSafeFwdTree->printTree(TRUE);
  1.1081 +        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
  1.1082 +        fRB->fSafeRevTree->printTree(TRUE);
  1.1083 +    }
  1.1084 +#endif
  1.1085 +}
  1.1086 +
  1.1087 +
  1.1088 +//------------------------------------------------------------------------------
  1.1089 +//
  1.1090 +//  printNodeStack     for debugging...
  1.1091 +//
  1.1092 +//------------------------------------------------------------------------------
  1.1093 +#ifdef RBBI_DEBUG
  1.1094 +void RBBIRuleScanner::printNodeStack(const char *title) {
  1.1095 +    int i;
  1.1096 +    RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
  1.1097 +    for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);}
  1.1098 +}
  1.1099 +#endif
  1.1100 +
  1.1101 +
  1.1102 +
  1.1103 +
  1.1104 +//------------------------------------------------------------------------------
  1.1105 +//
  1.1106 +//  pushNewNode   create a new RBBINode of the specified type and push it
  1.1107 +//                onto the stack of nodes.
  1.1108 +//
  1.1109 +//------------------------------------------------------------------------------
  1.1110 +RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {
  1.1111 +    fNodeStackPtr++;
  1.1112 +    if (fNodeStackPtr >= kStackSize) {
  1.1113 +        error(U_BRK_INTERNAL_ERROR);
  1.1114 +        RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
  1.1115 +        *fRB->fStatus = U_BRK_INTERNAL_ERROR;
  1.1116 +        return NULL;
  1.1117 +    }
  1.1118 +    fNodeStack[fNodeStackPtr] = new RBBINode(t);
  1.1119 +    if (fNodeStack[fNodeStackPtr] == NULL) {
  1.1120 +        *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
  1.1121 +    }
  1.1122 +    return fNodeStack[fNodeStackPtr];
  1.1123 +}
  1.1124 +
  1.1125 +
  1.1126 +
  1.1127 +//------------------------------------------------------------------------------
  1.1128 +//
  1.1129 +//  scanSet    Construct a UnicodeSet from the text at the current scan
  1.1130 +//             position.  Advance the scan position to the first character
  1.1131 +//             after the set.
  1.1132 +//
  1.1133 +//             A new RBBI setref node referring to the set is pushed onto the node
  1.1134 +//             stack.
  1.1135 +//
  1.1136 +//             The scan position is normally under the control of the state machine
  1.1137 +//             that controls rule parsing.  UnicodeSets, however, are parsed by
  1.1138 +//             the UnicodeSet constructor, not by the RBBI rule parser.
  1.1139 +//
  1.1140 +//------------------------------------------------------------------------------
  1.1141 +void RBBIRuleScanner::scanSet() {
  1.1142 +    UnicodeSet    *uset;
  1.1143 +    ParsePosition  pos;
  1.1144 +    int            startPos;
  1.1145 +    int            i;
  1.1146 +
  1.1147 +    if (U_FAILURE(*fRB->fStatus)) {
  1.1148 +        return;
  1.1149 +    }
  1.1150 +
  1.1151 +    pos.setIndex(fScanIndex);
  1.1152 +    startPos = fScanIndex;
  1.1153 +    UErrorCode localStatus = U_ZERO_ERROR;
  1.1154 +    uset = new UnicodeSet();
  1.1155 +    if (uset == NULL) {
  1.1156 +        localStatus = U_MEMORY_ALLOCATION_ERROR;
  1.1157 +    } else {
  1.1158 +        uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
  1.1159 +    }
  1.1160 +    if (U_FAILURE(localStatus)) {
  1.1161 +        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
  1.1162 +        //         UnicodeSet appears to not be reporting correctly at this time.
  1.1163 +        #ifdef RBBI_DEBUG
  1.1164 +            RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
  1.1165 +        #endif
  1.1166 +        error(localStatus);
  1.1167 +        delete uset;
  1.1168 +        return;
  1.1169 +    }
  1.1170 +
  1.1171 +    // Verify that the set contains at least one code point.
  1.1172 +    //
  1.1173 +    U_ASSERT(uset!=NULL);
  1.1174 +    if (uset->isEmpty()) {
  1.1175 +        // This set is empty.
  1.1176 +        //  Make it an error, because it almost certainly is not what the user wanted.
  1.1177 +        //  Also, avoids having to think about corner cases in the tree manipulation code
  1.1178 +        //   that occurs later on.
  1.1179 +        error(U_BRK_RULE_EMPTY_SET);
  1.1180 +        delete uset;
  1.1181 +        return;
  1.1182 +    }
  1.1183 +
  1.1184 +
  1.1185 +    // Advance the RBBI parse postion over the UnicodeSet pattern.
  1.1186 +    //   Don't just set fScanIndex because the line/char positions maintained
  1.1187 +    //   for error reporting would be thrown off.
  1.1188 +    i = pos.getIndex();
  1.1189 +    for (;;) {
  1.1190 +        if (fNextIndex >= i) {
  1.1191 +            break;
  1.1192 +        }
  1.1193 +        nextCharLL();
  1.1194 +    }
  1.1195 +
  1.1196 +    if (U_SUCCESS(*fRB->fStatus)) {
  1.1197 +        RBBINode         *n;
  1.1198 +
  1.1199 +        n = pushNewNode(RBBINode::setRef);
  1.1200 +        n->fFirstPos = startPos;
  1.1201 +        n->fLastPos  = fNextIndex;
  1.1202 +        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
  1.1203 +        //  findSetFor() serves several purposes here:
  1.1204 +        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
  1.1205 +        //     - Mantains collection of all sets in use, needed later for establishing
  1.1206 +        //          character categories for run time engine.
  1.1207 +        //     - Eliminates mulitiple instances of the same set.
  1.1208 +        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
  1.1209 +        findSetFor(n->fText, n, uset);
  1.1210 +    }
  1.1211 +
  1.1212 +}
  1.1213 +
  1.1214 +U_NAMESPACE_END
  1.1215 +
  1.1216 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial