intl/icu/source/common/rbbitblb.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbitblb.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1260 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (c) 2002-2009, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*/
    1.10 +//
    1.11 +//  rbbitblb.cpp
    1.12 +//
    1.13 +
    1.14 +
    1.15 +#include "unicode/utypes.h"
    1.16 +
    1.17 +#if !UCONFIG_NO_BREAK_ITERATION
    1.18 +
    1.19 +#include "unicode/unistr.h"
    1.20 +#include "rbbitblb.h"
    1.21 +#include "rbbirb.h"
    1.22 +#include "rbbisetb.h"
    1.23 +#include "rbbidata.h"
    1.24 +#include "cstring.h"
    1.25 +#include "uassert.h"
    1.26 +#include "cmemory.h"
    1.27 +
    1.28 +U_NAMESPACE_BEGIN
    1.29 +
    1.30 +RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
    1.31 + fTree(*rootNode) {
    1.32 +    fRB                 = rb;
    1.33 +    fStatus             = fRB->fStatus;
    1.34 +    UErrorCode status   = U_ZERO_ERROR;
    1.35 +    fDStates            = new UVector(status);
    1.36 +    if (U_FAILURE(*fStatus)) {
    1.37 +        return;
    1.38 +    }
    1.39 +    if (U_FAILURE(status)) {
    1.40 +        *fStatus = status;
    1.41 +        return;
    1.42 +    }
    1.43 +    if (fDStates == NULL) {
    1.44 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;;
    1.45 +    }
    1.46 +}
    1.47 +
    1.48 +
    1.49 +
    1.50 +RBBITableBuilder::~RBBITableBuilder() {
    1.51 +    int i;
    1.52 +    for (i=0; i<fDStates->size(); i++) {
    1.53 +        delete (RBBIStateDescriptor *)fDStates->elementAt(i);
    1.54 +    }
    1.55 +    delete   fDStates;
    1.56 +}
    1.57 +
    1.58 +
    1.59 +//-----------------------------------------------------------------------------
    1.60 +//
    1.61 +//   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
    1.62 +//                               table from the RBBI rules parse tree.
    1.63 +//
    1.64 +//-----------------------------------------------------------------------------
    1.65 +void  RBBITableBuilder::build() {
    1.66 +
    1.67 +    if (U_FAILURE(*fStatus)) {
    1.68 +        return;
    1.69 +    }
    1.70 +
    1.71 +    // If there were no rules, just return.  This situation can easily arise
    1.72 +    //   for the reverse rules.
    1.73 +    if (fTree==NULL) {
    1.74 +        return;
    1.75 +    }
    1.76 +
    1.77 +    //
    1.78 +    // Walk through the tree, replacing any references to $variables with a copy of the
    1.79 +    //   parse tree for the substition expression.
    1.80 +    //
    1.81 +    fTree = fTree->flattenVariables();
    1.82 +#ifdef RBBI_DEBUG
    1.83 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
    1.84 +        RBBIDebugPuts("Parse tree after flattening variable references.");
    1.85 +        fTree->printTree(TRUE);
    1.86 +    }
    1.87 +#endif
    1.88 +
    1.89 +    //
    1.90 +    // If the rules contained any references to {bof} 
    1.91 +    //   add a {bof} <cat> <former root of tree> to the
    1.92 +    //   tree.  Means that all matches must start out with the 
    1.93 +    //   {bof} fake character.
    1.94 +    // 
    1.95 +    if (fRB->fSetBuilder->sawBOF()) {
    1.96 +        RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
    1.97 +        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
    1.98 +        // Delete and exit if memory allocation failed.
    1.99 +        if (bofTop == NULL || bofLeaf == NULL) {
   1.100 +            *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.101 +            delete bofTop;
   1.102 +            delete bofLeaf;
   1.103 +            return;
   1.104 +        }
   1.105 +        bofTop->fLeftChild  = bofLeaf;
   1.106 +        bofTop->fRightChild = fTree;
   1.107 +        bofLeaf->fParent    = bofTop;
   1.108 +        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
   1.109 +        fTree               = bofTop;
   1.110 +    }
   1.111 +
   1.112 +    //
   1.113 +    // Add a unique right-end marker to the expression.
   1.114 +    //   Appears as a cat-node, left child being the original tree,
   1.115 +    //   right child being the end marker.
   1.116 +    //
   1.117 +    RBBINode *cn = new RBBINode(RBBINode::opCat);
   1.118 +    // Exit if memory allocation failed.
   1.119 +    if (cn == NULL) {
   1.120 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.121 +        return;
   1.122 +    }
   1.123 +    cn->fLeftChild = fTree;
   1.124 +    fTree->fParent = cn;
   1.125 +    cn->fRightChild = new RBBINode(RBBINode::endMark);
   1.126 +    // Delete and exit if memory allocation failed.
   1.127 +    if (cn->fRightChild == NULL) {
   1.128 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.129 +        delete cn;
   1.130 +        return;
   1.131 +    }
   1.132 +    cn->fRightChild->fParent = cn;
   1.133 +    fTree = cn;
   1.134 +
   1.135 +    //
   1.136 +    //  Replace all references to UnicodeSets with the tree for the equivalent
   1.137 +    //      expression.
   1.138 +    //
   1.139 +    fTree->flattenSets();
   1.140 +#ifdef RBBI_DEBUG
   1.141 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
   1.142 +        RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
   1.143 +        fTree->printTree(TRUE);
   1.144 +    }
   1.145 +#endif
   1.146 +
   1.147 +
   1.148 +    //
   1.149 +    // calculate the functions nullable, firstpos, lastpos and followpos on
   1.150 +    // nodes in the parse tree.
   1.151 +    //    See the alogrithm description in Aho.
   1.152 +    //    Understanding how this works by looking at the code alone will be
   1.153 +    //       nearly impossible.
   1.154 +    //
   1.155 +    calcNullable(fTree);
   1.156 +    calcFirstPos(fTree);
   1.157 +    calcLastPos(fTree);
   1.158 +    calcFollowPos(fTree);
   1.159 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
   1.160 +        RBBIDebugPuts("\n");
   1.161 +        printPosSets(fTree);
   1.162 +    }
   1.163 +
   1.164 +    //
   1.165 +    //  For "chained" rules, modify the followPos sets
   1.166 +    //
   1.167 +    if (fRB->fChainRules) {
   1.168 +        calcChainedFollowPos(fTree);
   1.169 +    }
   1.170 +
   1.171 +    //
   1.172 +    //  BOF (start of input) test fixup.
   1.173 +    //
   1.174 +    if (fRB->fSetBuilder->sawBOF()) {
   1.175 +        bofFixup();
   1.176 +    }
   1.177 +
   1.178 +    //
   1.179 +    // Build the DFA state transition tables.
   1.180 +    //
   1.181 +    buildStateTable();
   1.182 +    flagAcceptingStates();
   1.183 +    flagLookAheadStates();
   1.184 +    flagTaggedStates();
   1.185 +
   1.186 +    //
   1.187 +    // Update the global table of rule status {tag} values
   1.188 +    // The rule builder has a global vector of status values that are common
   1.189 +    //    for all tables.  Merge the ones from this table into the global set.
   1.190 +    //
   1.191 +    mergeRuleStatusVals();
   1.192 +
   1.193 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
   1.194 +}
   1.195 +
   1.196 +
   1.197 +
   1.198 +//-----------------------------------------------------------------------------
   1.199 +//
   1.200 +//   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
   1.201 +//
   1.202 +//-----------------------------------------------------------------------------
   1.203 +void RBBITableBuilder::calcNullable(RBBINode *n) {
   1.204 +    if (n == NULL) {
   1.205 +        return;
   1.206 +    }
   1.207 +    if (n->fType == RBBINode::setRef ||
   1.208 +        n->fType == RBBINode::endMark ) {
   1.209 +        // These are non-empty leaf node types.
   1.210 +        n->fNullable = FALSE;
   1.211 +        return;
   1.212 +    }
   1.213 +
   1.214 +    if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
   1.215 +        // Lookahead marker node.  It's a leaf, so no recursion on children.
   1.216 +        // It's nullable because it does not match any literal text from the input stream.
   1.217 +        n->fNullable = TRUE;
   1.218 +        return;
   1.219 +    }
   1.220 +
   1.221 +
   1.222 +    // The node is not a leaf.
   1.223 +    //  Calculate nullable on its children.
   1.224 +    calcNullable(n->fLeftChild);
   1.225 +    calcNullable(n->fRightChild);
   1.226 +
   1.227 +    // Apply functions from table 3.40 in Aho
   1.228 +    if (n->fType == RBBINode::opOr) {
   1.229 +        n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
   1.230 +    }
   1.231 +    else if (n->fType == RBBINode::opCat) {
   1.232 +        n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
   1.233 +    }
   1.234 +    else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
   1.235 +        n->fNullable = TRUE;
   1.236 +    }
   1.237 +    else {
   1.238 +        n->fNullable = FALSE;
   1.239 +    }
   1.240 +}
   1.241 +
   1.242 +
   1.243 +
   1.244 +
   1.245 +//-----------------------------------------------------------------------------
   1.246 +//
   1.247 +//   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
   1.248 +//
   1.249 +//-----------------------------------------------------------------------------
   1.250 +void RBBITableBuilder::calcFirstPos(RBBINode *n) {
   1.251 +    if (n == NULL) {
   1.252 +        return;
   1.253 +    }
   1.254 +    if (n->fType == RBBINode::leafChar  ||
   1.255 +        n->fType == RBBINode::endMark   ||
   1.256 +        n->fType == RBBINode::lookAhead ||
   1.257 +        n->fType == RBBINode::tag) {
   1.258 +        // These are non-empty leaf node types.
   1.259 +        // Note: In order to maintain the sort invariant on the set,
   1.260 +        // this function should only be called on a node whose set is
   1.261 +        // empty to start with.
   1.262 +        n->fFirstPosSet->addElement(n, *fStatus);
   1.263 +        return;
   1.264 +    }
   1.265 +
   1.266 +    // The node is not a leaf.
   1.267 +    //  Calculate firstPos on its children.
   1.268 +    calcFirstPos(n->fLeftChild);
   1.269 +    calcFirstPos(n->fRightChild);
   1.270 +
   1.271 +    // Apply functions from table 3.40 in Aho
   1.272 +    if (n->fType == RBBINode::opOr) {
   1.273 +        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
   1.274 +        setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
   1.275 +    }
   1.276 +    else if (n->fType == RBBINode::opCat) {
   1.277 +        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
   1.278 +        if (n->fLeftChild->fNullable) {
   1.279 +            setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
   1.280 +        }
   1.281 +    }
   1.282 +    else if (n->fType == RBBINode::opStar ||
   1.283 +             n->fType == RBBINode::opQuestion ||
   1.284 +             n->fType == RBBINode::opPlus) {
   1.285 +        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
   1.286 +    }
   1.287 +}
   1.288 +
   1.289 +
   1.290 +
   1.291 +//-----------------------------------------------------------------------------
   1.292 +//
   1.293 +//   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
   1.294 +//
   1.295 +//-----------------------------------------------------------------------------
   1.296 +void RBBITableBuilder::calcLastPos(RBBINode *n) {
   1.297 +    if (n == NULL) {
   1.298 +        return;
   1.299 +    }
   1.300 +    if (n->fType == RBBINode::leafChar  ||
   1.301 +        n->fType == RBBINode::endMark   ||
   1.302 +        n->fType == RBBINode::lookAhead ||
   1.303 +        n->fType == RBBINode::tag) {
   1.304 +        // These are non-empty leaf node types.
   1.305 +        // Note: In order to maintain the sort invariant on the set,
   1.306 +        // this function should only be called on a node whose set is
   1.307 +        // empty to start with.
   1.308 +        n->fLastPosSet->addElement(n, *fStatus);
   1.309 +        return;
   1.310 +    }
   1.311 +
   1.312 +    // The node is not a leaf.
   1.313 +    //  Calculate lastPos on its children.
   1.314 +    calcLastPos(n->fLeftChild);
   1.315 +    calcLastPos(n->fRightChild);
   1.316 +
   1.317 +    // Apply functions from table 3.40 in Aho
   1.318 +    if (n->fType == RBBINode::opOr) {
   1.319 +        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
   1.320 +        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
   1.321 +    }
   1.322 +    else if (n->fType == RBBINode::opCat) {
   1.323 +        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
   1.324 +        if (n->fRightChild->fNullable) {
   1.325 +            setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
   1.326 +        }
   1.327 +    }
   1.328 +    else if (n->fType == RBBINode::opStar     ||
   1.329 +             n->fType == RBBINode::opQuestion ||
   1.330 +             n->fType == RBBINode::opPlus) {
   1.331 +        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
   1.332 +    }
   1.333 +}
   1.334 +
   1.335 +
   1.336 +
   1.337 +//-----------------------------------------------------------------------------
   1.338 +//
   1.339 +//   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
   1.340 +//
   1.341 +//-----------------------------------------------------------------------------
   1.342 +void RBBITableBuilder::calcFollowPos(RBBINode *n) {
   1.343 +    if (n == NULL ||
   1.344 +        n->fType == RBBINode::leafChar ||
   1.345 +        n->fType == RBBINode::endMark) {
   1.346 +        return;
   1.347 +    }
   1.348 +
   1.349 +    calcFollowPos(n->fLeftChild);
   1.350 +    calcFollowPos(n->fRightChild);
   1.351 +
   1.352 +    // Aho rule #1
   1.353 +    if (n->fType == RBBINode::opCat) {
   1.354 +        RBBINode *i;   // is 'i' in Aho's description
   1.355 +        uint32_t     ix;
   1.356 +
   1.357 +        UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
   1.358 +
   1.359 +        for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
   1.360 +            i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
   1.361 +            setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
   1.362 +        }
   1.363 +    }
   1.364 +
   1.365 +    // Aho rule #2
   1.366 +    if (n->fType == RBBINode::opStar ||
   1.367 +        n->fType == RBBINode::opPlus) {
   1.368 +        RBBINode   *i;  // again, n and i are the names from Aho's description.
   1.369 +        uint32_t    ix;
   1.370 +
   1.371 +        for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
   1.372 +            i = (RBBINode *)n->fLastPosSet->elementAt(ix);
   1.373 +            setAdd(i->fFollowPos, n->fFirstPosSet);
   1.374 +        }
   1.375 +    }
   1.376 +
   1.377 +
   1.378 +
   1.379 +}
   1.380 +
   1.381 +
   1.382 +//-----------------------------------------------------------------------------
   1.383 +//
   1.384 +//   calcChainedFollowPos.    Modify the previously calculated followPos sets
   1.385 +//                            to implement rule chaining.  NOT described by Aho
   1.386 +//
   1.387 +//-----------------------------------------------------------------------------
   1.388 +void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
   1.389 +
   1.390 +    UVector         endMarkerNodes(*fStatus);
   1.391 +    UVector         leafNodes(*fStatus);
   1.392 +    int32_t         i;
   1.393 +
   1.394 +    if (U_FAILURE(*fStatus)) {
   1.395 +        return;
   1.396 +    }
   1.397 +
   1.398 +    // get a list of all endmarker nodes.
   1.399 +    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
   1.400 +
   1.401 +    // get a list all leaf nodes
   1.402 +    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
   1.403 +    if (U_FAILURE(*fStatus)) {
   1.404 +        return;
   1.405 +    }
   1.406 +
   1.407 +    // Get all nodes that can be the start a match, which is FirstPosition()
   1.408 +    // of the portion of the tree corresponding to user-written rules.
   1.409 +    // See the tree description in bofFixup().
   1.410 +    RBBINode *userRuleRoot = tree;
   1.411 +    if (fRB->fSetBuilder->sawBOF()) {
   1.412 +        userRuleRoot = tree->fLeftChild->fRightChild;
   1.413 +    }
   1.414 +    U_ASSERT(userRuleRoot != NULL);
   1.415 +    UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
   1.416 +
   1.417 +
   1.418 +    // Iteratate over all leaf nodes,
   1.419 +    //
   1.420 +    int32_t  endNodeIx;
   1.421 +    int32_t  startNodeIx;
   1.422 +
   1.423 +    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
   1.424 +        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
   1.425 +        RBBINode *endNode = NULL;
   1.426 +
   1.427 +        // Identify leaf nodes that correspond to overall rule match positions.
   1.428 +        //   These include an endMarkerNode in their followPos sets.
   1.429 +        for (i=0; i<endMarkerNodes.size(); i++) {
   1.430 +            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
   1.431 +                endNode = tNode;
   1.432 +                break;
   1.433 +            }
   1.434 +        }
   1.435 +        if (endNode == NULL) {
   1.436 +            // node wasn't an end node.  Try again with the next.
   1.437 +            continue;
   1.438 +        }
   1.439 +
   1.440 +        // We've got a node that can end a match.
   1.441 +
   1.442 +        // Line Break Specific hack:  If this node's val correspond to the $CM char class,
   1.443 +        //                            don't chain from it.
   1.444 +        // TODO:  Add rule syntax for this behavior, get specifics out of here and
   1.445 +        //        into the rule file.
   1.446 +        if (fRB->fLBCMNoChain) {
   1.447 +            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
   1.448 +            if (c != -1) {
   1.449 +                // c == -1 occurs with sets containing only the {eof} marker string.
   1.450 +                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
   1.451 +                if (cLBProp == U_LB_COMBINING_MARK) {
   1.452 +                    continue;
   1.453 +                }
   1.454 +            }
   1.455 +        }
   1.456 +
   1.457 +
   1.458 +        // Now iterate over the nodes that can start a match, looking for ones
   1.459 +        //   with the same char class as our ending node.
   1.460 +        RBBINode *startNode;
   1.461 +        for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
   1.462 +            startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
   1.463 +            if (startNode->fType != RBBINode::leafChar) {
   1.464 +                continue;
   1.465 +            }
   1.466 +
   1.467 +            if (endNode->fVal == startNode->fVal) {
   1.468 +                // The end val (character class) of one possible match is the
   1.469 +                //   same as the start of another.
   1.470 +
   1.471 +                // Add all nodes from the followPos of the start node to the
   1.472 +                //  followPos set of the end node, which will have the effect of
   1.473 +                //  letting matches transition from a match state at endNode
   1.474 +                //  to the second char of a match starting with startNode.
   1.475 +                setAdd(endNode->fFollowPos, startNode->fFollowPos);
   1.476 +            }
   1.477 +        }
   1.478 +    }
   1.479 +}
   1.480 +
   1.481 +
   1.482 +//-----------------------------------------------------------------------------
   1.483 +//
   1.484 +//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
   1.485 +//                Do an swizzle similar to chaining, modifying the followPos set of
   1.486 +//                the bofNode to include the followPos nodes from other {bot} nodes
   1.487 +//                scattered through the tree.
   1.488 +//
   1.489 +//                This function has much in common with calcChainedFollowPos().
   1.490 +//
   1.491 +//-----------------------------------------------------------------------------
   1.492 +void RBBITableBuilder::bofFixup() {
   1.493 +
   1.494 +    if (U_FAILURE(*fStatus)) {
   1.495 +        return;
   1.496 +    }
   1.497 +
   1.498 +    //   The parse tree looks like this ...
   1.499 +    //         fTree root  --->       <cat>
   1.500 +    //                               /     \       .
   1.501 +    //                            <cat>   <#end node>
   1.502 +    //                           /     \  .
   1.503 +    //                     <bofNode>   rest
   1.504 +    //                               of tree
   1.505 +    //
   1.506 +    //    We will be adding things to the followPos set of the <bofNode>
   1.507 +    //
   1.508 +    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
   1.509 +    U_ASSERT(bofNode->fType == RBBINode::leafChar);
   1.510 +    U_ASSERT(bofNode->fVal == 2);
   1.511 +
   1.512 +    // Get all nodes that can be the start a match of the user-written rules
   1.513 +    //  (excluding the fake bofNode)
   1.514 +    //  We want the nodes that can start a match in the
   1.515 +    //     part labeled "rest of tree"
   1.516 +    // 
   1.517 +    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
   1.518 +
   1.519 +    RBBINode *startNode;
   1.520 +    int       startNodeIx;
   1.521 +    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
   1.522 +        startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
   1.523 +        if (startNode->fType != RBBINode::leafChar) {
   1.524 +            continue;
   1.525 +        }
   1.526 +
   1.527 +        if (startNode->fVal == bofNode->fVal) {
   1.528 +            //  We found a leaf node corresponding to a {bof} that was
   1.529 +            //    explicitly written into a rule.
   1.530 +            //  Add everything from the followPos set of this node to the
   1.531 +            //    followPos set of the fake bofNode at the start of the tree.
   1.532 +            //  
   1.533 +            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
   1.534 +        }
   1.535 +    }
   1.536 +}
   1.537 +
   1.538 +//-----------------------------------------------------------------------------
   1.539 +//
   1.540 +//   buildStateTable()    Determine the set of runtime DFA states and the
   1.541 +//                        transition tables for these states, by the algorithm
   1.542 +//                        of fig. 3.44 in Aho.
   1.543 +//
   1.544 +//                        Most of the comments are quotes of Aho's psuedo-code.
   1.545 +//
   1.546 +//-----------------------------------------------------------------------------
   1.547 +void RBBITableBuilder::buildStateTable() {
   1.548 +    if (U_FAILURE(*fStatus)) {
   1.549 +        return;
   1.550 +    }
   1.551 +    RBBIStateDescriptor *failState;
   1.552 +    // Set it to NULL to avoid uninitialized warning
   1.553 +    RBBIStateDescriptor *initialState = NULL; 
   1.554 +    //
   1.555 +    // Add a dummy state 0 - the stop state.  Not from Aho.
   1.556 +    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
   1.557 +    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
   1.558 +    if (failState == NULL) {
   1.559 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.560 +        goto ExitBuildSTdeleteall;
   1.561 +    }
   1.562 +    failState->fPositions = new UVector(*fStatus);
   1.563 +    if (failState->fPositions == NULL) {
   1.564 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.565 +    }
   1.566 +    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
   1.567 +        goto ExitBuildSTdeleteall;
   1.568 +    }
   1.569 +    fDStates->addElement(failState, *fStatus);
   1.570 +    if (U_FAILURE(*fStatus)) {
   1.571 +        goto ExitBuildSTdeleteall;
   1.572 +    }
   1.573 +
   1.574 +    // initially, the only unmarked state in Dstates is firstpos(root),
   1.575 +    //       where toot is the root of the syntax tree for (r)#;
   1.576 +    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
   1.577 +    if (initialState == NULL) {
   1.578 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.579 +    }
   1.580 +    if (U_FAILURE(*fStatus)) {
   1.581 +        goto ExitBuildSTdeleteall;
   1.582 +    }
   1.583 +    initialState->fPositions = new UVector(*fStatus);
   1.584 +    if (initialState->fPositions == NULL) {
   1.585 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.586 +    }
   1.587 +    if (U_FAILURE(*fStatus)) {
   1.588 +        goto ExitBuildSTdeleteall;
   1.589 +    }
   1.590 +    setAdd(initialState->fPositions, fTree->fFirstPosSet);
   1.591 +    fDStates->addElement(initialState, *fStatus);
   1.592 +    if (U_FAILURE(*fStatus)) {
   1.593 +        goto ExitBuildSTdeleteall;
   1.594 +    }
   1.595 +
   1.596 +    // while there is an unmarked state T in Dstates do begin
   1.597 +    for (;;) {
   1.598 +        RBBIStateDescriptor *T = NULL;
   1.599 +        int32_t              tx;
   1.600 +        for (tx=1; tx<fDStates->size(); tx++) {
   1.601 +            RBBIStateDescriptor *temp;
   1.602 +            temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
   1.603 +            if (temp->fMarked == FALSE) {
   1.604 +                T = temp;
   1.605 +                break;
   1.606 +            }
   1.607 +        }
   1.608 +        if (T == NULL) {
   1.609 +            break;
   1.610 +        }
   1.611 +
   1.612 +        // mark T;
   1.613 +        T->fMarked = TRUE;
   1.614 +
   1.615 +        // for each input symbol a do begin
   1.616 +        int32_t  a;
   1.617 +        for (a = 1; a<=lastInputSymbol; a++) {
   1.618 +            // let U be the set of positions that are in followpos(p)
   1.619 +            //    for some position p in T
   1.620 +            //    such that the symbol at position p is a;
   1.621 +            UVector    *U = NULL;
   1.622 +            RBBINode   *p;
   1.623 +            int32_t     px;
   1.624 +            for (px=0; px<T->fPositions->size(); px++) {
   1.625 +                p = (RBBINode *)T->fPositions->elementAt(px);
   1.626 +                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
   1.627 +                    if (U == NULL) {
   1.628 +                        U = new UVector(*fStatus);
   1.629 +                        if (U == NULL) {
   1.630 +                        	*fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.631 +                        	goto ExitBuildSTdeleteall;
   1.632 +                        }
   1.633 +                    }
   1.634 +                    setAdd(U, p->fFollowPos);
   1.635 +                }
   1.636 +            }
   1.637 +
   1.638 +            // if U is not empty and not in DStates then
   1.639 +            int32_t  ux = 0;
   1.640 +            UBool    UinDstates = FALSE;
   1.641 +            if (U != NULL) {
   1.642 +                U_ASSERT(U->size() > 0);
   1.643 +                int  ix;
   1.644 +                for (ix=0; ix<fDStates->size(); ix++) {
   1.645 +                    RBBIStateDescriptor *temp2;
   1.646 +                    temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
   1.647 +                    if (setEquals(U, temp2->fPositions)) {
   1.648 +                        delete U;
   1.649 +                        U  = temp2->fPositions;
   1.650 +                        ux = ix;
   1.651 +                        UinDstates = TRUE;
   1.652 +                        break;
   1.653 +                    }
   1.654 +                }
   1.655 +
   1.656 +                // Add U as an unmarked state to Dstates
   1.657 +                if (!UinDstates)
   1.658 +                {
   1.659 +                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
   1.660 +                    if (newState == NULL) {
   1.661 +                    	*fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.662 +                    }
   1.663 +                    if (U_FAILURE(*fStatus)) {
   1.664 +                        goto ExitBuildSTdeleteall;
   1.665 +                    }
   1.666 +                    newState->fPositions = U;
   1.667 +                    fDStates->addElement(newState, *fStatus);
   1.668 +                    if (U_FAILURE(*fStatus)) {
   1.669 +                        return;
   1.670 +                    }
   1.671 +                    ux = fDStates->size()-1;
   1.672 +                }
   1.673 +
   1.674 +                // Dtran[T, a] := U;
   1.675 +                T->fDtran->setElementAt(ux, a);
   1.676 +            }
   1.677 +        }
   1.678 +    }
   1.679 +    return;
   1.680 +    // delete local pointers only if error occured.
   1.681 +ExitBuildSTdeleteall:
   1.682 +    delete initialState;
   1.683 +    delete failState;
   1.684 +}
   1.685 +
   1.686 +
   1.687 +
   1.688 +//-----------------------------------------------------------------------------
   1.689 +//
   1.690 +//   flagAcceptingStates    Identify accepting states.
   1.691 +//                          First get a list of all of the end marker nodes.
   1.692 +//                          Then, for each state s,
   1.693 +//                              if s contains one of the end marker nodes in its list of tree positions then
   1.694 +//                                  s is an accepting state.
   1.695 +//
   1.696 +//-----------------------------------------------------------------------------
   1.697 +void     RBBITableBuilder::flagAcceptingStates() {
   1.698 +    if (U_FAILURE(*fStatus)) {
   1.699 +        return;
   1.700 +    }
   1.701 +    UVector     endMarkerNodes(*fStatus);
   1.702 +    RBBINode    *endMarker;
   1.703 +    int32_t     i;
   1.704 +    int32_t     n;
   1.705 +
   1.706 +    if (U_FAILURE(*fStatus)) {
   1.707 +        return;
   1.708 +    }
   1.709 +
   1.710 +    fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
   1.711 +    if (U_FAILURE(*fStatus)) {
   1.712 +        return;
   1.713 +    }
   1.714 +
   1.715 +    for (i=0; i<endMarkerNodes.size(); i++) {
   1.716 +        endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
   1.717 +        for (n=0; n<fDStates->size(); n++) {
   1.718 +            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   1.719 +            if (sd->fPositions->indexOf(endMarker) >= 0) {
   1.720 +                // Any non-zero value for fAccepting means this is an accepting node.
   1.721 +                // The value is what will be returned to the user as the break status.
   1.722 +                // If no other value was specified, force it to -1.
   1.723 +
   1.724 +                if (sd->fAccepting==0) {
   1.725 +                    // State hasn't been marked as accepting yet.  Do it now.
   1.726 +                    sd->fAccepting = endMarker->fVal;
   1.727 +                    if (sd->fAccepting == 0) {
   1.728 +                        sd->fAccepting = -1;
   1.729 +                    }
   1.730 +                }
   1.731 +                if (sd->fAccepting==-1 && endMarker->fVal != 0) {
   1.732 +                    // Both lookahead and non-lookahead accepting for this state.
   1.733 +                    // Favor the look-ahead.  Expedient for line break.
   1.734 +                    // TODO:  need a more elegant resolution for conflicting rules.
   1.735 +                    sd->fAccepting = endMarker->fVal;
   1.736 +                }
   1.737 +                // implicit else:
   1.738 +                // if sd->fAccepting already had a value other than 0 or -1, leave it be.
   1.739 +
   1.740 +                // If the end marker node is from a look-ahead rule, set
   1.741 +                //   the fLookAhead field or this state also.
   1.742 +                if (endMarker->fLookAheadEnd) {
   1.743 +                    // TODO:  don't change value if already set?
   1.744 +                    // TODO:  allow for more than one active look-ahead rule in engine.
   1.745 +                    //        Make value here an index to a side array in engine?
   1.746 +                    sd->fLookAhead = sd->fAccepting;
   1.747 +                }
   1.748 +            }
   1.749 +        }
   1.750 +    }
   1.751 +}
   1.752 +
   1.753 +
   1.754 +//-----------------------------------------------------------------------------
   1.755 +//
   1.756 +//    flagLookAheadStates   Very similar to flagAcceptingStates, above.
   1.757 +//
   1.758 +//-----------------------------------------------------------------------------
   1.759 +void     RBBITableBuilder::flagLookAheadStates() {
   1.760 +    if (U_FAILURE(*fStatus)) {
   1.761 +        return;
   1.762 +    }
   1.763 +    UVector     lookAheadNodes(*fStatus);
   1.764 +    RBBINode    *lookAheadNode;
   1.765 +    int32_t     i;
   1.766 +    int32_t     n;
   1.767 +
   1.768 +    fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
   1.769 +    if (U_FAILURE(*fStatus)) {
   1.770 +        return;
   1.771 +    }
   1.772 +    for (i=0; i<lookAheadNodes.size(); i++) {
   1.773 +        lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
   1.774 +
   1.775 +        for (n=0; n<fDStates->size(); n++) {
   1.776 +            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   1.777 +            if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
   1.778 +                sd->fLookAhead = lookAheadNode->fVal;
   1.779 +            }
   1.780 +        }
   1.781 +    }
   1.782 +}
   1.783 +
   1.784 +
   1.785 +
   1.786 +
   1.787 +//-----------------------------------------------------------------------------
   1.788 +//
   1.789 +//    flagTaggedStates
   1.790 +//
   1.791 +//-----------------------------------------------------------------------------
   1.792 +void     RBBITableBuilder::flagTaggedStates() {
   1.793 +    if (U_FAILURE(*fStatus)) {
   1.794 +        return;
   1.795 +    }
   1.796 +    UVector     tagNodes(*fStatus);
   1.797 +    RBBINode    *tagNode;
   1.798 +    int32_t     i;
   1.799 +    int32_t     n;
   1.800 +
   1.801 +    if (U_FAILURE(*fStatus)) {
   1.802 +        return;
   1.803 +    }
   1.804 +    fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
   1.805 +    if (U_FAILURE(*fStatus)) {
   1.806 +        return;
   1.807 +    }
   1.808 +    for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
   1.809 +        tagNode = (RBBINode *)tagNodes.elementAt(i);
   1.810 +
   1.811 +        for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
   1.812 +            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   1.813 +            if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
   1.814 +                sortedAdd(&sd->fTagVals, tagNode->fVal);
   1.815 +            }
   1.816 +        }
   1.817 +    }
   1.818 +}
   1.819 +
   1.820 +
   1.821 +
   1.822 +
   1.823 +//-----------------------------------------------------------------------------
   1.824 +//
   1.825 +//  mergeRuleStatusVals
   1.826 +//
   1.827 +//      Update the global table of rule status {tag} values
   1.828 +//      The rule builder has a global vector of status values that are common
   1.829 +//      for all tables.  Merge the ones from this table into the global set.
   1.830 +//
   1.831 +//-----------------------------------------------------------------------------
   1.832 +void  RBBITableBuilder::mergeRuleStatusVals() {
   1.833 +    //
   1.834 +    //  The basic outline of what happens here is this...
   1.835 +    //
   1.836 +    //    for each state in this state table
   1.837 +    //       if the status tag list for this state is in the global statuses list
   1.838 +    //           record where and
   1.839 +    //           continue with the next state
   1.840 +    //       else
   1.841 +    //           add the tag list for this state to the global list.
   1.842 +    //
   1.843 +    int i;
   1.844 +    int n;
   1.845 +
   1.846 +    // Pre-set a single tag of {0} into the table.
   1.847 +    //   We will need this as a default, for rule sets with no explicit tagging.
   1.848 +    if (fRB->fRuleStatusVals->size() == 0) {
   1.849 +        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
   1.850 +        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
   1.851 +    }
   1.852 +
   1.853 +    //    For each state
   1.854 +    for (n=0; n<fDStates->size(); n++) {
   1.855 +        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   1.856 +        UVector *thisStatesTagValues = sd->fTagVals;
   1.857 +        if (thisStatesTagValues == NULL) {
   1.858 +            // No tag values are explicitly associated with this state.
   1.859 +            //   Set the default tag value.
   1.860 +            sd->fTagsIdx = 0;
   1.861 +            continue;
   1.862 +        }
   1.863 +
   1.864 +        // There are tag(s) associated with this state.
   1.865 +        //   fTagsIdx will be the index into the global tag list for this state's tag values.
   1.866 +        //   Initial value of -1 flags that we haven't got it set yet.
   1.867 +        sd->fTagsIdx = -1;
   1.868 +        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
   1.869 +        int32_t  nextTagGroupStart = 0;
   1.870 +
   1.871 +        // Loop runs once per group of tags in the global list
   1.872 +        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
   1.873 +            thisTagGroupStart = nextTagGroupStart;
   1.874 +            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
   1.875 +            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
   1.876 +                // The number of tags for this state is different from
   1.877 +                //    the number of tags in this group from the global list.
   1.878 +                //    Continue with the next group from the global list.
   1.879 +                continue;
   1.880 +            }
   1.881 +            // The lengths match, go ahead and compare the actual tag values
   1.882 +            //    between this state and the group from the global list.
   1.883 +            for (i=0; i<thisStatesTagValues->size(); i++) {
   1.884 +                if (thisStatesTagValues->elementAti(i) !=
   1.885 +                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
   1.886 +                    // Mismatch.
   1.887 +                    break;
   1.888 +                }
   1.889 +            }
   1.890 +
   1.891 +            if (i == thisStatesTagValues->size()) {
   1.892 +                // We found a set of tag values in the global list that match
   1.893 +                //   those for this state.  Use them.
   1.894 +                sd->fTagsIdx = thisTagGroupStart;
   1.895 +                break;
   1.896 +            }
   1.897 +        }
   1.898 +
   1.899 +        if (sd->fTagsIdx == -1) {
   1.900 +            // No suitable entry in the global tag list already.  Add one
   1.901 +            sd->fTagsIdx = fRB->fRuleStatusVals->size();
   1.902 +            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
   1.903 +            for (i=0; i<thisStatesTagValues->size(); i++) {
   1.904 +                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
   1.905 +            }
   1.906 +        }
   1.907 +    }
   1.908 +}
   1.909 +
   1.910 +
   1.911 +
   1.912 +
   1.913 +
   1.914 +
   1.915 +
   1.916 +//-----------------------------------------------------------------------------
   1.917 +//
   1.918 +//  sortedAdd  Add a value to a vector of sorted values (ints).
   1.919 +//             Do not replicate entries; if the value is already there, do not
   1.920 +//                add a second one.
   1.921 +//             Lazily create the vector if it does not already exist.
   1.922 +//
   1.923 +//-----------------------------------------------------------------------------
   1.924 +void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
   1.925 +    int32_t i;
   1.926 +
   1.927 +    if (*vector == NULL) {
   1.928 +        *vector = new UVector(*fStatus);
   1.929 +    }
   1.930 +    if (*vector == NULL || U_FAILURE(*fStatus)) {
   1.931 +        return;
   1.932 +    }
   1.933 +    UVector *vec = *vector;
   1.934 +    int32_t  vSize = vec->size();
   1.935 +    for (i=0; i<vSize; i++) {
   1.936 +        int32_t valAtI = vec->elementAti(i);
   1.937 +        if (valAtI == val) {
   1.938 +            // The value is already in the vector.  Don't add it again.
   1.939 +            return;
   1.940 +        }
   1.941 +        if (valAtI > val) {
   1.942 +            break;
   1.943 +        }
   1.944 +    }
   1.945 +    vec->insertElementAt(val, i, *fStatus);
   1.946 +}
   1.947 +
   1.948 +
   1.949 +
   1.950 +//-----------------------------------------------------------------------------
   1.951 +//
   1.952 +//  setAdd     Set operation on UVector
   1.953 +//             dest = dest union source
   1.954 +//             Elements may only appear once and must be sorted.
   1.955 +//
   1.956 +//-----------------------------------------------------------------------------
   1.957 +void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
   1.958 +    int32_t destOriginalSize = dest->size();
   1.959 +    int32_t sourceSize       = source->size();
   1.960 +    int32_t di           = 0;
   1.961 +    MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
   1.962 +    void **destPtr, **sourcePtr;
   1.963 +    void **destLim, **sourceLim;
   1.964 +
   1.965 +    if (destOriginalSize > destArray.getCapacity()) {
   1.966 +        if (destArray.resize(destOriginalSize) == NULL) {
   1.967 +            return;
   1.968 +        }
   1.969 +    }
   1.970 +    destPtr = destArray.getAlias();
   1.971 +    destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
   1.972 +
   1.973 +    if (sourceSize > sourceArray.getCapacity()) {
   1.974 +        if (sourceArray.resize(sourceSize) == NULL) {
   1.975 +            return;
   1.976 +        }
   1.977 +    }
   1.978 +    sourcePtr = sourceArray.getAlias();
   1.979 +    sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
   1.980 +
   1.981 +    // Avoid multiple "get element" calls by getting the contents into arrays
   1.982 +    (void) dest->toArray(destPtr);
   1.983 +    (void) source->toArray(sourcePtr);
   1.984 +
   1.985 +    dest->setSize(sourceSize+destOriginalSize, *fStatus);
   1.986 +
   1.987 +    while (sourcePtr < sourceLim && destPtr < destLim) {
   1.988 +        if (*destPtr == *sourcePtr) {
   1.989 +            dest->setElementAt(*sourcePtr++, di++);
   1.990 +            destPtr++;
   1.991 +        }
   1.992 +        // This check is required for machines with segmented memory, like i5/OS.
   1.993 +        // Direct pointer comparison is not recommended.
   1.994 +        else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
   1.995 +            dest->setElementAt(*destPtr++, di++);
   1.996 +        }
   1.997 +        else { /* *sourcePtr < *destPtr */
   1.998 +            dest->setElementAt(*sourcePtr++, di++);
   1.999 +        }
  1.1000 +    }
  1.1001 +
  1.1002 +    // At most one of these two cleanup loops will execute
  1.1003 +    while (destPtr < destLim) {
  1.1004 +        dest->setElementAt(*destPtr++, di++);
  1.1005 +    }
  1.1006 +    while (sourcePtr < sourceLim) {
  1.1007 +        dest->setElementAt(*sourcePtr++, di++);
  1.1008 +    }
  1.1009 +
  1.1010 +    dest->setSize(di, *fStatus);
  1.1011 +}
  1.1012 +
  1.1013 +
  1.1014 +
  1.1015 +//-----------------------------------------------------------------------------
  1.1016 +//
  1.1017 +//  setEqual    Set operation on UVector.
  1.1018 +//              Compare for equality.
  1.1019 +//              Elements must be sorted.
  1.1020 +//
  1.1021 +//-----------------------------------------------------------------------------
  1.1022 +UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
  1.1023 +    return a->equals(*b);
  1.1024 +}
  1.1025 +
  1.1026 +
  1.1027 +//-----------------------------------------------------------------------------
  1.1028 +//
  1.1029 +//  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
  1.1030 +//                 for each node in the tree.
  1.1031 +//
  1.1032 +//-----------------------------------------------------------------------------
  1.1033 +#ifdef RBBI_DEBUG
  1.1034 +void RBBITableBuilder::printPosSets(RBBINode *n) {
  1.1035 +    if (n==NULL) {
  1.1036 +        return;
  1.1037 +    }
  1.1038 +    n->printNode();
  1.1039 +    RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
  1.1040 +
  1.1041 +    RBBIDebugPrintf("         firstpos:  ");
  1.1042 +    printSet(n->fFirstPosSet);
  1.1043 +
  1.1044 +    RBBIDebugPrintf("         lastpos:   ");
  1.1045 +    printSet(n->fLastPosSet);
  1.1046 +
  1.1047 +    RBBIDebugPrintf("         followpos: ");
  1.1048 +    printSet(n->fFollowPos);
  1.1049 +
  1.1050 +    printPosSets(n->fLeftChild);
  1.1051 +    printPosSets(n->fRightChild);
  1.1052 +}
  1.1053 +#endif
  1.1054 +
  1.1055 +
  1.1056 +
  1.1057 +//-----------------------------------------------------------------------------
  1.1058 +//
  1.1059 +//   getTableSize()    Calculate the size of the runtime form of this
  1.1060 +//                     state transition table.
  1.1061 +//
  1.1062 +//-----------------------------------------------------------------------------
  1.1063 +int32_t  RBBITableBuilder::getTableSize() const {
  1.1064 +    int32_t    size = 0;
  1.1065 +    int32_t    numRows;
  1.1066 +    int32_t    numCols;
  1.1067 +    int32_t    rowSize;
  1.1068 +
  1.1069 +    if (fTree == NULL) {
  1.1070 +        return 0;
  1.1071 +    }
  1.1072 +
  1.1073 +    size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
  1.1074 +
  1.1075 +    numRows = fDStates->size();
  1.1076 +    numCols = fRB->fSetBuilder->getNumCharCategories();
  1.1077 +
  1.1078 +    //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
  1.1079 +    //        Therefore we subtract two from numCols when determining
  1.1080 +    //        how much storage to add to a row for the total columns.
  1.1081 +    rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
  1.1082 +    size   += numRows * rowSize;
  1.1083 +    return size;
  1.1084 +}
  1.1085 +
  1.1086 +
  1.1087 +
  1.1088 +//-----------------------------------------------------------------------------
  1.1089 +//
  1.1090 +//   exportTable()    export the state transition table in the format required
  1.1091 +//                    by the runtime engine.  getTableSize() bytes of memory
  1.1092 +//                    must be available at the output address "where".
  1.1093 +//
  1.1094 +//-----------------------------------------------------------------------------
  1.1095 +void RBBITableBuilder::exportTable(void *where) {
  1.1096 +    RBBIStateTable    *table = (RBBIStateTable *)where;
  1.1097 +    uint32_t           state;
  1.1098 +    int                col;
  1.1099 +
  1.1100 +    if (U_FAILURE(*fStatus) || fTree == NULL) {
  1.1101 +        return;
  1.1102 +    }
  1.1103 +
  1.1104 +    if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
  1.1105 +        fDStates->size() > 0x7fff) {
  1.1106 +        *fStatus = U_BRK_INTERNAL_ERROR;
  1.1107 +        return;
  1.1108 +    }
  1.1109 +
  1.1110 +    table->fRowLen    = sizeof(RBBIStateTableRow) +
  1.1111 +                            sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
  1.1112 +    table->fNumStates = fDStates->size();
  1.1113 +    table->fFlags     = 0;
  1.1114 +    if (fRB->fLookAheadHardBreak) {
  1.1115 +        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
  1.1116 +    }
  1.1117 +    if (fRB->fSetBuilder->sawBOF()) {
  1.1118 +        table->fFlags  |= RBBI_BOF_REQUIRED;
  1.1119 +    }
  1.1120 +    table->fReserved  = 0;
  1.1121 +
  1.1122 +    for (state=0; state<table->fNumStates; state++) {
  1.1123 +        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
  1.1124 +        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
  1.1125 +        U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
  1.1126 +        U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
  1.1127 +        row->fAccepting = (int16_t)sd->fAccepting;
  1.1128 +        row->fLookAhead = (int16_t)sd->fLookAhead;
  1.1129 +        row->fTagIdx    = (int16_t)sd->fTagsIdx;
  1.1130 +        for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
  1.1131 +            row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
  1.1132 +        }
  1.1133 +    }
  1.1134 +}
  1.1135 +
  1.1136 +
  1.1137 +
  1.1138 +//-----------------------------------------------------------------------------
  1.1139 +//
  1.1140 +//   printSet    Debug function.   Print the contents of a UVector
  1.1141 +//
  1.1142 +//-----------------------------------------------------------------------------
  1.1143 +#ifdef RBBI_DEBUG
  1.1144 +void RBBITableBuilder::printSet(UVector *s) {
  1.1145 +    int32_t  i;
  1.1146 +    for (i=0; i<s->size(); i++) {
  1.1147 +        void *v = s->elementAt(i);
  1.1148 +        RBBIDebugPrintf("%10p", v);
  1.1149 +    }
  1.1150 +    RBBIDebugPrintf("\n");
  1.1151 +}
  1.1152 +#endif
  1.1153 +
  1.1154 +
  1.1155 +//-----------------------------------------------------------------------------
  1.1156 +//
  1.1157 +//   printStates    Debug Function.  Dump the fully constructed state transition table.
  1.1158 +//
  1.1159 +//-----------------------------------------------------------------------------
  1.1160 +#ifdef RBBI_DEBUG
  1.1161 +void RBBITableBuilder::printStates() {
  1.1162 +    int     c;    // input "character"
  1.1163 +    int     n;    // state number
  1.1164 +
  1.1165 +    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
  1.1166 +    RBBIDebugPrintf("      | Acc  LA    Tag");
  1.1167 +    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
  1.1168 +        RBBIDebugPrintf(" %2d", c);
  1.1169 +    }
  1.1170 +    RBBIDebugPrintf("\n");
  1.1171 +    RBBIDebugPrintf("      |---------------");
  1.1172 +    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
  1.1173 +        RBBIDebugPrintf("---");
  1.1174 +    }
  1.1175 +    RBBIDebugPrintf("\n");
  1.1176 +
  1.1177 +    for (n=0; n<fDStates->size(); n++) {
  1.1178 +        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
  1.1179 +        RBBIDebugPrintf("  %3d | " , n);
  1.1180 +        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
  1.1181 +        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
  1.1182 +            RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
  1.1183 +        }
  1.1184 +        RBBIDebugPrintf("\n");
  1.1185 +    }
  1.1186 +    RBBIDebugPrintf("\n\n");
  1.1187 +}
  1.1188 +#endif
  1.1189 +
  1.1190 +
  1.1191 +
  1.1192 +//-----------------------------------------------------------------------------
  1.1193 +//
  1.1194 +//   printRuleStatusTable    Debug Function.  Dump the common rule status table
  1.1195 +//
  1.1196 +//-----------------------------------------------------------------------------
  1.1197 +#ifdef RBBI_DEBUG
  1.1198 +void RBBITableBuilder::printRuleStatusTable() {
  1.1199 +    int32_t  thisRecord = 0;
  1.1200 +    int32_t  nextRecord = 0;
  1.1201 +    int      i;
  1.1202 +    UVector  *tbl = fRB->fRuleStatusVals;
  1.1203 +
  1.1204 +    RBBIDebugPrintf("index |  tags \n");
  1.1205 +    RBBIDebugPrintf("-------------------\n");
  1.1206 +
  1.1207 +    while (nextRecord < tbl->size()) {
  1.1208 +        thisRecord = nextRecord;
  1.1209 +        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
  1.1210 +        RBBIDebugPrintf("%4d   ", thisRecord);
  1.1211 +        for (i=thisRecord+1; i<nextRecord; i++) {
  1.1212 +            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
  1.1213 +        }
  1.1214 +        RBBIDebugPrintf("\n");
  1.1215 +    }
  1.1216 +    RBBIDebugPrintf("\n\n");
  1.1217 +}
  1.1218 +#endif
  1.1219 +
  1.1220 +
  1.1221 +//-----------------------------------------------------------------------------
  1.1222 +//
  1.1223 +//   RBBIStateDescriptor     Methods.  This is a very struct-like class
  1.1224 +//                           Most access is directly to the fields.
  1.1225 +//
  1.1226 +//-----------------------------------------------------------------------------
  1.1227 +
  1.1228 +RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
  1.1229 +    fMarked    = FALSE;
  1.1230 +    fAccepting = 0;
  1.1231 +    fLookAhead = 0;
  1.1232 +    fTagsIdx   = 0;
  1.1233 +    fTagVals   = NULL;
  1.1234 +    fPositions = NULL;
  1.1235 +    fDtran     = NULL;
  1.1236 +
  1.1237 +    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
  1.1238 +    if (U_FAILURE(*fStatus)) {
  1.1239 +        return;
  1.1240 +    }
  1.1241 +    if (fDtran == NULL) {
  1.1242 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
  1.1243 +        return;
  1.1244 +    }
  1.1245 +    fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
  1.1246 +                                           //   It is indexed by input symbols, and will
  1.1247 +                                           //   hold  the next state number for each
  1.1248 +                                           //   symbol.
  1.1249 +}
  1.1250 +
  1.1251 +
  1.1252 +RBBIStateDescriptor::~RBBIStateDescriptor() {
  1.1253 +    delete       fPositions;
  1.1254 +    delete       fDtran;
  1.1255 +    delete       fTagVals;
  1.1256 +    fPositions = NULL;
  1.1257 +    fDtran     = NULL;
  1.1258 +    fTagVals   = NULL;
  1.1259 +}
  1.1260 +
  1.1261 +U_NAMESPACE_END
  1.1262 +
  1.1263 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial