intl/icu/source/common/rbbitblb.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 **********************************************************************
     3 *   Copyright (c) 2002-2009, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 */
     7 //
     8 //  rbbitblb.cpp
     9 //
    12 #include "unicode/utypes.h"
    14 #if !UCONFIG_NO_BREAK_ITERATION
    16 #include "unicode/unistr.h"
    17 #include "rbbitblb.h"
    18 #include "rbbirb.h"
    19 #include "rbbisetb.h"
    20 #include "rbbidata.h"
    21 #include "cstring.h"
    22 #include "uassert.h"
    23 #include "cmemory.h"
    25 U_NAMESPACE_BEGIN
    27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
    28  fTree(*rootNode) {
    29     fRB                 = rb;
    30     fStatus             = fRB->fStatus;
    31     UErrorCode status   = U_ZERO_ERROR;
    32     fDStates            = new UVector(status);
    33     if (U_FAILURE(*fStatus)) {
    34         return;
    35     }
    36     if (U_FAILURE(status)) {
    37         *fStatus = status;
    38         return;
    39     }
    40     if (fDStates == NULL) {
    41         *fStatus = U_MEMORY_ALLOCATION_ERROR;;
    42     }
    43 }
    47 RBBITableBuilder::~RBBITableBuilder() {
    48     int i;
    49     for (i=0; i<fDStates->size(); i++) {
    50         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
    51     }
    52     delete   fDStates;
    53 }
    56 //-----------------------------------------------------------------------------
    57 //
    58 //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
    59 //                               table from the RBBI rules parse tree.
    60 //
    61 //-----------------------------------------------------------------------------
    62 void  RBBITableBuilder::build() {
    64     if (U_FAILURE(*fStatus)) {
    65         return;
    66     }
    68     // If there were no rules, just return.  This situation can easily arise
    69     //   for the reverse rules.
    70     if (fTree==NULL) {
    71         return;
    72     }
    74     //
    75     // Walk through the tree, replacing any references to $variables with a copy of the
    76     //   parse tree for the substition expression.
    77     //
    78     fTree = fTree->flattenVariables();
    79 #ifdef RBBI_DEBUG
    80     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
    81         RBBIDebugPuts("Parse tree after flattening variable references.");
    82         fTree->printTree(TRUE);
    83     }
    84 #endif
    86     //
    87     // If the rules contained any references to {bof} 
    88     //   add a {bof} <cat> <former root of tree> to the
    89     //   tree.  Means that all matches must start out with the 
    90     //   {bof} fake character.
    91     // 
    92     if (fRB->fSetBuilder->sawBOF()) {
    93         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
    94         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
    95         // Delete and exit if memory allocation failed.
    96         if (bofTop == NULL || bofLeaf == NULL) {
    97             *fStatus = U_MEMORY_ALLOCATION_ERROR;
    98             delete bofTop;
    99             delete bofLeaf;
   100             return;
   101         }
   102         bofTop->fLeftChild  = bofLeaf;
   103         bofTop->fRightChild = fTree;
   104         bofLeaf->fParent    = bofTop;
   105         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
   106         fTree               = bofTop;
   107     }
   109     //
   110     // Add a unique right-end marker to the expression.
   111     //   Appears as a cat-node, left child being the original tree,
   112     //   right child being the end marker.
   113     //
   114     RBBINode *cn = new RBBINode(RBBINode::opCat);
   115     // Exit if memory allocation failed.
   116     if (cn == NULL) {
   117         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   118         return;
   119     }
   120     cn->fLeftChild = fTree;
   121     fTree->fParent = cn;
   122     cn->fRightChild = new RBBINode(RBBINode::endMark);
   123     // Delete and exit if memory allocation failed.
   124     if (cn->fRightChild == NULL) {
   125         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   126         delete cn;
   127         return;
   128     }
   129     cn->fRightChild->fParent = cn;
   130     fTree = cn;
   132     //
   133     //  Replace all references to UnicodeSets with the tree for the equivalent
   134     //      expression.
   135     //
   136     fTree->flattenSets();
   137 #ifdef RBBI_DEBUG
   138     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
   139         RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
   140         fTree->printTree(TRUE);
   141     }
   142 #endif
   145     //
   146     // calculate the functions nullable, firstpos, lastpos and followpos on
   147     // nodes in the parse tree.
   148     //    See the alogrithm description in Aho.
   149     //    Understanding how this works by looking at the code alone will be
   150     //       nearly impossible.
   151     //
   152     calcNullable(fTree);
   153     calcFirstPos(fTree);
   154     calcLastPos(fTree);
   155     calcFollowPos(fTree);
   156     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
   157         RBBIDebugPuts("\n");
   158         printPosSets(fTree);
   159     }
   161     //
   162     //  For "chained" rules, modify the followPos sets
   163     //
   164     if (fRB->fChainRules) {
   165         calcChainedFollowPos(fTree);
   166     }
   168     //
   169     //  BOF (start of input) test fixup.
   170     //
   171     if (fRB->fSetBuilder->sawBOF()) {
   172         bofFixup();
   173     }
   175     //
   176     // Build the DFA state transition tables.
   177     //
   178     buildStateTable();
   179     flagAcceptingStates();
   180     flagLookAheadStates();
   181     flagTaggedStates();
   183     //
   184     // Update the global table of rule status {tag} values
   185     // The rule builder has a global vector of status values that are common
   186     //    for all tables.  Merge the ones from this table into the global set.
   187     //
   188     mergeRuleStatusVals();
   190     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
   191 }
   195 //-----------------------------------------------------------------------------
   196 //
   197 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
   198 //
   199 //-----------------------------------------------------------------------------
   200 void RBBITableBuilder::calcNullable(RBBINode *n) {
   201     if (n == NULL) {
   202         return;
   203     }
   204     if (n->fType == RBBINode::setRef ||
   205         n->fType == RBBINode::endMark ) {
   206         // These are non-empty leaf node types.
   207         n->fNullable = FALSE;
   208         return;
   209     }
   211     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
   212         // Lookahead marker node.  It's a leaf, so no recursion on children.
   213         // It's nullable because it does not match any literal text from the input stream.
   214         n->fNullable = TRUE;
   215         return;
   216     }
   219     // The node is not a leaf.
   220     //  Calculate nullable on its children.
   221     calcNullable(n->fLeftChild);
   222     calcNullable(n->fRightChild);
   224     // Apply functions from table 3.40 in Aho
   225     if (n->fType == RBBINode::opOr) {
   226         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
   227     }
   228     else if (n->fType == RBBINode::opCat) {
   229         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
   230     }
   231     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
   232         n->fNullable = TRUE;
   233     }
   234     else {
   235         n->fNullable = FALSE;
   236     }
   237 }
   242 //-----------------------------------------------------------------------------
   243 //
   244 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
   245 //
   246 //-----------------------------------------------------------------------------
   247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
   248     if (n == NULL) {
   249         return;
   250     }
   251     if (n->fType == RBBINode::leafChar  ||
   252         n->fType == RBBINode::endMark   ||
   253         n->fType == RBBINode::lookAhead ||
   254         n->fType == RBBINode::tag) {
   255         // These are non-empty leaf node types.
   256         // Note: In order to maintain the sort invariant on the set,
   257         // this function should only be called on a node whose set is
   258         // empty to start with.
   259         n->fFirstPosSet->addElement(n, *fStatus);
   260         return;
   261     }
   263     // The node is not a leaf.
   264     //  Calculate firstPos on its children.
   265     calcFirstPos(n->fLeftChild);
   266     calcFirstPos(n->fRightChild);
   268     // Apply functions from table 3.40 in Aho
   269     if (n->fType == RBBINode::opOr) {
   270         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
   271         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
   272     }
   273     else if (n->fType == RBBINode::opCat) {
   274         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
   275         if (n->fLeftChild->fNullable) {
   276             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
   277         }
   278     }
   279     else if (n->fType == RBBINode::opStar ||
   280              n->fType == RBBINode::opQuestion ||
   281              n->fType == RBBINode::opPlus) {
   282         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
   283     }
   284 }
   288 //-----------------------------------------------------------------------------
   289 //
   290 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
   291 //
   292 //-----------------------------------------------------------------------------
   293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
   294     if (n == NULL) {
   295         return;
   296     }
   297     if (n->fType == RBBINode::leafChar  ||
   298         n->fType == RBBINode::endMark   ||
   299         n->fType == RBBINode::lookAhead ||
   300         n->fType == RBBINode::tag) {
   301         // These are non-empty leaf node types.
   302         // Note: In order to maintain the sort invariant on the set,
   303         // this function should only be called on a node whose set is
   304         // empty to start with.
   305         n->fLastPosSet->addElement(n, *fStatus);
   306         return;
   307     }
   309     // The node is not a leaf.
   310     //  Calculate lastPos on its children.
   311     calcLastPos(n->fLeftChild);
   312     calcLastPos(n->fRightChild);
   314     // Apply functions from table 3.40 in Aho
   315     if (n->fType == RBBINode::opOr) {
   316         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
   317         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
   318     }
   319     else if (n->fType == RBBINode::opCat) {
   320         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
   321         if (n->fRightChild->fNullable) {
   322             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
   323         }
   324     }
   325     else if (n->fType == RBBINode::opStar     ||
   326              n->fType == RBBINode::opQuestion ||
   327              n->fType == RBBINode::opPlus) {
   328         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
   329     }
   330 }
   334 //-----------------------------------------------------------------------------
   335 //
   336 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
   337 //
   338 //-----------------------------------------------------------------------------
   339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
   340     if (n == NULL ||
   341         n->fType == RBBINode::leafChar ||
   342         n->fType == RBBINode::endMark) {
   343         return;
   344     }
   346     calcFollowPos(n->fLeftChild);
   347     calcFollowPos(n->fRightChild);
   349     // Aho rule #1
   350     if (n->fType == RBBINode::opCat) {
   351         RBBINode *i;   // is 'i' in Aho's description
   352         uint32_t     ix;
   354         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
   356         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
   357             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
   358             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
   359         }
   360     }
   362     // Aho rule #2
   363     if (n->fType == RBBINode::opStar ||
   364         n->fType == RBBINode::opPlus) {
   365         RBBINode   *i;  // again, n and i are the names from Aho's description.
   366         uint32_t    ix;
   368         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
   369             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
   370             setAdd(i->fFollowPos, n->fFirstPosSet);
   371         }
   372     }
   376 }
   379 //-----------------------------------------------------------------------------
   380 //
   381 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
   382 //                            to implement rule chaining.  NOT described by Aho
   383 //
   384 //-----------------------------------------------------------------------------
   385 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
   387     UVector         endMarkerNodes(*fStatus);
   388     UVector         leafNodes(*fStatus);
   389     int32_t         i;
   391     if (U_FAILURE(*fStatus)) {
   392         return;
   393     }
   395     // get a list of all endmarker nodes.
   396     tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
   398     // get a list all leaf nodes
   399     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
   400     if (U_FAILURE(*fStatus)) {
   401         return;
   402     }
   404     // Get all nodes that can be the start a match, which is FirstPosition()
   405     // of the portion of the tree corresponding to user-written rules.
   406     // See the tree description in bofFixup().
   407     RBBINode *userRuleRoot = tree;
   408     if (fRB->fSetBuilder->sawBOF()) {
   409         userRuleRoot = tree->fLeftChild->fRightChild;
   410     }
   411     U_ASSERT(userRuleRoot != NULL);
   412     UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
   415     // Iteratate over all leaf nodes,
   416     //
   417     int32_t  endNodeIx;
   418     int32_t  startNodeIx;
   420     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
   421         RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
   422         RBBINode *endNode = NULL;
   424         // Identify leaf nodes that correspond to overall rule match positions.
   425         //   These include an endMarkerNode in their followPos sets.
   426         for (i=0; i<endMarkerNodes.size(); i++) {
   427             if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
   428                 endNode = tNode;
   429                 break;
   430             }
   431         }
   432         if (endNode == NULL) {
   433             // node wasn't an end node.  Try again with the next.
   434             continue;
   435         }
   437         // We've got a node that can end a match.
   439         // Line Break Specific hack:  If this node's val correspond to the $CM char class,
   440         //                            don't chain from it.
   441         // TODO:  Add rule syntax for this behavior, get specifics out of here and
   442         //        into the rule file.
   443         if (fRB->fLBCMNoChain) {
   444             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
   445             if (c != -1) {
   446                 // c == -1 occurs with sets containing only the {eof} marker string.
   447                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
   448                 if (cLBProp == U_LB_COMBINING_MARK) {
   449                     continue;
   450                 }
   451             }
   452         }
   455         // Now iterate over the nodes that can start a match, looking for ones
   456         //   with the same char class as our ending node.
   457         RBBINode *startNode;
   458         for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
   459             startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
   460             if (startNode->fType != RBBINode::leafChar) {
   461                 continue;
   462             }
   464             if (endNode->fVal == startNode->fVal) {
   465                 // The end val (character class) of one possible match is the
   466                 //   same as the start of another.
   468                 // Add all nodes from the followPos of the start node to the
   469                 //  followPos set of the end node, which will have the effect of
   470                 //  letting matches transition from a match state at endNode
   471                 //  to the second char of a match starting with startNode.
   472                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
   473             }
   474         }
   475     }
   476 }
   479 //-----------------------------------------------------------------------------
   480 //
   481 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
   482 //                Do an swizzle similar to chaining, modifying the followPos set of
   483 //                the bofNode to include the followPos nodes from other {bot} nodes
   484 //                scattered through the tree.
   485 //
   486 //                This function has much in common with calcChainedFollowPos().
   487 //
   488 //-----------------------------------------------------------------------------
   489 void RBBITableBuilder::bofFixup() {
   491     if (U_FAILURE(*fStatus)) {
   492         return;
   493     }
   495     //   The parse tree looks like this ...
   496     //         fTree root  --->       <cat>
   497     //                               /     \       .
   498     //                            <cat>   <#end node>
   499     //                           /     \  .
   500     //                     <bofNode>   rest
   501     //                               of tree
   502     //
   503     //    We will be adding things to the followPos set of the <bofNode>
   504     //
   505     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
   506     U_ASSERT(bofNode->fType == RBBINode::leafChar);
   507     U_ASSERT(bofNode->fVal == 2);
   509     // Get all nodes that can be the start a match of the user-written rules
   510     //  (excluding the fake bofNode)
   511     //  We want the nodes that can start a match in the
   512     //     part labeled "rest of tree"
   513     // 
   514     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
   516     RBBINode *startNode;
   517     int       startNodeIx;
   518     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
   519         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
   520         if (startNode->fType != RBBINode::leafChar) {
   521             continue;
   522         }
   524         if (startNode->fVal == bofNode->fVal) {
   525             //  We found a leaf node corresponding to a {bof} that was
   526             //    explicitly written into a rule.
   527             //  Add everything from the followPos set of this node to the
   528             //    followPos set of the fake bofNode at the start of the tree.
   529             //  
   530             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
   531         }
   532     }
   533 }
   535 //-----------------------------------------------------------------------------
   536 //
   537 //   buildStateTable()    Determine the set of runtime DFA states and the
   538 //                        transition tables for these states, by the algorithm
   539 //                        of fig. 3.44 in Aho.
   540 //
   541 //                        Most of the comments are quotes of Aho's psuedo-code.
   542 //
   543 //-----------------------------------------------------------------------------
   544 void RBBITableBuilder::buildStateTable() {
   545     if (U_FAILURE(*fStatus)) {
   546         return;
   547     }
   548     RBBIStateDescriptor *failState;
   549     // Set it to NULL to avoid uninitialized warning
   550     RBBIStateDescriptor *initialState = NULL; 
   551     //
   552     // Add a dummy state 0 - the stop state.  Not from Aho.
   553     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
   554     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
   555     if (failState == NULL) {
   556         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   557         goto ExitBuildSTdeleteall;
   558     }
   559     failState->fPositions = new UVector(*fStatus);
   560     if (failState->fPositions == NULL) {
   561         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   562     }
   563     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
   564         goto ExitBuildSTdeleteall;
   565     }
   566     fDStates->addElement(failState, *fStatus);
   567     if (U_FAILURE(*fStatus)) {
   568         goto ExitBuildSTdeleteall;
   569     }
   571     // initially, the only unmarked state in Dstates is firstpos(root),
   572     //       where toot is the root of the syntax tree for (r)#;
   573     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
   574     if (initialState == NULL) {
   575         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   576     }
   577     if (U_FAILURE(*fStatus)) {
   578         goto ExitBuildSTdeleteall;
   579     }
   580     initialState->fPositions = new UVector(*fStatus);
   581     if (initialState->fPositions == NULL) {
   582         *fStatus = U_MEMORY_ALLOCATION_ERROR;
   583     }
   584     if (U_FAILURE(*fStatus)) {
   585         goto ExitBuildSTdeleteall;
   586     }
   587     setAdd(initialState->fPositions, fTree->fFirstPosSet);
   588     fDStates->addElement(initialState, *fStatus);
   589     if (U_FAILURE(*fStatus)) {
   590         goto ExitBuildSTdeleteall;
   591     }
   593     // while there is an unmarked state T in Dstates do begin
   594     for (;;) {
   595         RBBIStateDescriptor *T = NULL;
   596         int32_t              tx;
   597         for (tx=1; tx<fDStates->size(); tx++) {
   598             RBBIStateDescriptor *temp;
   599             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
   600             if (temp->fMarked == FALSE) {
   601                 T = temp;
   602                 break;
   603             }
   604         }
   605         if (T == NULL) {
   606             break;
   607         }
   609         // mark T;
   610         T->fMarked = TRUE;
   612         // for each input symbol a do begin
   613         int32_t  a;
   614         for (a = 1; a<=lastInputSymbol; a++) {
   615             // let U be the set of positions that are in followpos(p)
   616             //    for some position p in T
   617             //    such that the symbol at position p is a;
   618             UVector    *U = NULL;
   619             RBBINode   *p;
   620             int32_t     px;
   621             for (px=0; px<T->fPositions->size(); px++) {
   622                 p = (RBBINode *)T->fPositions->elementAt(px);
   623                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
   624                     if (U == NULL) {
   625                         U = new UVector(*fStatus);
   626                         if (U == NULL) {
   627                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
   628                         	goto ExitBuildSTdeleteall;
   629                         }
   630                     }
   631                     setAdd(U, p->fFollowPos);
   632                 }
   633             }
   635             // if U is not empty and not in DStates then
   636             int32_t  ux = 0;
   637             UBool    UinDstates = FALSE;
   638             if (U != NULL) {
   639                 U_ASSERT(U->size() > 0);
   640                 int  ix;
   641                 for (ix=0; ix<fDStates->size(); ix++) {
   642                     RBBIStateDescriptor *temp2;
   643                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
   644                     if (setEquals(U, temp2->fPositions)) {
   645                         delete U;
   646                         U  = temp2->fPositions;
   647                         ux = ix;
   648                         UinDstates = TRUE;
   649                         break;
   650                     }
   651                 }
   653                 // Add U as an unmarked state to Dstates
   654                 if (!UinDstates)
   655                 {
   656                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
   657                     if (newState == NULL) {
   658                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
   659                     }
   660                     if (U_FAILURE(*fStatus)) {
   661                         goto ExitBuildSTdeleteall;
   662                     }
   663                     newState->fPositions = U;
   664                     fDStates->addElement(newState, *fStatus);
   665                     if (U_FAILURE(*fStatus)) {
   666                         return;
   667                     }
   668                     ux = fDStates->size()-1;
   669                 }
   671                 // Dtran[T, a] := U;
   672                 T->fDtran->setElementAt(ux, a);
   673             }
   674         }
   675     }
   676     return;
   677     // delete local pointers only if error occured.
   678 ExitBuildSTdeleteall:
   679     delete initialState;
   680     delete failState;
   681 }
   685 //-----------------------------------------------------------------------------
   686 //
   687 //   flagAcceptingStates    Identify accepting states.
   688 //                          First get a list of all of the end marker nodes.
   689 //                          Then, for each state s,
   690 //                              if s contains one of the end marker nodes in its list of tree positions then
   691 //                                  s is an accepting state.
   692 //
   693 //-----------------------------------------------------------------------------
   694 void     RBBITableBuilder::flagAcceptingStates() {
   695     if (U_FAILURE(*fStatus)) {
   696         return;
   697     }
   698     UVector     endMarkerNodes(*fStatus);
   699     RBBINode    *endMarker;
   700     int32_t     i;
   701     int32_t     n;
   703     if (U_FAILURE(*fStatus)) {
   704         return;
   705     }
   707     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
   708     if (U_FAILURE(*fStatus)) {
   709         return;
   710     }
   712     for (i=0; i<endMarkerNodes.size(); i++) {
   713         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
   714         for (n=0; n<fDStates->size(); n++) {
   715             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   716             if (sd->fPositions->indexOf(endMarker) >= 0) {
   717                 // Any non-zero value for fAccepting means this is an accepting node.
   718                 // The value is what will be returned to the user as the break status.
   719                 // If no other value was specified, force it to -1.
   721                 if (sd->fAccepting==0) {
   722                     // State hasn't been marked as accepting yet.  Do it now.
   723                     sd->fAccepting = endMarker->fVal;
   724                     if (sd->fAccepting == 0) {
   725                         sd->fAccepting = -1;
   726                     }
   727                 }
   728                 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
   729                     // Both lookahead and non-lookahead accepting for this state.
   730                     // Favor the look-ahead.  Expedient for line break.
   731                     // TODO:  need a more elegant resolution for conflicting rules.
   732                     sd->fAccepting = endMarker->fVal;
   733                 }
   734                 // implicit else:
   735                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
   737                 // If the end marker node is from a look-ahead rule, set
   738                 //   the fLookAhead field or this state also.
   739                 if (endMarker->fLookAheadEnd) {
   740                     // TODO:  don't change value if already set?
   741                     // TODO:  allow for more than one active look-ahead rule in engine.
   742                     //        Make value here an index to a side array in engine?
   743                     sd->fLookAhead = sd->fAccepting;
   744                 }
   745             }
   746         }
   747     }
   748 }
   751 //-----------------------------------------------------------------------------
   752 //
   753 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
   754 //
   755 //-----------------------------------------------------------------------------
   756 void     RBBITableBuilder::flagLookAheadStates() {
   757     if (U_FAILURE(*fStatus)) {
   758         return;
   759     }
   760     UVector     lookAheadNodes(*fStatus);
   761     RBBINode    *lookAheadNode;
   762     int32_t     i;
   763     int32_t     n;
   765     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
   766     if (U_FAILURE(*fStatus)) {
   767         return;
   768     }
   769     for (i=0; i<lookAheadNodes.size(); i++) {
   770         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
   772         for (n=0; n<fDStates->size(); n++) {
   773             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   774             if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
   775                 sd->fLookAhead = lookAheadNode->fVal;
   776             }
   777         }
   778     }
   779 }
   784 //-----------------------------------------------------------------------------
   785 //
   786 //    flagTaggedStates
   787 //
   788 //-----------------------------------------------------------------------------
   789 void     RBBITableBuilder::flagTaggedStates() {
   790     if (U_FAILURE(*fStatus)) {
   791         return;
   792     }
   793     UVector     tagNodes(*fStatus);
   794     RBBINode    *tagNode;
   795     int32_t     i;
   796     int32_t     n;
   798     if (U_FAILURE(*fStatus)) {
   799         return;
   800     }
   801     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
   802     if (U_FAILURE(*fStatus)) {
   803         return;
   804     }
   805     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
   806         tagNode = (RBBINode *)tagNodes.elementAt(i);
   808         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
   809             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   810             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
   811                 sortedAdd(&sd->fTagVals, tagNode->fVal);
   812             }
   813         }
   814     }
   815 }
   820 //-----------------------------------------------------------------------------
   821 //
   822 //  mergeRuleStatusVals
   823 //
   824 //      Update the global table of rule status {tag} values
   825 //      The rule builder has a global vector of status values that are common
   826 //      for all tables.  Merge the ones from this table into the global set.
   827 //
   828 //-----------------------------------------------------------------------------
   829 void  RBBITableBuilder::mergeRuleStatusVals() {
   830     //
   831     //  The basic outline of what happens here is this...
   832     //
   833     //    for each state in this state table
   834     //       if the status tag list for this state is in the global statuses list
   835     //           record where and
   836     //           continue with the next state
   837     //       else
   838     //           add the tag list for this state to the global list.
   839     //
   840     int i;
   841     int n;
   843     // Pre-set a single tag of {0} into the table.
   844     //   We will need this as a default, for rule sets with no explicit tagging.
   845     if (fRB->fRuleStatusVals->size() == 0) {
   846         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
   847         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
   848     }
   850     //    For each state
   851     for (n=0; n<fDStates->size(); n++) {
   852         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
   853         UVector *thisStatesTagValues = sd->fTagVals;
   854         if (thisStatesTagValues == NULL) {
   855             // No tag values are explicitly associated with this state.
   856             //   Set the default tag value.
   857             sd->fTagsIdx = 0;
   858             continue;
   859         }
   861         // There are tag(s) associated with this state.
   862         //   fTagsIdx will be the index into the global tag list for this state's tag values.
   863         //   Initial value of -1 flags that we haven't got it set yet.
   864         sd->fTagsIdx = -1;
   865         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
   866         int32_t  nextTagGroupStart = 0;
   868         // Loop runs once per group of tags in the global list
   869         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
   870             thisTagGroupStart = nextTagGroupStart;
   871             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
   872             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
   873                 // The number of tags for this state is different from
   874                 //    the number of tags in this group from the global list.
   875                 //    Continue with the next group from the global list.
   876                 continue;
   877             }
   878             // The lengths match, go ahead and compare the actual tag values
   879             //    between this state and the group from the global list.
   880             for (i=0; i<thisStatesTagValues->size(); i++) {
   881                 if (thisStatesTagValues->elementAti(i) !=
   882                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
   883                     // Mismatch.
   884                     break;
   885                 }
   886             }
   888             if (i == thisStatesTagValues->size()) {
   889                 // We found a set of tag values in the global list that match
   890                 //   those for this state.  Use them.
   891                 sd->fTagsIdx = thisTagGroupStart;
   892                 break;
   893             }
   894         }
   896         if (sd->fTagsIdx == -1) {
   897             // No suitable entry in the global tag list already.  Add one
   898             sd->fTagsIdx = fRB->fRuleStatusVals->size();
   899             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
   900             for (i=0; i<thisStatesTagValues->size(); i++) {
   901                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
   902             }
   903         }
   904     }
   905 }
   913 //-----------------------------------------------------------------------------
   914 //
   915 //  sortedAdd  Add a value to a vector of sorted values (ints).
   916 //             Do not replicate entries; if the value is already there, do not
   917 //                add a second one.
   918 //             Lazily create the vector if it does not already exist.
   919 //
   920 //-----------------------------------------------------------------------------
   921 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
   922     int32_t i;
   924     if (*vector == NULL) {
   925         *vector = new UVector(*fStatus);
   926     }
   927     if (*vector == NULL || U_FAILURE(*fStatus)) {
   928         return;
   929     }
   930     UVector *vec = *vector;
   931     int32_t  vSize = vec->size();
   932     for (i=0; i<vSize; i++) {
   933         int32_t valAtI = vec->elementAti(i);
   934         if (valAtI == val) {
   935             // The value is already in the vector.  Don't add it again.
   936             return;
   937         }
   938         if (valAtI > val) {
   939             break;
   940         }
   941     }
   942     vec->insertElementAt(val, i, *fStatus);
   943 }
   947 //-----------------------------------------------------------------------------
   948 //
   949 //  setAdd     Set operation on UVector
   950 //             dest = dest union source
   951 //             Elements may only appear once and must be sorted.
   952 //
   953 //-----------------------------------------------------------------------------
   954 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
   955     int32_t destOriginalSize = dest->size();
   956     int32_t sourceSize       = source->size();
   957     int32_t di           = 0;
   958     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
   959     void **destPtr, **sourcePtr;
   960     void **destLim, **sourceLim;
   962     if (destOriginalSize > destArray.getCapacity()) {
   963         if (destArray.resize(destOriginalSize) == NULL) {
   964             return;
   965         }
   966     }
   967     destPtr = destArray.getAlias();
   968     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
   970     if (sourceSize > sourceArray.getCapacity()) {
   971         if (sourceArray.resize(sourceSize) == NULL) {
   972             return;
   973         }
   974     }
   975     sourcePtr = sourceArray.getAlias();
   976     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
   978     // Avoid multiple "get element" calls by getting the contents into arrays
   979     (void) dest->toArray(destPtr);
   980     (void) source->toArray(sourcePtr);
   982     dest->setSize(sourceSize+destOriginalSize, *fStatus);
   984     while (sourcePtr < sourceLim && destPtr < destLim) {
   985         if (*destPtr == *sourcePtr) {
   986             dest->setElementAt(*sourcePtr++, di++);
   987             destPtr++;
   988         }
   989         // This check is required for machines with segmented memory, like i5/OS.
   990         // Direct pointer comparison is not recommended.
   991         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
   992             dest->setElementAt(*destPtr++, di++);
   993         }
   994         else { /* *sourcePtr < *destPtr */
   995             dest->setElementAt(*sourcePtr++, di++);
   996         }
   997     }
   999     // At most one of these two cleanup loops will execute
  1000     while (destPtr < destLim) {
  1001         dest->setElementAt(*destPtr++, di++);
  1003     while (sourcePtr < sourceLim) {
  1004         dest->setElementAt(*sourcePtr++, di++);
  1007     dest->setSize(di, *fStatus);
  1012 //-----------------------------------------------------------------------------
  1013 //
  1014 //  setEqual    Set operation on UVector.
  1015 //              Compare for equality.
  1016 //              Elements must be sorted.
  1017 //
  1018 //-----------------------------------------------------------------------------
  1019 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
  1020     return a->equals(*b);
  1024 //-----------------------------------------------------------------------------
  1025 //
  1026 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
  1027 //                 for each node in the tree.
  1028 //
  1029 //-----------------------------------------------------------------------------
  1030 #ifdef RBBI_DEBUG
  1031 void RBBITableBuilder::printPosSets(RBBINode *n) {
  1032     if (n==NULL) {
  1033         return;
  1035     n->printNode();
  1036     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
  1038     RBBIDebugPrintf("         firstpos:  ");
  1039     printSet(n->fFirstPosSet);
  1041     RBBIDebugPrintf("         lastpos:   ");
  1042     printSet(n->fLastPosSet);
  1044     RBBIDebugPrintf("         followpos: ");
  1045     printSet(n->fFollowPos);
  1047     printPosSets(n->fLeftChild);
  1048     printPosSets(n->fRightChild);
  1050 #endif
  1054 //-----------------------------------------------------------------------------
  1055 //
  1056 //   getTableSize()    Calculate the size of the runtime form of this
  1057 //                     state transition table.
  1058 //
  1059 //-----------------------------------------------------------------------------
  1060 int32_t  RBBITableBuilder::getTableSize() const {
  1061     int32_t    size = 0;
  1062     int32_t    numRows;
  1063     int32_t    numCols;
  1064     int32_t    rowSize;
  1066     if (fTree == NULL) {
  1067         return 0;
  1070     size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
  1072     numRows = fDStates->size();
  1073     numCols = fRB->fSetBuilder->getNumCharCategories();
  1075     //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
  1076     //        Therefore we subtract two from numCols when determining
  1077     //        how much storage to add to a row for the total columns.
  1078     rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
  1079     size   += numRows * rowSize;
  1080     return size;
  1085 //-----------------------------------------------------------------------------
  1086 //
  1087 //   exportTable()    export the state transition table in the format required
  1088 //                    by the runtime engine.  getTableSize() bytes of memory
  1089 //                    must be available at the output address "where".
  1090 //
  1091 //-----------------------------------------------------------------------------
  1092 void RBBITableBuilder::exportTable(void *where) {
  1093     RBBIStateTable    *table = (RBBIStateTable *)where;
  1094     uint32_t           state;
  1095     int                col;
  1097     if (U_FAILURE(*fStatus) || fTree == NULL) {
  1098         return;
  1101     if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
  1102         fDStates->size() > 0x7fff) {
  1103         *fStatus = U_BRK_INTERNAL_ERROR;
  1104         return;
  1107     table->fRowLen    = sizeof(RBBIStateTableRow) +
  1108                             sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
  1109     table->fNumStates = fDStates->size();
  1110     table->fFlags     = 0;
  1111     if (fRB->fLookAheadHardBreak) {
  1112         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
  1114     if (fRB->fSetBuilder->sawBOF()) {
  1115         table->fFlags  |= RBBI_BOF_REQUIRED;
  1117     table->fReserved  = 0;
  1119     for (state=0; state<table->fNumStates; state++) {
  1120         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
  1121         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
  1122         U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
  1123         U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
  1124         row->fAccepting = (int16_t)sd->fAccepting;
  1125         row->fLookAhead = (int16_t)sd->fLookAhead;
  1126         row->fTagIdx    = (int16_t)sd->fTagsIdx;
  1127         for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
  1128             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
  1135 //-----------------------------------------------------------------------------
  1136 //
  1137 //   printSet    Debug function.   Print the contents of a UVector
  1138 //
  1139 //-----------------------------------------------------------------------------
  1140 #ifdef RBBI_DEBUG
  1141 void RBBITableBuilder::printSet(UVector *s) {
  1142     int32_t  i;
  1143     for (i=0; i<s->size(); i++) {
  1144         void *v = s->elementAt(i);
  1145         RBBIDebugPrintf("%10p", v);
  1147     RBBIDebugPrintf("\n");
  1149 #endif
  1152 //-----------------------------------------------------------------------------
  1153 //
  1154 //   printStates    Debug Function.  Dump the fully constructed state transition table.
  1155 //
  1156 //-----------------------------------------------------------------------------
  1157 #ifdef RBBI_DEBUG
  1158 void RBBITableBuilder::printStates() {
  1159     int     c;    // input "character"
  1160     int     n;    // state number
  1162     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
  1163     RBBIDebugPrintf("      | Acc  LA    Tag");
  1164     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
  1165         RBBIDebugPrintf(" %2d", c);
  1167     RBBIDebugPrintf("\n");
  1168     RBBIDebugPrintf("      |---------------");
  1169     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
  1170         RBBIDebugPrintf("---");
  1172     RBBIDebugPrintf("\n");
  1174     for (n=0; n<fDStates->size(); n++) {
  1175         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
  1176         RBBIDebugPrintf("  %3d | " , n);
  1177         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
  1178         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
  1179             RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
  1181         RBBIDebugPrintf("\n");
  1183     RBBIDebugPrintf("\n\n");
  1185 #endif
  1189 //-----------------------------------------------------------------------------
  1190 //
  1191 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
  1192 //
  1193 //-----------------------------------------------------------------------------
  1194 #ifdef RBBI_DEBUG
  1195 void RBBITableBuilder::printRuleStatusTable() {
  1196     int32_t  thisRecord = 0;
  1197     int32_t  nextRecord = 0;
  1198     int      i;
  1199     UVector  *tbl = fRB->fRuleStatusVals;
  1201     RBBIDebugPrintf("index |  tags \n");
  1202     RBBIDebugPrintf("-------------------\n");
  1204     while (nextRecord < tbl->size()) {
  1205         thisRecord = nextRecord;
  1206         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
  1207         RBBIDebugPrintf("%4d   ", thisRecord);
  1208         for (i=thisRecord+1; i<nextRecord; i++) {
  1209             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
  1211         RBBIDebugPrintf("\n");
  1213     RBBIDebugPrintf("\n\n");
  1215 #endif
  1218 //-----------------------------------------------------------------------------
  1219 //
  1220 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
  1221 //                           Most access is directly to the fields.
  1222 //
  1223 //-----------------------------------------------------------------------------
  1225 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
  1226     fMarked    = FALSE;
  1227     fAccepting = 0;
  1228     fLookAhead = 0;
  1229     fTagsIdx   = 0;
  1230     fTagVals   = NULL;
  1231     fPositions = NULL;
  1232     fDtran     = NULL;
  1234     fDtran     = new UVector(lastInputSymbol+1, *fStatus);
  1235     if (U_FAILURE(*fStatus)) {
  1236         return;
  1238     if (fDtran == NULL) {
  1239         *fStatus = U_MEMORY_ALLOCATION_ERROR;
  1240         return;
  1242     fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
  1243                                            //   It is indexed by input symbols, and will
  1244                                            //   hold  the next state number for each
  1245                                            //   symbol.
  1249 RBBIStateDescriptor::~RBBIStateDescriptor() {
  1250     delete       fPositions;
  1251     delete       fDtran;
  1252     delete       fTagVals;
  1253     fPositions = NULL;
  1254     fDtran     = NULL;
  1255     fTagVals   = NULL;
  1258 U_NAMESPACE_END
  1260 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial