intl/icu/source/common/rbbisetb.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbisetb.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,695 @@
     1.4 +//
     1.5 +//  rbbisetb.cpp
     1.6 +//
     1.7 +/*
     1.8 +***************************************************************************
     1.9 +*   Copyright (C) 2002-2008 International Business Machines Corporation   *
    1.10 +*   and others. All rights reserved.                                      *
    1.11 +***************************************************************************
    1.12 +*/
    1.13 +//
    1.14 +//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
    1.15 +//                   (part of the rule building process.)
    1.16 +//
    1.17 +//      Starting with the rules parse tree from the scanner,
    1.18 +//
    1.19 +//                   -  Enumerate the set of UnicodeSets that are referenced
    1.20 +//                      by the RBBI rules.
    1.21 +//                   -  compute a set of non-overlapping character ranges
    1.22 +//                      with all characters within a range belonging to the same
    1.23 +//                      set of input unicode sets.
    1.24 +//                   -  Derive a set of non-overlapping UnicodeSet (like things)
    1.25 +//                      that will correspond to columns in the state table for
    1.26 +//                      the RBBI execution engine.  All characters within one
    1.27 +//                      of these sets belong to the same set of the original
    1.28 +//                      UnicodeSets from the user's rules.
    1.29 +//                   -  construct the trie table that maps input characters
    1.30 +//                      to the index of the matching non-overlapping set of sets from
    1.31 +//                      the previous step.
    1.32 +//
    1.33 +
    1.34 +#include "unicode/utypes.h"
    1.35 +
    1.36 +#if !UCONFIG_NO_BREAK_ITERATION
    1.37 +
    1.38 +#include "unicode/uniset.h"
    1.39 +#include "utrie.h"
    1.40 +#include "uvector.h"
    1.41 +#include "uassert.h"
    1.42 +#include "cmemory.h"
    1.43 +#include "cstring.h"
    1.44 +
    1.45 +#include "rbbisetb.h"
    1.46 +#include "rbbinode.h"
    1.47 +
    1.48 +
    1.49 +//------------------------------------------------------------------------
    1.50 +//
    1.51 +//   getFoldedRBBIValue        Call-back function used during building of Trie table.
    1.52 +//                             Folding value: just store the offset (16 bits)
    1.53 +//                             if there is any non-0 entry.
    1.54 +//                             (It'd really be nice if the Trie builder would provide a
    1.55 +//                             simple default, so this function could go away from here.)
    1.56 +//
    1.57 +//------------------------------------------------------------------------
    1.58 +/* folding value: just store the offset (16 bits) if there is any non-0 entry */
    1.59 +U_CDECL_BEGIN
    1.60 +static uint32_t U_CALLCONV
    1.61 +getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    1.62 +    uint32_t value;
    1.63 +    UChar32 limit;
    1.64 +    UBool inBlockZero;
    1.65 +
    1.66 +    limit=start+0x400;
    1.67 +    while(start<limit) {
    1.68 +        value=utrie_get32(trie, start, &inBlockZero);
    1.69 +        if(inBlockZero) {
    1.70 +            start+=UTRIE_DATA_BLOCK_LENGTH;
    1.71 +        } else if(value!=0) {
    1.72 +            return (uint32_t)(offset|0x8000);
    1.73 +        } else {
    1.74 +            ++start;
    1.75 +        }
    1.76 +    }
    1.77 +    return 0;
    1.78 +}
    1.79 +
    1.80 +
    1.81 +U_CDECL_END
    1.82 +
    1.83 +
    1.84 +
    1.85 +U_NAMESPACE_BEGIN
    1.86 +
    1.87 +//------------------------------------------------------------------------
    1.88 +//
    1.89 +//   Constructor
    1.90 +//
    1.91 +//------------------------------------------------------------------------
    1.92 +RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
    1.93 +{
    1.94 +    fRB             = rb;
    1.95 +    fStatus         = rb->fStatus;
    1.96 +    fRangeList      = 0;
    1.97 +    fTrie           = 0;
    1.98 +    fTrieSize       = 0;
    1.99 +    fGroupCount     = 0;
   1.100 +    fSawBOF         = FALSE;
   1.101 +}
   1.102 +
   1.103 +
   1.104 +//------------------------------------------------------------------------
   1.105 +//
   1.106 +//   Destructor
   1.107 +//
   1.108 +//------------------------------------------------------------------------
   1.109 +RBBISetBuilder::~RBBISetBuilder()
   1.110 +{
   1.111 +    RangeDescriptor   *nextRangeDesc;
   1.112 +
   1.113 +    // Walk through & delete the linked list of RangeDescriptors
   1.114 +    for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
   1.115 +        RangeDescriptor *r = nextRangeDesc;
   1.116 +        nextRangeDesc      = r->fNext;
   1.117 +        delete r;
   1.118 +    }
   1.119 +
   1.120 +    utrie_close(fTrie);
   1.121 +}
   1.122 +
   1.123 +
   1.124 +
   1.125 +
   1.126 +//------------------------------------------------------------------------
   1.127 +//
   1.128 +//   build          Build the list of non-overlapping character ranges
   1.129 +//                  from the Unicode Sets.
   1.130 +//
   1.131 +//------------------------------------------------------------------------
   1.132 +void RBBISetBuilder::build() {
   1.133 +    RBBINode        *usetNode;
   1.134 +    RangeDescriptor *rlRange;
   1.135 +
   1.136 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
   1.137 +
   1.138 +    //
   1.139 +    //  Initialize the process by creating a single range encompassing all characters
   1.140 +    //  that is in no sets.
   1.141 +    //
   1.142 +    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
   1.143 +    if (fRangeList == NULL) {
   1.144 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.145 +        return;
   1.146 +    }
   1.147 +    fRangeList->fStartChar    = 0;
   1.148 +    fRangeList->fEndChar      = 0x10ffff;
   1.149 +
   1.150 +    if (U_FAILURE(*fStatus)) {
   1.151 +        return;
   1.152 +    }
   1.153 +
   1.154 +    //
   1.155 +    //  Find the set of non-overlapping ranges of characters
   1.156 +    //
   1.157 +    int  ni;
   1.158 +    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
   1.159 +        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
   1.160 +        if (usetNode==NULL) {
   1.161 +            break;
   1.162 +        }
   1.163 +
   1.164 +        UnicodeSet      *inputSet             = usetNode->fInputSet;
   1.165 +        int32_t          inputSetRangeCount   = inputSet->getRangeCount();
   1.166 +        int              inputSetRangeIndex   = 0;
   1.167 +                         rlRange              = fRangeList;
   1.168 +
   1.169 +        for (;;) {
   1.170 +            if (inputSetRangeIndex >= inputSetRangeCount) {
   1.171 +                break;
   1.172 +            }
   1.173 +            UChar32      inputSetRangeBegin  = inputSet->getRangeStart(inputSetRangeIndex);
   1.174 +            UChar32      inputSetRangeEnd    = inputSet->getRangeEnd(inputSetRangeIndex);
   1.175 +
   1.176 +            // skip over ranges from the range list that are completely
   1.177 +            //   below the current range from the input unicode set.
   1.178 +            while (rlRange->fEndChar < inputSetRangeBegin) {
   1.179 +                rlRange = rlRange->fNext;
   1.180 +            }
   1.181 +
   1.182 +            // If the start of the range from the range list is before with
   1.183 +            //   the start of the range from the unicode set, split the range list range
   1.184 +            //   in two, with one part being before (wholly outside of) the unicode set
   1.185 +            //   and the other containing the rest.
   1.186 +            //   Then continue the loop; the post-split current range will then be skipped
   1.187 +            //     over
   1.188 +            if (rlRange->fStartChar < inputSetRangeBegin) {
   1.189 +                rlRange->split(inputSetRangeBegin, *fStatus);
   1.190 +                if (U_FAILURE(*fStatus)) {
   1.191 +                    return;
   1.192 +                }
   1.193 +                continue;
   1.194 +            }
   1.195 +
   1.196 +            // Same thing at the end of the ranges...
   1.197 +            // If the end of the range from the range list doesn't coincide with
   1.198 +            //   the end of the range from the unicode set, split the range list
   1.199 +            //   range in two.  The first part of the split range will be
   1.200 +            //   wholly inside the Unicode set.
   1.201 +            if (rlRange->fEndChar > inputSetRangeEnd) {
   1.202 +                rlRange->split(inputSetRangeEnd+1, *fStatus);
   1.203 +                if (U_FAILURE(*fStatus)) {
   1.204 +                    return;
   1.205 +                }
   1.206 +            }
   1.207 +
   1.208 +            // The current rlRange is now entirely within the UnicodeSet range.
   1.209 +            // Add this unicode set to the list of sets for this rlRange
   1.210 +            if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
   1.211 +                rlRange->fIncludesSets->addElement(usetNode, *fStatus);
   1.212 +                if (U_FAILURE(*fStatus)) {
   1.213 +                    return;
   1.214 +                }
   1.215 +            }
   1.216 +
   1.217 +            // Advance over ranges that we are finished with.
   1.218 +            if (inputSetRangeEnd == rlRange->fEndChar) {
   1.219 +                inputSetRangeIndex++;
   1.220 +            }
   1.221 +            rlRange = rlRange->fNext;
   1.222 +        }
   1.223 +    }
   1.224 +
   1.225 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
   1.226 +
   1.227 +    //
   1.228 +    //  Group the above ranges, with each group consisting of one or more
   1.229 +    //    ranges that are in exactly the same set of original UnicodeSets.
   1.230 +    //    The groups are numbered, and these group numbers are the set of
   1.231 +    //    input symbols recognized by the run-time state machine.
   1.232 +    //
   1.233 +    //    Numbering: # 0  (state table column 0) is unused.
   1.234 +    //               # 1  is reserved - table column 1 is for end-of-input
   1.235 +    //               # 2  is reserved - table column 2 is for beginning-in-input
   1.236 +    //               # 3  is the first range list.
   1.237 +    //
   1.238 +    RangeDescriptor *rlSearchRange;
   1.239 +    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
   1.240 +        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
   1.241 +            if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
   1.242 +                rlRange->fNum = rlSearchRange->fNum;
   1.243 +                break;
   1.244 +            }
   1.245 +        }
   1.246 +        if (rlRange->fNum == 0) {
   1.247 +            fGroupCount ++;
   1.248 +            rlRange->fNum = fGroupCount+2; 
   1.249 +            rlRange->setDictionaryFlag();
   1.250 +            addValToSets(rlRange->fIncludesSets, fGroupCount+2);
   1.251 +        }
   1.252 +    }
   1.253 +
   1.254 +    // Handle input sets that contain the special string {eof}.
   1.255 +    //   Column 1 of the state table is reserved for EOF on input.
   1.256 +    //   Column 2 is reserved for before-the-start-input.
   1.257 +    //            (This column can be optimized away later if there are no rule
   1.258 +    //             references to {bof}.)
   1.259 +    //   Add this column value (1 or 2) to the equivalent expression
   1.260 +    //     subtree for each UnicodeSet that contains the string {eof}
   1.261 +    //   Because {bof} and {eof} are not a characters in the normal sense,
   1.262 +    //   they doesn't affect the computation of ranges or TRIE.
   1.263 +    static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
   1.264 +    static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
   1.265 +
   1.266 +    UnicodeString eofString(eofUString);
   1.267 +    UnicodeString bofString(bofUString);
   1.268 +    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
   1.269 +        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
   1.270 +        if (usetNode==NULL) {
   1.271 +            break;
   1.272 +        }
   1.273 +        UnicodeSet      *inputSet = usetNode->fInputSet;
   1.274 +        if (inputSet->contains(eofString)) {
   1.275 +            addValToSet(usetNode, 1);
   1.276 +        }
   1.277 +        if (inputSet->contains(bofString)) {
   1.278 +            addValToSet(usetNode, 2);
   1.279 +            fSawBOF = TRUE;
   1.280 +        }
   1.281 +    }
   1.282 +
   1.283 +
   1.284 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
   1.285 +    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
   1.286 +
   1.287 +    //
   1.288 +    // Build the Trie table for mapping UChar32 values to the corresponding
   1.289 +    //   range group number
   1.290 +    //
   1.291 +    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
   1.292 +                      NULL,    //  Data array  (utrie will allocate one)
   1.293 +                      100000,  //  Max Data Length
   1.294 +                      0,       //  Initial value for all code points
   1.295 +                      0,       //  Lead surrogate unit value
   1.296 +                      TRUE);   //  Keep Latin 1 in separately
   1.297 +
   1.298 +
   1.299 +    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
   1.300 +        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
   1.301 +    }
   1.302 +}
   1.303 +
   1.304 +
   1.305 +
   1.306 +//-----------------------------------------------------------------------------------
   1.307 +//
   1.308 +//  getTrieSize()    Return the size that will be required to serialize the Trie.
   1.309 +//
   1.310 +//-----------------------------------------------------------------------------------
   1.311 +int32_t RBBISetBuilder::getTrieSize() /*const*/ {
   1.312 +    fTrieSize  = utrie_serialize(fTrie,
   1.313 +                                    NULL,                // Buffer
   1.314 +                                    0,                   // Capacity
   1.315 +                                    getFoldedRBBIValue,
   1.316 +                                    TRUE,                // Reduce to 16 bits
   1.317 +                                    fStatus);
   1.318 +    // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
   1.319 +    return fTrieSize;
   1.320 +}
   1.321 +
   1.322 +
   1.323 +//-----------------------------------------------------------------------------------
   1.324 +//
   1.325 +//  serializeTrie()   Put the serialized trie at the specified address.
   1.326 +//                    Trust the caller to have given us enough memory.
   1.327 +//                    getTrieSize() MUST be called first.
   1.328 +//
   1.329 +//-----------------------------------------------------------------------------------
   1.330 +void RBBISetBuilder::serializeTrie(uint8_t *where) {
   1.331 +    utrie_serialize(fTrie,
   1.332 +                    where,                   // Buffer
   1.333 +                    fTrieSize,               // Capacity
   1.334 +                    getFoldedRBBIValue,
   1.335 +                    TRUE,                    // Reduce to 16 bits
   1.336 +                    fStatus);
   1.337 +}
   1.338 +
   1.339 +//------------------------------------------------------------------------
   1.340 +//
   1.341 +//  addValToSets     Add a runtime-mapped input value to each uset from a
   1.342 +//                   list of uset nodes. (val corresponds to a state table column.)
   1.343 +//                   For each of the original Unicode sets - which correspond
   1.344 +//                   directly to uset nodes - a logically equivalent expression
   1.345 +//                   is constructed in terms of the remapped runtime input
   1.346 +//                   symbol set.  This function adds one runtime input symbol to
   1.347 +//                   a list of sets.
   1.348 +//
   1.349 +//                   The "logically equivalent expression" is the tree for an
   1.350 +//                   or-ing together of all of the symbols that go into the set.
   1.351 +//
   1.352 +//------------------------------------------------------------------------
   1.353 +void  RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
   1.354 +    int32_t       ix;
   1.355 +
   1.356 +    for (ix=0; ix<sets->size(); ix++) {
   1.357 +        RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
   1.358 +        addValToSet(usetNode, val);
   1.359 +    }
   1.360 +}
   1.361 +
   1.362 +void  RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
   1.363 +    RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
   1.364 +    if (leafNode == NULL) {
   1.365 +        *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.366 +        return;
   1.367 +    }
   1.368 +    leafNode->fVal = (unsigned short)val;
   1.369 +    if (usetNode->fLeftChild == NULL) {
   1.370 +        usetNode->fLeftChild = leafNode;
   1.371 +        leafNode->fParent    = usetNode;
   1.372 +    } else {
   1.373 +        // There are already input symbols present for this set.
   1.374 +        // Set up an OR node, with the previous stuff as the left child
   1.375 +        //   and the new value as the right child.
   1.376 +        RBBINode *orNode = new RBBINode(RBBINode::opOr);
   1.377 +        if (orNode == NULL) {
   1.378 +            *fStatus = U_MEMORY_ALLOCATION_ERROR;
   1.379 +            return;
   1.380 +        }
   1.381 +        orNode->fLeftChild  = usetNode->fLeftChild;
   1.382 +        orNode->fRightChild = leafNode;
   1.383 +        orNode->fLeftChild->fParent  = orNode;
   1.384 +        orNode->fRightChild->fParent = orNode;
   1.385 +        usetNode->fLeftChild = orNode;
   1.386 +        orNode->fParent = usetNode;
   1.387 +    }
   1.388 +}
   1.389 +
   1.390 +
   1.391 +//------------------------------------------------------------------------
   1.392 +//
   1.393 +//   getNumCharCategories
   1.394 +//
   1.395 +//------------------------------------------------------------------------
   1.396 +int32_t  RBBISetBuilder::getNumCharCategories() const {
   1.397 +    return fGroupCount + 3;
   1.398 +}
   1.399 +
   1.400 +
   1.401 +//------------------------------------------------------------------------
   1.402 +//
   1.403 +//   sawBOF
   1.404 +//
   1.405 +//------------------------------------------------------------------------
   1.406 +UBool  RBBISetBuilder::sawBOF() const {
   1.407 +    return fSawBOF;
   1.408 +}
   1.409 +
   1.410 +
   1.411 +//------------------------------------------------------------------------
   1.412 +//
   1.413 +//   getFirstChar      Given a runtime RBBI character category, find
   1.414 +//                     the first UChar32 that is in the set of chars 
   1.415 +//                     in the category.
   1.416 +//------------------------------------------------------------------------
   1.417 +UChar32  RBBISetBuilder::getFirstChar(int32_t category) const {
   1.418 +    RangeDescriptor   *rlRange;
   1.419 +    UChar32            retVal = (UChar32)-1;
   1.420 +    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
   1.421 +        if (rlRange->fNum == category) {
   1.422 +            retVal = rlRange->fStartChar;
   1.423 +            break;
   1.424 +        }
   1.425 +    }
   1.426 +    return retVal;
   1.427 +}
   1.428 +
   1.429 +
   1.430 +
   1.431 +//------------------------------------------------------------------------
   1.432 +//
   1.433 +//   printRanges        A debugging function.
   1.434 +//                      dump out all of the range definitions.
   1.435 +//
   1.436 +//------------------------------------------------------------------------
   1.437 +#ifdef RBBI_DEBUG
   1.438 +void RBBISetBuilder::printRanges() {
   1.439 +    RangeDescriptor       *rlRange;
   1.440 +    int                    i;
   1.441 +
   1.442 +    RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
   1.443 +    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
   1.444 +        RBBIDebugPrintf("%2i  %4x-%4x  ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
   1.445 +
   1.446 +        for (i=0; i<rlRange->fIncludesSets->size(); i++) {
   1.447 +            RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
   1.448 +            UnicodeString   setName = UNICODE_STRING("anon", 4);
   1.449 +            RBBINode       *setRef = usetNode->fParent;
   1.450 +            if (setRef != NULL) {
   1.451 +                RBBINode *varRef = setRef->fParent;
   1.452 +                if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
   1.453 +                    setName = varRef->fText;
   1.454 +                }
   1.455 +            }
   1.456 +            RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf("  ");
   1.457 +        }
   1.458 +        RBBIDebugPrintf("\n");
   1.459 +    }
   1.460 +}
   1.461 +#endif
   1.462 +
   1.463 +
   1.464 +//------------------------------------------------------------------------
   1.465 +//
   1.466 +//   printRangeGroups     A debugging function.
   1.467 +//                        dump out all of the range groups.
   1.468 +//
   1.469 +//------------------------------------------------------------------------
   1.470 +#ifdef RBBI_DEBUG
   1.471 +void RBBISetBuilder::printRangeGroups() {
   1.472 +    RangeDescriptor       *rlRange;
   1.473 +    RangeDescriptor       *tRange;
   1.474 +    int                    i;
   1.475 +    int                    lastPrintedGroupNum = 0;
   1.476 +
   1.477 +    RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
   1.478 +    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
   1.479 +        int groupNum = rlRange->fNum & 0xbfff;
   1.480 +        if (groupNum > lastPrintedGroupNum) {
   1.481 +            lastPrintedGroupNum = groupNum;
   1.482 +            RBBIDebugPrintf("%2i  ", groupNum);
   1.483 +
   1.484 +            if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
   1.485 +
   1.486 +            for (i=0; i<rlRange->fIncludesSets->size(); i++) {
   1.487 +                RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
   1.488 +                UnicodeString   setName = UNICODE_STRING("anon", 4);
   1.489 +                RBBINode       *setRef = usetNode->fParent;
   1.490 +                if (setRef != NULL) {
   1.491 +                    RBBINode *varRef = setRef->fParent;
   1.492 +                    if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
   1.493 +                        setName = varRef->fText;
   1.494 +                    }
   1.495 +                }
   1.496 +                RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
   1.497 +            }
   1.498 +
   1.499 +            i = 0;
   1.500 +            for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
   1.501 +                if (tRange->fNum == rlRange->fNum) {
   1.502 +                    if (i++ % 5 == 0) {
   1.503 +                        RBBIDebugPrintf("\n    ");
   1.504 +                    }
   1.505 +                    RBBIDebugPrintf("  %05x-%05x", tRange->fStartChar, tRange->fEndChar);
   1.506 +                }
   1.507 +            }
   1.508 +            RBBIDebugPrintf("\n");
   1.509 +        }
   1.510 +    }
   1.511 +    RBBIDebugPrintf("\n");
   1.512 +}
   1.513 +#endif
   1.514 +
   1.515 +
   1.516 +//------------------------------------------------------------------------
   1.517 +//
   1.518 +//   printSets          A debugging function.
   1.519 +//                      dump out all of the set definitions.
   1.520 +//
   1.521 +//------------------------------------------------------------------------
   1.522 +#ifdef RBBI_DEBUG
   1.523 +void RBBISetBuilder::printSets() {
   1.524 +    int                   i;
   1.525 +
   1.526 +    RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
   1.527 +    for (i=0; ; i++) {
   1.528 +        RBBINode        *usetNode;
   1.529 +        RBBINode        *setRef;
   1.530 +        RBBINode        *varRef;
   1.531 +        UnicodeString    setName;
   1.532 +
   1.533 +        usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
   1.534 +        if (usetNode == NULL) {
   1.535 +            break;
   1.536 +        }
   1.537 +
   1.538 +        RBBIDebugPrintf("%3d    ", i);
   1.539 +        setName = UNICODE_STRING("anonymous", 9);
   1.540 +        setRef = usetNode->fParent;
   1.541 +        if (setRef != NULL) {
   1.542 +            varRef = setRef->fParent;
   1.543 +            if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
   1.544 +                setName = varRef->fText;
   1.545 +            }
   1.546 +        }
   1.547 +        RBBI_DEBUG_printUnicodeString(setName);
   1.548 +        RBBIDebugPrintf("   ");
   1.549 +        RBBI_DEBUG_printUnicodeString(usetNode->fText);
   1.550 +        RBBIDebugPrintf("\n");
   1.551 +        if (usetNode->fLeftChild != NULL) {
   1.552 +            usetNode->fLeftChild->printTree(TRUE);
   1.553 +        }
   1.554 +    }
   1.555 +    RBBIDebugPrintf("\n");
   1.556 +}
   1.557 +#endif
   1.558 +
   1.559 +
   1.560 +
   1.561 +//-------------------------------------------------------------------------------------
   1.562 +//
   1.563 +//  RangeDescriptor copy constructor
   1.564 +//
   1.565 +//-------------------------------------------------------------------------------------
   1.566 +
   1.567 +RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
   1.568 +    int  i;
   1.569 +
   1.570 +    this->fStartChar    = other.fStartChar;
   1.571 +    this->fEndChar      = other.fEndChar;
   1.572 +    this->fNum          = other.fNum;
   1.573 +    this->fNext         = NULL;
   1.574 +    UErrorCode oldstatus = status;
   1.575 +    this->fIncludesSets = new UVector(status);
   1.576 +    if (U_FAILURE(oldstatus)) {
   1.577 +        status = oldstatus;
   1.578 +    }
   1.579 +    if (U_FAILURE(status)) {
   1.580 +        return;
   1.581 +    }
   1.582 +    /* test for NULL */
   1.583 +    if (this->fIncludesSets == 0) {
   1.584 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.585 +        return;
   1.586 +    }
   1.587 +
   1.588 +    for (i=0; i<other.fIncludesSets->size(); i++) {
   1.589 +        this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
   1.590 +    }
   1.591 +}
   1.592 +
   1.593 +
   1.594 +//-------------------------------------------------------------------------------------
   1.595 +//
   1.596 +//  RangeDescriptor default constructor
   1.597 +//
   1.598 +//-------------------------------------------------------------------------------------
   1.599 +RangeDescriptor::RangeDescriptor(UErrorCode &status) {
   1.600 +    this->fStartChar    = 0;
   1.601 +    this->fEndChar      = 0;
   1.602 +    this->fNum          = 0;
   1.603 +    this->fNext         = NULL;
   1.604 +    UErrorCode oldstatus = status;
   1.605 +    this->fIncludesSets = new UVector(status);
   1.606 +    if (U_FAILURE(oldstatus)) {
   1.607 +        status = oldstatus;
   1.608 +    }
   1.609 +    if (U_FAILURE(status)) {
   1.610 +        return;
   1.611 +    }
   1.612 +    /* test for NULL */
   1.613 +    if(this->fIncludesSets == 0) {
   1.614 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.615 +        return;
   1.616 +    }
   1.617 +
   1.618 +}
   1.619 +
   1.620 +
   1.621 +//-------------------------------------------------------------------------------------
   1.622 +//
   1.623 +//  RangeDescriptor Destructor
   1.624 +//
   1.625 +//-------------------------------------------------------------------------------------
   1.626 +RangeDescriptor::~RangeDescriptor() {
   1.627 +    delete  fIncludesSets;
   1.628 +    fIncludesSets = NULL;
   1.629 +}
   1.630 +
   1.631 +//-------------------------------------------------------------------------------------
   1.632 +//
   1.633 +//  RangeDescriptor::split()
   1.634 +//
   1.635 +//-------------------------------------------------------------------------------------
   1.636 +void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
   1.637 +    U_ASSERT(where>fStartChar && where<=fEndChar);
   1.638 +    RangeDescriptor *nr = new RangeDescriptor(*this, status);
   1.639 +    if(nr == 0) {
   1.640 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.641 +        return;
   1.642 +    }
   1.643 +    if (U_FAILURE(status)) {
   1.644 +        delete nr;
   1.645 +        return;
   1.646 +    }
   1.647 +    //  RangeDescriptor copy constructor copies all fields.
   1.648 +    //  Only need to update those that are different after the split.
   1.649 +    nr->fStartChar = where;
   1.650 +    this->fEndChar = where-1;
   1.651 +    nr->fNext      = this->fNext;
   1.652 +    this->fNext    = nr;
   1.653 +}
   1.654 +
   1.655 +
   1.656 +//-------------------------------------------------------------------------------------
   1.657 +//
   1.658 +//   RangeDescriptor::setDictionaryFlag
   1.659 +//
   1.660 +//            Character Category Numbers that include characters from
   1.661 +//            the original Unicode Set named "dictionary" have bit 14
   1.662 +//            set to 1.  The RBBI runtime engine uses this to trigger
   1.663 +//            use of the word dictionary.
   1.664 +//
   1.665 +//            This function looks through the Unicode Sets that it
   1.666 +//            (the range) includes, and sets the bit in fNum when
   1.667 +//            "dictionary" is among them.
   1.668 +//
   1.669 +//            TODO:  a faster way would be to find the set node for
   1.670 +//                   "dictionary" just once, rather than looking it
   1.671 +//                   up by name every time.
   1.672 +//
   1.673 +//-------------------------------------------------------------------------------------
   1.674 +void RangeDescriptor::setDictionaryFlag() {
   1.675 +    int i;
   1.676 +
   1.677 +    for (i=0; i<this->fIncludesSets->size(); i++) {
   1.678 +        RBBINode       *usetNode    = (RBBINode *)fIncludesSets->elementAt(i);
   1.679 +        UnicodeString   setName;
   1.680 +        RBBINode       *setRef = usetNode->fParent;
   1.681 +        if (setRef != NULL) {
   1.682 +            RBBINode *varRef = setRef->fParent;
   1.683 +            if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
   1.684 +                setName = varRef->fText;
   1.685 +            }
   1.686 +        }
   1.687 +        if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) {   // TODO:  no string literals.
   1.688 +            this->fNum |= 0x4000;
   1.689 +            break;
   1.690 +        }
   1.691 +    }
   1.692 +}
   1.693 +
   1.694 +
   1.695 +
   1.696 +U_NAMESPACE_END
   1.697 +
   1.698 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial