1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbisetb.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,695 @@ 1.4 +// 1.5 +// rbbisetb.cpp 1.6 +// 1.7 +/* 1.8 +*************************************************************************** 1.9 +* Copyright (C) 2002-2008 International Business Machines Corporation * 1.10 +* and others. All rights reserved. * 1.11 +*************************************************************************** 1.12 +*/ 1.13 +// 1.14 +// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules 1.15 +// (part of the rule building process.) 1.16 +// 1.17 +// Starting with the rules parse tree from the scanner, 1.18 +// 1.19 +// - Enumerate the set of UnicodeSets that are referenced 1.20 +// by the RBBI rules. 1.21 +// - compute a set of non-overlapping character ranges 1.22 +// with all characters within a range belonging to the same 1.23 +// set of input Unicode sets. 1.24 +// - Derive a set of non-overlapping UnicodeSet (like things) 1.25 +// that will correspond to columns in the state table for 1.26 +// the RBBI execution engine. All characters within one 1.27 +// of these sets belong to the same set of the original 1.28 +// UnicodeSets from the user's rules. 1.29 +// - construct the trie table that maps input characters 1.30 +// to the index of the matching non-overlapping set of sets from 1.31 +// the previous step. 1.32 +// 1.33 + 1.34 +#include "unicode/utypes.h" 1.35 + 1.36 +#if !UCONFIG_NO_BREAK_ITERATION 1.37 + 1.38 +#include "unicode/uniset.h" 1.39 +#include "utrie.h" 1.40 +#include "uvector.h" 1.41 +#include "uassert.h" 1.42 +#include "cmemory.h" 1.43 +#include "cstring.h" 1.44 + 1.45 +#include "rbbisetb.h" 1.46 +#include "rbbinode.h" 1.47 + 1.48 + 1.49 +//------------------------------------------------------------------------ 1.50 +// 1.51 +// getFoldedRBBIValue Call-back function used during building of Trie table. 
1.52 +// Folding value: just store the offset (16 bits) 1.53 +// if there is any non-0 entry. 1.54 +// (It'd really be nice if the Trie builder would provide a 1.55 +// simple default, so this function could go away from here.) 1.56 +// 1.57 +//------------------------------------------------------------------------ 1.58 +/* folding value: just store the offset (16 bits) if there is any non-0 entry */ 1.59 +U_CDECL_BEGIN 1.60 +static uint32_t U_CALLCONV 1.61 +getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) { 1.62 + uint32_t value; 1.63 + UChar32 limit; 1.64 + UBool inBlockZero; 1.65 + 1.66 + limit=start+0x400; 1.67 + while(start<limit) { 1.68 + value=utrie_get32(trie, start, &inBlockZero); 1.69 + if(inBlockZero) { 1.70 + start+=UTRIE_DATA_BLOCK_LENGTH; 1.71 + } else if(value!=0) { 1.72 + return (uint32_t)(offset|0x8000); 1.73 + } else { 1.74 + ++start; 1.75 + } 1.76 + } 1.77 + return 0; 1.78 +} 1.79 + 1.80 + 1.81 +U_CDECL_END 1.82 + 1.83 + 1.84 + 1.85 +U_NAMESPACE_BEGIN 1.86 + 1.87 +//------------------------------------------------------------------------ 1.88 +// 1.89 +// Constructor 1.90 +// 1.91 +//------------------------------------------------------------------------ 1.92 +RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) 1.93 +{ 1.94 + fRB = rb; 1.95 + fStatus = rb->fStatus; 1.96 + fRangeList = 0; 1.97 + fTrie = 0; 1.98 + fTrieSize = 0; 1.99 + fGroupCount = 0; 1.100 + fSawBOF = FALSE; 1.101 +} 1.102 + 1.103 + 1.104 +//------------------------------------------------------------------------ 1.105 +// 1.106 +// Destructor 1.107 +// 1.108 +//------------------------------------------------------------------------ 1.109 +RBBISetBuilder::~RBBISetBuilder() 1.110 +{ 1.111 + RangeDescriptor *nextRangeDesc; 1.112 + 1.113 + // Walk through & delete the linked list of RangeDescriptors 1.114 + for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { 1.115 + RangeDescriptor *r = nextRangeDesc; 1.116 + nextRangeDesc = r->fNext; 1.117 + delete r; 1.118 
+ } 1.119 + 1.120 + utrie_close(fTrie); 1.121 +} 1.122 + 1.123 + 1.124 + 1.125 + 1.126 +//------------------------------------------------------------------------ 1.127 +// 1.128 +// build Build the list of non-overlapping character ranges 1.129 +// from the Unicode Sets. 1.130 +// 1.131 +//------------------------------------------------------------------------ 1.132 +void RBBISetBuilder::build() { 1.133 + RBBINode *usetNode; 1.134 + RangeDescriptor *rlRange; 1.135 + 1.136 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} 1.137 + 1.138 + // 1.139 + // Initialize the process by creating a single range encompassing all characters 1.140 + // that is in no sets. 1.141 + // 1.142 + fRangeList = new RangeDescriptor(*fStatus); // will check for status here 1.143 + if (fRangeList == NULL) { 1.144 + *fStatus = U_MEMORY_ALLOCATION_ERROR; 1.145 + return; 1.146 + } 1.147 + fRangeList->fStartChar = 0; 1.148 + fRangeList->fEndChar = 0x10ffff; 1.149 + 1.150 + if (U_FAILURE(*fStatus)) { 1.151 + return; 1.152 + } 1.153 + 1.154 + // 1.155 + // Find the set of non-overlapping ranges of characters 1.156 + // 1.157 + int ni; 1.158 + for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules 1.159 + usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); 1.160 + if (usetNode==NULL) { 1.161 + break; 1.162 + } 1.163 + 1.164 + UnicodeSet *inputSet = usetNode->fInputSet; 1.165 + int32_t inputSetRangeCount = inputSet->getRangeCount(); 1.166 + int inputSetRangeIndex = 0; 1.167 + rlRange = fRangeList; 1.168 + 1.169 + for (;;) { 1.170 + if (inputSetRangeIndex >= inputSetRangeCount) { 1.171 + break; 1.172 + } 1.173 + UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); 1.174 + UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); 1.175 + 1.176 + // skip over ranges from the range list that are completely 1.177 + // below the current range from the input unicode set. 
1.178 + while (rlRange->fEndChar < inputSetRangeBegin) { 1.179 + rlRange = rlRange->fNext; 1.180 + } 1.181 + 1.182 + // If the start of the range from the range list is before with 1.183 + // the start of the range from the unicode set, split the range list range 1.184 + // in two, with one part being before (wholly outside of) the unicode set 1.185 + // and the other containing the rest. 1.186 + // Then continue the loop; the post-split current range will then be skipped 1.187 + // over 1.188 + if (rlRange->fStartChar < inputSetRangeBegin) { 1.189 + rlRange->split(inputSetRangeBegin, *fStatus); 1.190 + if (U_FAILURE(*fStatus)) { 1.191 + return; 1.192 + } 1.193 + continue; 1.194 + } 1.195 + 1.196 + // Same thing at the end of the ranges... 1.197 + // If the end of the range from the range list doesn't coincide with 1.198 + // the end of the range from the unicode set, split the range list 1.199 + // range in two. The first part of the split range will be 1.200 + // wholly inside the Unicode set. 1.201 + if (rlRange->fEndChar > inputSetRangeEnd) { 1.202 + rlRange->split(inputSetRangeEnd+1, *fStatus); 1.203 + if (U_FAILURE(*fStatus)) { 1.204 + return; 1.205 + } 1.206 + } 1.207 + 1.208 + // The current rlRange is now entirely within the UnicodeSet range. 1.209 + // Add this unicode set to the list of sets for this rlRange 1.210 + if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { 1.211 + rlRange->fIncludesSets->addElement(usetNode, *fStatus); 1.212 + if (U_FAILURE(*fStatus)) { 1.213 + return; 1.214 + } 1.215 + } 1.216 + 1.217 + // Advance over ranges that we are finished with. 
1.218 + if (inputSetRangeEnd == rlRange->fEndChar) { 1.219 + inputSetRangeIndex++; 1.220 + } 1.221 + rlRange = rlRange->fNext; 1.222 + } 1.223 + } 1.224 + 1.225 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} 1.226 + 1.227 + // 1.228 + // Group the above ranges, with each group consisting of one or more 1.229 + // ranges that are in exactly the same set of original UnicodeSets. 1.230 + // The groups are numbered, and these group numbers are the set of 1.231 + // input symbols recognized by the run-time state machine. 1.232 + // 1.233 + // Numbering: # 0 (state table column 0) is unused. 1.234 + // # 1 is reserved - table column 1 is for end-of-input 1.235 + // # 2 is reserved - table column 2 is for beginning-in-input 1.236 + // # 3 is the first range list. 1.237 + // 1.238 + RangeDescriptor *rlSearchRange; 1.239 + for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 1.240 + for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { 1.241 + if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { 1.242 + rlRange->fNum = rlSearchRange->fNum; 1.243 + break; 1.244 + } 1.245 + } 1.246 + if (rlRange->fNum == 0) { 1.247 + fGroupCount ++; 1.248 + rlRange->fNum = fGroupCount+2; 1.249 + rlRange->setDictionaryFlag(); 1.250 + addValToSets(rlRange->fIncludesSets, fGroupCount+2); 1.251 + } 1.252 + } 1.253 + 1.254 + // Handle input sets that contain the special string {eof}. 1.255 + // Column 1 of the state table is reserved for EOF on input. 1.256 + // Column 2 is reserved for before-the-start-input. 1.257 + // (This column can be optimized away later if there are no rule 1.258 + // references to {bof}.) 1.259 + // Add this column value (1 or 2) to the equivalent expression 1.260 + // subtree for each UnicodeSet that contains the string {eof} 1.261 + // Because {bof} and {eof} are not a characters in the normal sense, 1.262 + // they doesn't affect the computation of ranges or TRIE. 
1.263 + static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; 1.264 + static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; 1.265 + 1.266 + UnicodeString eofString(eofUString); 1.267 + UnicodeString bofString(bofUString); 1.268 + for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules 1.269 + usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); 1.270 + if (usetNode==NULL) { 1.271 + break; 1.272 + } 1.273 + UnicodeSet *inputSet = usetNode->fInputSet; 1.274 + if (inputSet->contains(eofString)) { 1.275 + addValToSet(usetNode, 1); 1.276 + } 1.277 + if (inputSet->contains(bofString)) { 1.278 + addValToSet(usetNode, 2); 1.279 + fSawBOF = TRUE; 1.280 + } 1.281 + } 1.282 + 1.283 + 1.284 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} 1.285 + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} 1.286 + 1.287 + // 1.288 + // Build the Trie table for mapping UChar32 values to the corresponding 1.289 + // range group number 1.290 + // 1.291 + fTrie = utrie_open(NULL, // Pre-existing trie to be filled in 1.292 + NULL, // Data array (utrie will allocate one) 1.293 + 100000, // Max Data Length 1.294 + 0, // Initial value for all code points 1.295 + 0, // Lead surrogate unit value 1.296 + TRUE); // Keep Latin 1 in separately 1.297 + 1.298 + 1.299 + for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 1.300 + utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); 1.301 + } 1.302 +} 1.303 + 1.304 + 1.305 + 1.306 +//----------------------------------------------------------------------------------- 1.307 +// 1.308 +// getTrieSize() Return the size that will be required to serialize the Trie. 
1.309 +// 1.310 +//----------------------------------------------------------------------------------- 1.311 +int32_t RBBISetBuilder::getTrieSize() /*const*/ { 1.312 + fTrieSize = utrie_serialize(fTrie, 1.313 + NULL, // Buffer 1.314 + 0, // Capacity 1.315 + getFoldedRBBIValue, 1.316 + TRUE, // Reduce to 16 bits 1.317 + fStatus); 1.318 + // RBBIDebugPrintf("Trie table size is %d\n", trieSize); 1.319 + return fTrieSize; 1.320 +} 1.321 + 1.322 + 1.323 +//----------------------------------------------------------------------------------- 1.324 +// 1.325 +// serializeTrie() Put the serialized trie at the specified address. 1.326 +// Trust the caller to have given us enough memory. 1.327 +// getTrieSize() MUST be called first. 1.328 +// 1.329 +//----------------------------------------------------------------------------------- 1.330 +void RBBISetBuilder::serializeTrie(uint8_t *where) { 1.331 + utrie_serialize(fTrie, 1.332 + where, // Buffer 1.333 + fTrieSize, // Capacity 1.334 + getFoldedRBBIValue, 1.335 + TRUE, // Reduce to 16 bits 1.336 + fStatus); 1.337 +} 1.338 + 1.339 +//------------------------------------------------------------------------ 1.340 +// 1.341 +// addValToSets Add a runtime-mapped input value to each uset from a 1.342 +// list of uset nodes. (val corresponds to a state table column.) 1.343 +// For each of the original Unicode sets - which correspond 1.344 +// directly to uset nodes - a logically equivalent expression 1.345 +// is constructed in terms of the remapped runtime input 1.346 +// symbol set. This function adds one runtime input symbol to 1.347 +// a list of sets. 1.348 +// 1.349 +// The "logically equivalent expression" is the tree for an 1.350 +// or-ing together of all of the symbols that go into the set. 
1.351 +// 1.352 +//------------------------------------------------------------------------ 1.353 +void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { 1.354 + int32_t ix; 1.355 + 1.356 + for (ix=0; ix<sets->size(); ix++) { 1.357 + RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); 1.358 + addValToSet(usetNode, val); 1.359 + } 1.360 +} 1.361 + 1.362 +void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { 1.363 + RBBINode *leafNode = new RBBINode(RBBINode::leafChar); 1.364 + if (leafNode == NULL) { 1.365 + *fStatus = U_MEMORY_ALLOCATION_ERROR; 1.366 + return; 1.367 + } 1.368 + leafNode->fVal = (unsigned short)val; 1.369 + if (usetNode->fLeftChild == NULL) { 1.370 + usetNode->fLeftChild = leafNode; 1.371 + leafNode->fParent = usetNode; 1.372 + } else { 1.373 + // There are already input symbols present for this set. 1.374 + // Set up an OR node, with the previous stuff as the left child 1.375 + // and the new value as the right child. 1.376 + RBBINode *orNode = new RBBINode(RBBINode::opOr); 1.377 + if (orNode == NULL) { 1.378 + *fStatus = U_MEMORY_ALLOCATION_ERROR; 1.379 + return; 1.380 + } 1.381 + orNode->fLeftChild = usetNode->fLeftChild; 1.382 + orNode->fRightChild = leafNode; 1.383 + orNode->fLeftChild->fParent = orNode; 1.384 + orNode->fRightChild->fParent = orNode; 1.385 + usetNode->fLeftChild = orNode; 1.386 + orNode->fParent = usetNode; 1.387 + } 1.388 +} 1.389 + 1.390 + 1.391 +//------------------------------------------------------------------------ 1.392 +// 1.393 +// getNumCharCategories 1.394 +// 1.395 +//------------------------------------------------------------------------ 1.396 +int32_t RBBISetBuilder::getNumCharCategories() const { 1.397 + return fGroupCount + 3; 1.398 +} 1.399 + 1.400 + 1.401 +//------------------------------------------------------------------------ 1.402 +// 1.403 +// sawBOF 1.404 +// 1.405 +//------------------------------------------------------------------------ 1.406 +UBool 
RBBISetBuilder::sawBOF() const { 1.407 + return fSawBOF; 1.408 +} 1.409 + 1.410 + 1.411 +//------------------------------------------------------------------------ 1.412 +// 1.413 +// getFirstChar Given a runtime RBBI character category, find 1.414 +// the first UChar32 that is in the set of chars 1.415 +// in the category. 1.416 +//------------------------------------------------------------------------ 1.417 +UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { 1.418 + RangeDescriptor *rlRange; 1.419 + UChar32 retVal = (UChar32)-1; 1.420 + for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 1.421 + if (rlRange->fNum == category) { 1.422 + retVal = rlRange->fStartChar; 1.423 + break; 1.424 + } 1.425 + } 1.426 + return retVal; 1.427 +} 1.428 + 1.429 + 1.430 + 1.431 +//------------------------------------------------------------------------ 1.432 +// 1.433 +// printRanges A debugging function. 1.434 +// dump out all of the range definitions. 1.435 +// 1.436 +//------------------------------------------------------------------------ 1.437 +#ifdef RBBI_DEBUG 1.438 +void RBBISetBuilder::printRanges() { 1.439 + RangeDescriptor *rlRange; 1.440 + int i; 1.441 + 1.442 + RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); 1.443 + for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 1.444 + RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); 1.445 + 1.446 + for (i=0; i<rlRange->fIncludesSets->size(); i++) { 1.447 + RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); 1.448 + UnicodeString setName = UNICODE_STRING("anon", 4); 1.449 + RBBINode *setRef = usetNode->fParent; 1.450 + if (setRef != NULL) { 1.451 + RBBINode *varRef = setRef->fParent; 1.452 + if (varRef != NULL && varRef->fType == RBBINode::varRef) { 1.453 + setName = varRef->fText; 1.454 + } 1.455 + } 1.456 + RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); 1.457 + } 1.458 + RBBIDebugPrintf("\n"); 1.459 + } 1.460 
+} 1.461 +#endif 1.462 + 1.463 + 1.464 +//------------------------------------------------------------------------ 1.465 +// 1.466 +// printRangeGroups A debugging function. 1.467 +// dump out all of the range groups. 1.468 +// 1.469 +//------------------------------------------------------------------------ 1.470 +#ifdef RBBI_DEBUG 1.471 +void RBBISetBuilder::printRangeGroups() { 1.472 + RangeDescriptor *rlRange; 1.473 + RangeDescriptor *tRange; 1.474 + int i; 1.475 + int lastPrintedGroupNum = 0; 1.476 + 1.477 + RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); 1.478 + for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { 1.479 + int groupNum = rlRange->fNum & 0xbfff; 1.480 + if (groupNum > lastPrintedGroupNum) { 1.481 + lastPrintedGroupNum = groupNum; 1.482 + RBBIDebugPrintf("%2i ", groupNum); 1.483 + 1.484 + if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");} 1.485 + 1.486 + for (i=0; i<rlRange->fIncludesSets->size(); i++) { 1.487 + RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); 1.488 + UnicodeString setName = UNICODE_STRING("anon", 4); 1.489 + RBBINode *setRef = usetNode->fParent; 1.490 + if (setRef != NULL) { 1.491 + RBBINode *varRef = setRef->fParent; 1.492 + if (varRef != NULL && varRef->fType == RBBINode::varRef) { 1.493 + setName = varRef->fText; 1.494 + } 1.495 + } 1.496 + RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); 1.497 + } 1.498 + 1.499 + i = 0; 1.500 + for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { 1.501 + if (tRange->fNum == rlRange->fNum) { 1.502 + if (i++ % 5 == 0) { 1.503 + RBBIDebugPrintf("\n "); 1.504 + } 1.505 + RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); 1.506 + } 1.507 + } 1.508 + RBBIDebugPrintf("\n"); 1.509 + } 1.510 + } 1.511 + RBBIDebugPrintf("\n"); 1.512 +} 1.513 +#endif 1.514 + 1.515 + 1.516 +//------------------------------------------------------------------------ 1.517 +// 1.518 +// printSets A debugging function. 
1.519 +// dump out all of the set definitions. 1.520 +// 1.521 +//------------------------------------------------------------------------ 1.522 +#ifdef RBBI_DEBUG 1.523 +void RBBISetBuilder::printSets() { 1.524 + int i; 1.525 + 1.526 + RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); 1.527 + for (i=0; ; i++) { 1.528 + RBBINode *usetNode; 1.529 + RBBINode *setRef; 1.530 + RBBINode *varRef; 1.531 + UnicodeString setName; 1.532 + 1.533 + usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); 1.534 + if (usetNode == NULL) { 1.535 + break; 1.536 + } 1.537 + 1.538 + RBBIDebugPrintf("%3d ", i); 1.539 + setName = UNICODE_STRING("anonymous", 9); 1.540 + setRef = usetNode->fParent; 1.541 + if (setRef != NULL) { 1.542 + varRef = setRef->fParent; 1.543 + if (varRef != NULL && varRef->fType == RBBINode::varRef) { 1.544 + setName = varRef->fText; 1.545 + } 1.546 + } 1.547 + RBBI_DEBUG_printUnicodeString(setName); 1.548 + RBBIDebugPrintf(" "); 1.549 + RBBI_DEBUG_printUnicodeString(usetNode->fText); 1.550 + RBBIDebugPrintf("\n"); 1.551 + if (usetNode->fLeftChild != NULL) { 1.552 + usetNode->fLeftChild->printTree(TRUE); 1.553 + } 1.554 + } 1.555 + RBBIDebugPrintf("\n"); 1.556 +} 1.557 +#endif 1.558 + 1.559 + 1.560 + 1.561 +//------------------------------------------------------------------------------------- 1.562 +// 1.563 +// RangeDescriptor copy constructor 1.564 +// 1.565 +//------------------------------------------------------------------------------------- 1.566 + 1.567 +RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { 1.568 + int i; 1.569 + 1.570 + this->fStartChar = other.fStartChar; 1.571 + this->fEndChar = other.fEndChar; 1.572 + this->fNum = other.fNum; 1.573 + this->fNext = NULL; 1.574 + UErrorCode oldstatus = status; 1.575 + this->fIncludesSets = new UVector(status); 1.576 + if (U_FAILURE(oldstatus)) { 1.577 + status = oldstatus; 1.578 + } 1.579 + if (U_FAILURE(status)) { 1.580 + return; 1.581 + } 1.582 + /* test for 
NULL */ 1.583 + if (this->fIncludesSets == 0) { 1.584 + status = U_MEMORY_ALLOCATION_ERROR; 1.585 + return; 1.586 + } 1.587 + 1.588 + for (i=0; i<other.fIncludesSets->size(); i++) { 1.589 + this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); 1.590 + } 1.591 +} 1.592 + 1.593 + 1.594 +//------------------------------------------------------------------------------------- 1.595 +// 1.596 +// RangeDesriptor default constructor 1.597 +// 1.598 +//------------------------------------------------------------------------------------- 1.599 +RangeDescriptor::RangeDescriptor(UErrorCode &status) { 1.600 + this->fStartChar = 0; 1.601 + this->fEndChar = 0; 1.602 + this->fNum = 0; 1.603 + this->fNext = NULL; 1.604 + UErrorCode oldstatus = status; 1.605 + this->fIncludesSets = new UVector(status); 1.606 + if (U_FAILURE(oldstatus)) { 1.607 + status = oldstatus; 1.608 + } 1.609 + if (U_FAILURE(status)) { 1.610 + return; 1.611 + } 1.612 + /* test for NULL */ 1.613 + if(this->fIncludesSets == 0) { 1.614 + status = U_MEMORY_ALLOCATION_ERROR; 1.615 + return; 1.616 + } 1.617 + 1.618 +} 1.619 + 1.620 + 1.621 +//------------------------------------------------------------------------------------- 1.622 +// 1.623 +// RangeDesriptor Destructor 1.624 +// 1.625 +//------------------------------------------------------------------------------------- 1.626 +RangeDescriptor::~RangeDescriptor() { 1.627 + delete fIncludesSets; 1.628 + fIncludesSets = NULL; 1.629 +} 1.630 + 1.631 +//------------------------------------------------------------------------------------- 1.632 +// 1.633 +// RangeDesriptor::split() 1.634 +// 1.635 +//------------------------------------------------------------------------------------- 1.636 +void RangeDescriptor::split(UChar32 where, UErrorCode &status) { 1.637 + U_ASSERT(where>fStartChar && where<=fEndChar); 1.638 + RangeDescriptor *nr = new RangeDescriptor(*this, status); 1.639 + if(nr == 0) { 1.640 + status = U_MEMORY_ALLOCATION_ERROR; 1.641 + 
return; 1.642 + } 1.643 + if (U_FAILURE(status)) { 1.644 + delete nr; 1.645 + return; 1.646 + } 1.647 + // RangeDescriptor copy constructor copies all fields. 1.648 + // Only need to update those that are different after the split. 1.649 + nr->fStartChar = where; 1.650 + this->fEndChar = where-1; 1.651 + nr->fNext = this->fNext; 1.652 + this->fNext = nr; 1.653 +} 1.654 + 1.655 + 1.656 +//------------------------------------------------------------------------------------- 1.657 +// 1.658 +// RangeDescriptor::setDictionaryFlag 1.659 +// 1.660 +// Character Category Numbers that include characters from 1.661 +// the original Unicode Set named "dictionary" have bit 14 1.662 +// set to 1. The RBBI runtime engine uses this to trigger 1.663 +// use of the word dictionary. 1.664 +// 1.665 +// This function looks through the Unicode Sets that it 1.666 +// (the range) includes, and sets the bit in fNum when 1.667 +// "dictionary" is among them. 1.668 +// 1.669 +// TODO: a faster way would be to find the set node for 1.670 +// "dictionary" just once, rather than looking it 1.671 +// up by name every time. 1.672 +// 1.673 +//------------------------------------------------------------------------------------- 1.674 +void RangeDescriptor::setDictionaryFlag() { 1.675 + int i; 1.676 + 1.677 + for (i=0; i<this->fIncludesSets->size(); i++) { 1.678 + RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); 1.679 + UnicodeString setName; 1.680 + RBBINode *setRef = usetNode->fParent; 1.681 + if (setRef != NULL) { 1.682 + RBBINode *varRef = setRef->fParent; 1.683 + if (varRef != NULL && varRef->fType == RBBINode::varRef) { 1.684 + setName = varRef->fText; 1.685 + } 1.686 + } 1.687 + if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals. 1.688 + this->fNum |= 0x4000; 1.689 + break; 1.690 + } 1.691 + } 1.692 +} 1.693 + 1.694 + 1.695 + 1.696 +U_NAMESPACE_END 1.697 + 1.698 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */