Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | // |
michael@0 | 2 | // rbbisetb.cpp |
michael@0 | 3 | // |
michael@0 | 4 | /* |
michael@0 | 5 | *************************************************************************** |
michael@0 | 6 | * Copyright (C) 2002-2008 International Business Machines Corporation * |
michael@0 | 7 | * and others. All rights reserved. * |
michael@0 | 8 | *************************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | // |
michael@0 | 11 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules |
michael@0 | 12 | // (part of the rule building process.) |
michael@0 | 13 | // |
michael@0 | 14 | // Starting with the rules parse tree from the scanner, |
michael@0 | 15 | // |
michael@0 | 16 | // - Enumerate the set of UnicodeSets that are referenced |
michael@0 | 17 | // by the RBBI rules. |
michael@0 | 18 | // - compute a set of non-overlapping character ranges |
michael@0 | 19 | // with all characters within a range belonging to the same |
michael@0 | 20 | // set of input uniocde sets. |
michael@0 | 21 | // - Derive a set of non-overlapping UnicodeSet (like things) |
michael@0 | 22 | // that will correspond to columns in the state table for |
michael@0 | 23 | // the RBBI execution engine. All characters within one |
michael@0 | 24 | // of these sets belong to the same set of the original |
michael@0 | 25 | // UnicodeSets from the user's rules. |
michael@0 | 26 | // - construct the trie table that maps input characters |
michael@0 | 27 | // to the index of the matching non-overlapping set of set from |
michael@0 | 28 | // the previous step. |
michael@0 | 29 | // |
michael@0 | 30 | |
michael@0 | 31 | #include "unicode/utypes.h" |
michael@0 | 32 | |
michael@0 | 33 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 34 | |
michael@0 | 35 | #include "unicode/uniset.h" |
michael@0 | 36 | #include "utrie.h" |
michael@0 | 37 | #include "uvector.h" |
michael@0 | 38 | #include "uassert.h" |
michael@0 | 39 | #include "cmemory.h" |
michael@0 | 40 | #include "cstring.h" |
michael@0 | 41 | |
michael@0 | 42 | #include "rbbisetb.h" |
michael@0 | 43 | #include "rbbinode.h" |
michael@0 | 44 | |
michael@0 | 45 | |
michael@0 | 46 | //------------------------------------------------------------------------ |
michael@0 | 47 | // |
michael@0 | 48 | // getFoldedRBBIValue Call-back function used during building of Trie table. |
michael@0 | 49 | // Folding value: just store the offset (16 bits) |
michael@0 | 50 | // if there is any non-0 entry. |
michael@0 | 51 | // (It'd really be nice if the Trie builder would provide a |
michael@0 | 52 | // simple default, so this function could go away from here.) |
michael@0 | 53 | // |
michael@0 | 54 | //------------------------------------------------------------------------ |
michael@0 | 55 | /* folding value: just store the offset (16 bits) if there is any non-0 entry */ |
michael@0 | 56 | U_CDECL_BEGIN |
michael@0 | 57 | static uint32_t U_CALLCONV |
michael@0 | 58 | getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) { |
michael@0 | 59 | uint32_t value; |
michael@0 | 60 | UChar32 limit; |
michael@0 | 61 | UBool inBlockZero; |
michael@0 | 62 | |
michael@0 | 63 | limit=start+0x400; |
michael@0 | 64 | while(start<limit) { |
michael@0 | 65 | value=utrie_get32(trie, start, &inBlockZero); |
michael@0 | 66 | if(inBlockZero) { |
michael@0 | 67 | start+=UTRIE_DATA_BLOCK_LENGTH; |
michael@0 | 68 | } else if(value!=0) { |
michael@0 | 69 | return (uint32_t)(offset|0x8000); |
michael@0 | 70 | } else { |
michael@0 | 71 | ++start; |
michael@0 | 72 | } |
michael@0 | 73 | } |
michael@0 | 74 | return 0; |
michael@0 | 75 | } |
michael@0 | 76 | |
michael@0 | 77 | |
michael@0 | 78 | U_CDECL_END |
michael@0 | 79 | |
michael@0 | 80 | |
michael@0 | 81 | |
michael@0 | 82 | U_NAMESPACE_BEGIN |
michael@0 | 83 | |
michael@0 | 84 | //------------------------------------------------------------------------ |
michael@0 | 85 | // |
michael@0 | 86 | // Constructor |
michael@0 | 87 | // |
michael@0 | 88 | //------------------------------------------------------------------------ |
michael@0 | 89 | RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) |
michael@0 | 90 | { |
michael@0 | 91 | fRB = rb; |
michael@0 | 92 | fStatus = rb->fStatus; |
michael@0 | 93 | fRangeList = 0; |
michael@0 | 94 | fTrie = 0; |
michael@0 | 95 | fTrieSize = 0; |
michael@0 | 96 | fGroupCount = 0; |
michael@0 | 97 | fSawBOF = FALSE; |
michael@0 | 98 | } |
michael@0 | 99 | |
michael@0 | 100 | |
michael@0 | 101 | //------------------------------------------------------------------------ |
michael@0 | 102 | // |
michael@0 | 103 | // Destructor |
michael@0 | 104 | // |
michael@0 | 105 | //------------------------------------------------------------------------ |
michael@0 | 106 | RBBISetBuilder::~RBBISetBuilder() |
michael@0 | 107 | { |
michael@0 | 108 | RangeDescriptor *nextRangeDesc; |
michael@0 | 109 | |
michael@0 | 110 | // Walk through & delete the linked list of RangeDescriptors |
michael@0 | 111 | for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { |
michael@0 | 112 | RangeDescriptor *r = nextRangeDesc; |
michael@0 | 113 | nextRangeDesc = r->fNext; |
michael@0 | 114 | delete r; |
michael@0 | 115 | } |
michael@0 | 116 | |
michael@0 | 117 | utrie_close(fTrie); |
michael@0 | 118 | } |
michael@0 | 119 | |
michael@0 | 120 | |
michael@0 | 121 | |
michael@0 | 122 | |
michael@0 | 123 | //------------------------------------------------------------------------ |
michael@0 | 124 | // |
michael@0 | 125 | // build Build the list of non-overlapping character ranges |
michael@0 | 126 | // from the Unicode Sets. |
michael@0 | 127 | // |
michael@0 | 128 | //------------------------------------------------------------------------ |
michael@0 | 129 | void RBBISetBuilder::build() { |
michael@0 | 130 | RBBINode *usetNode; |
michael@0 | 131 | RangeDescriptor *rlRange; |
michael@0 | 132 | |
michael@0 | 133 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} |
michael@0 | 134 | |
michael@0 | 135 | // |
michael@0 | 136 | // Initialize the process by creating a single range encompassing all characters |
michael@0 | 137 | // that is in no sets. |
michael@0 | 138 | // |
michael@0 | 139 | fRangeList = new RangeDescriptor(*fStatus); // will check for status here |
michael@0 | 140 | if (fRangeList == NULL) { |
michael@0 | 141 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 142 | return; |
michael@0 | 143 | } |
michael@0 | 144 | fRangeList->fStartChar = 0; |
michael@0 | 145 | fRangeList->fEndChar = 0x10ffff; |
michael@0 | 146 | |
michael@0 | 147 | if (U_FAILURE(*fStatus)) { |
michael@0 | 148 | return; |
michael@0 | 149 | } |
michael@0 | 150 | |
michael@0 | 151 | // |
michael@0 | 152 | // Find the set of non-overlapping ranges of characters |
michael@0 | 153 | // |
michael@0 | 154 | int ni; |
michael@0 | 155 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
michael@0 | 156 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
michael@0 | 157 | if (usetNode==NULL) { |
michael@0 | 158 | break; |
michael@0 | 159 | } |
michael@0 | 160 | |
michael@0 | 161 | UnicodeSet *inputSet = usetNode->fInputSet; |
michael@0 | 162 | int32_t inputSetRangeCount = inputSet->getRangeCount(); |
michael@0 | 163 | int inputSetRangeIndex = 0; |
michael@0 | 164 | rlRange = fRangeList; |
michael@0 | 165 | |
michael@0 | 166 | for (;;) { |
michael@0 | 167 | if (inputSetRangeIndex >= inputSetRangeCount) { |
michael@0 | 168 | break; |
michael@0 | 169 | } |
michael@0 | 170 | UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); |
michael@0 | 171 | UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); |
michael@0 | 172 | |
michael@0 | 173 | // skip over ranges from the range list that are completely |
michael@0 | 174 | // below the current range from the input unicode set. |
michael@0 | 175 | while (rlRange->fEndChar < inputSetRangeBegin) { |
michael@0 | 176 | rlRange = rlRange->fNext; |
michael@0 | 177 | } |
michael@0 | 178 | |
michael@0 | 179 | // If the start of the range from the range list is before with |
michael@0 | 180 | // the start of the range from the unicode set, split the range list range |
michael@0 | 181 | // in two, with one part being before (wholly outside of) the unicode set |
michael@0 | 182 | // and the other containing the rest. |
michael@0 | 183 | // Then continue the loop; the post-split current range will then be skipped |
michael@0 | 184 | // over |
michael@0 | 185 | if (rlRange->fStartChar < inputSetRangeBegin) { |
michael@0 | 186 | rlRange->split(inputSetRangeBegin, *fStatus); |
michael@0 | 187 | if (U_FAILURE(*fStatus)) { |
michael@0 | 188 | return; |
michael@0 | 189 | } |
michael@0 | 190 | continue; |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | // Same thing at the end of the ranges... |
michael@0 | 194 | // If the end of the range from the range list doesn't coincide with |
michael@0 | 195 | // the end of the range from the unicode set, split the range list |
michael@0 | 196 | // range in two. The first part of the split range will be |
michael@0 | 197 | // wholly inside the Unicode set. |
michael@0 | 198 | if (rlRange->fEndChar > inputSetRangeEnd) { |
michael@0 | 199 | rlRange->split(inputSetRangeEnd+1, *fStatus); |
michael@0 | 200 | if (U_FAILURE(*fStatus)) { |
michael@0 | 201 | return; |
michael@0 | 202 | } |
michael@0 | 203 | } |
michael@0 | 204 | |
michael@0 | 205 | // The current rlRange is now entirely within the UnicodeSet range. |
michael@0 | 206 | // Add this unicode set to the list of sets for this rlRange |
michael@0 | 207 | if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { |
michael@0 | 208 | rlRange->fIncludesSets->addElement(usetNode, *fStatus); |
michael@0 | 209 | if (U_FAILURE(*fStatus)) { |
michael@0 | 210 | return; |
michael@0 | 211 | } |
michael@0 | 212 | } |
michael@0 | 213 | |
michael@0 | 214 | // Advance over ranges that we are finished with. |
michael@0 | 215 | if (inputSetRangeEnd == rlRange->fEndChar) { |
michael@0 | 216 | inputSetRangeIndex++; |
michael@0 | 217 | } |
michael@0 | 218 | rlRange = rlRange->fNext; |
michael@0 | 219 | } |
michael@0 | 220 | } |
michael@0 | 221 | |
michael@0 | 222 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} |
michael@0 | 223 | |
michael@0 | 224 | // |
michael@0 | 225 | // Group the above ranges, with each group consisting of one or more |
michael@0 | 226 | // ranges that are in exactly the same set of original UnicodeSets. |
michael@0 | 227 | // The groups are numbered, and these group numbers are the set of |
michael@0 | 228 | // input symbols recognized by the run-time state machine. |
michael@0 | 229 | // |
michael@0 | 230 | // Numbering: # 0 (state table column 0) is unused. |
michael@0 | 231 | // # 1 is reserved - table column 1 is for end-of-input |
michael@0 | 232 | // # 2 is reserved - table column 2 is for beginning-in-input |
michael@0 | 233 | // # 3 is the first range list. |
michael@0 | 234 | // |
michael@0 | 235 | RangeDescriptor *rlSearchRange; |
michael@0 | 236 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
michael@0 | 237 | for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { |
michael@0 | 238 | if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { |
michael@0 | 239 | rlRange->fNum = rlSearchRange->fNum; |
michael@0 | 240 | break; |
michael@0 | 241 | } |
michael@0 | 242 | } |
michael@0 | 243 | if (rlRange->fNum == 0) { |
michael@0 | 244 | fGroupCount ++; |
michael@0 | 245 | rlRange->fNum = fGroupCount+2; |
michael@0 | 246 | rlRange->setDictionaryFlag(); |
michael@0 | 247 | addValToSets(rlRange->fIncludesSets, fGroupCount+2); |
michael@0 | 248 | } |
michael@0 | 249 | } |
michael@0 | 250 | |
michael@0 | 251 | // Handle input sets that contain the special string {eof}. |
michael@0 | 252 | // Column 1 of the state table is reserved for EOF on input. |
michael@0 | 253 | // Column 2 is reserved for before-the-start-input. |
michael@0 | 254 | // (This column can be optimized away later if there are no rule |
michael@0 | 255 | // references to {bof}.) |
michael@0 | 256 | // Add this column value (1 or 2) to the equivalent expression |
michael@0 | 257 | // subtree for each UnicodeSet that contains the string {eof} |
michael@0 | 258 | // Because {bof} and {eof} are not a characters in the normal sense, |
michael@0 | 259 | // they doesn't affect the computation of ranges or TRIE. |
michael@0 | 260 | static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; |
michael@0 | 261 | static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; |
michael@0 | 262 | |
michael@0 | 263 | UnicodeString eofString(eofUString); |
michael@0 | 264 | UnicodeString bofString(bofUString); |
michael@0 | 265 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
michael@0 | 266 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
michael@0 | 267 | if (usetNode==NULL) { |
michael@0 | 268 | break; |
michael@0 | 269 | } |
michael@0 | 270 | UnicodeSet *inputSet = usetNode->fInputSet; |
michael@0 | 271 | if (inputSet->contains(eofString)) { |
michael@0 | 272 | addValToSet(usetNode, 1); |
michael@0 | 273 | } |
michael@0 | 274 | if (inputSet->contains(bofString)) { |
michael@0 | 275 | addValToSet(usetNode, 2); |
michael@0 | 276 | fSawBOF = TRUE; |
michael@0 | 277 | } |
michael@0 | 278 | } |
michael@0 | 279 | |
michael@0 | 280 | |
michael@0 | 281 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} |
michael@0 | 282 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} |
michael@0 | 283 | |
michael@0 | 284 | // |
michael@0 | 285 | // Build the Trie table for mapping UChar32 values to the corresponding |
michael@0 | 286 | // range group number |
michael@0 | 287 | // |
michael@0 | 288 | fTrie = utrie_open(NULL, // Pre-existing trie to be filled in |
michael@0 | 289 | NULL, // Data array (utrie will allocate one) |
michael@0 | 290 | 100000, // Max Data Length |
michael@0 | 291 | 0, // Initial value for all code points |
michael@0 | 292 | 0, // Lead surrogate unit value |
michael@0 | 293 | TRUE); // Keep Latin 1 in separately |
michael@0 | 294 | |
michael@0 | 295 | |
michael@0 | 296 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
michael@0 | 297 | utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); |
michael@0 | 298 | } |
michael@0 | 299 | } |
michael@0 | 300 | |
michael@0 | 301 | |
michael@0 | 302 | |
michael@0 | 303 | //----------------------------------------------------------------------------------- |
michael@0 | 304 | // |
michael@0 | 305 | // getTrieSize() Return the size that will be required to serialize the Trie. |
michael@0 | 306 | // |
michael@0 | 307 | //----------------------------------------------------------------------------------- |
michael@0 | 308 | int32_t RBBISetBuilder::getTrieSize() /*const*/ { |
michael@0 | 309 | fTrieSize = utrie_serialize(fTrie, |
michael@0 | 310 | NULL, // Buffer |
michael@0 | 311 | 0, // Capacity |
michael@0 | 312 | getFoldedRBBIValue, |
michael@0 | 313 | TRUE, // Reduce to 16 bits |
michael@0 | 314 | fStatus); |
michael@0 | 315 | // RBBIDebugPrintf("Trie table size is %d\n", trieSize); |
michael@0 | 316 | return fTrieSize; |
michael@0 | 317 | } |
michael@0 | 318 | |
michael@0 | 319 | |
michael@0 | 320 | //----------------------------------------------------------------------------------- |
michael@0 | 321 | // |
michael@0 | 322 | // serializeTrie() Put the serialized trie at the specified address. |
michael@0 | 323 | // Trust the caller to have given us enough memory. |
michael@0 | 324 | // getTrieSize() MUST be called first. |
michael@0 | 325 | // |
michael@0 | 326 | //----------------------------------------------------------------------------------- |
michael@0 | 327 | void RBBISetBuilder::serializeTrie(uint8_t *where) { |
michael@0 | 328 | utrie_serialize(fTrie, |
michael@0 | 329 | where, // Buffer |
michael@0 | 330 | fTrieSize, // Capacity |
michael@0 | 331 | getFoldedRBBIValue, |
michael@0 | 332 | TRUE, // Reduce to 16 bits |
michael@0 | 333 | fStatus); |
michael@0 | 334 | } |
michael@0 | 335 | |
michael@0 | 336 | //------------------------------------------------------------------------ |
michael@0 | 337 | // |
michael@0 | 338 | // addValToSets Add a runtime-mapped input value to each uset from a |
michael@0 | 339 | // list of uset nodes. (val corresponds to a state table column.) |
michael@0 | 340 | // For each of the original Unicode sets - which correspond |
michael@0 | 341 | // directly to uset nodes - a logically equivalent expression |
michael@0 | 342 | // is constructed in terms of the remapped runtime input |
michael@0 | 343 | // symbol set. This function adds one runtime input symbol to |
michael@0 | 344 | // a list of sets. |
michael@0 | 345 | // |
michael@0 | 346 | // The "logically equivalent expression" is the tree for an |
michael@0 | 347 | // or-ing together of all of the symbols that go into the set. |
michael@0 | 348 | // |
michael@0 | 349 | //------------------------------------------------------------------------ |
michael@0 | 350 | void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { |
michael@0 | 351 | int32_t ix; |
michael@0 | 352 | |
michael@0 | 353 | for (ix=0; ix<sets->size(); ix++) { |
michael@0 | 354 | RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); |
michael@0 | 355 | addValToSet(usetNode, val); |
michael@0 | 356 | } |
michael@0 | 357 | } |
michael@0 | 358 | |
michael@0 | 359 | void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { |
michael@0 | 360 | RBBINode *leafNode = new RBBINode(RBBINode::leafChar); |
michael@0 | 361 | if (leafNode == NULL) { |
michael@0 | 362 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 363 | return; |
michael@0 | 364 | } |
michael@0 | 365 | leafNode->fVal = (unsigned short)val; |
michael@0 | 366 | if (usetNode->fLeftChild == NULL) { |
michael@0 | 367 | usetNode->fLeftChild = leafNode; |
michael@0 | 368 | leafNode->fParent = usetNode; |
michael@0 | 369 | } else { |
michael@0 | 370 | // There are already input symbols present for this set. |
michael@0 | 371 | // Set up an OR node, with the previous stuff as the left child |
michael@0 | 372 | // and the new value as the right child. |
michael@0 | 373 | RBBINode *orNode = new RBBINode(RBBINode::opOr); |
michael@0 | 374 | if (orNode == NULL) { |
michael@0 | 375 | *fStatus = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 376 | return; |
michael@0 | 377 | } |
michael@0 | 378 | orNode->fLeftChild = usetNode->fLeftChild; |
michael@0 | 379 | orNode->fRightChild = leafNode; |
michael@0 | 380 | orNode->fLeftChild->fParent = orNode; |
michael@0 | 381 | orNode->fRightChild->fParent = orNode; |
michael@0 | 382 | usetNode->fLeftChild = orNode; |
michael@0 | 383 | orNode->fParent = usetNode; |
michael@0 | 384 | } |
michael@0 | 385 | } |
michael@0 | 386 | |
michael@0 | 387 | |
michael@0 | 388 | //------------------------------------------------------------------------ |
michael@0 | 389 | // |
michael@0 | 390 | // getNumCharCategories |
michael@0 | 391 | // |
michael@0 | 392 | //------------------------------------------------------------------------ |
michael@0 | 393 | int32_t RBBISetBuilder::getNumCharCategories() const { |
michael@0 | 394 | return fGroupCount + 3; |
michael@0 | 395 | } |
michael@0 | 396 | |
michael@0 | 397 | |
michael@0 | 398 | //------------------------------------------------------------------------ |
michael@0 | 399 | // |
michael@0 | 400 | // sawBOF |
michael@0 | 401 | // |
michael@0 | 402 | //------------------------------------------------------------------------ |
michael@0 | 403 | UBool RBBISetBuilder::sawBOF() const { |
michael@0 | 404 | return fSawBOF; |
michael@0 | 405 | } |
michael@0 | 406 | |
michael@0 | 407 | |
michael@0 | 408 | //------------------------------------------------------------------------ |
michael@0 | 409 | // |
michael@0 | 410 | // getFirstChar Given a runtime RBBI character category, find |
michael@0 | 411 | // the first UChar32 that is in the set of chars |
michael@0 | 412 | // in the category. |
michael@0 | 413 | //------------------------------------------------------------------------ |
michael@0 | 414 | UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { |
michael@0 | 415 | RangeDescriptor *rlRange; |
michael@0 | 416 | UChar32 retVal = (UChar32)-1; |
michael@0 | 417 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
michael@0 | 418 | if (rlRange->fNum == category) { |
michael@0 | 419 | retVal = rlRange->fStartChar; |
michael@0 | 420 | break; |
michael@0 | 421 | } |
michael@0 | 422 | } |
michael@0 | 423 | return retVal; |
michael@0 | 424 | } |
michael@0 | 425 | |
michael@0 | 426 | |
michael@0 | 427 | |
michael@0 | 428 | //------------------------------------------------------------------------ |
michael@0 | 429 | // |
michael@0 | 430 | // printRanges A debugging function. |
michael@0 | 431 | // dump out all of the range definitions. |
michael@0 | 432 | // |
michael@0 | 433 | //------------------------------------------------------------------------ |
michael@0 | 434 | #ifdef RBBI_DEBUG |
michael@0 | 435 | void RBBISetBuilder::printRanges() { |
michael@0 | 436 | RangeDescriptor *rlRange; |
michael@0 | 437 | int i; |
michael@0 | 438 | |
michael@0 | 439 | RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); |
michael@0 | 440 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
michael@0 | 441 | RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); |
michael@0 | 442 | |
michael@0 | 443 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { |
michael@0 | 444 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); |
michael@0 | 445 | UnicodeString setName = UNICODE_STRING("anon", 4); |
michael@0 | 446 | RBBINode *setRef = usetNode->fParent; |
michael@0 | 447 | if (setRef != NULL) { |
michael@0 | 448 | RBBINode *varRef = setRef->fParent; |
michael@0 | 449 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { |
michael@0 | 450 | setName = varRef->fText; |
michael@0 | 451 | } |
michael@0 | 452 | } |
michael@0 | 453 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); |
michael@0 | 454 | } |
michael@0 | 455 | RBBIDebugPrintf("\n"); |
michael@0 | 456 | } |
michael@0 | 457 | } |
michael@0 | 458 | #endif |
michael@0 | 459 | |
michael@0 | 460 | |
michael@0 | 461 | //------------------------------------------------------------------------ |
michael@0 | 462 | // |
michael@0 | 463 | // printRangeGroups A debugging function. |
michael@0 | 464 | // dump out all of the range groups. |
michael@0 | 465 | // |
michael@0 | 466 | //------------------------------------------------------------------------ |
michael@0 | 467 | #ifdef RBBI_DEBUG |
michael@0 | 468 | void RBBISetBuilder::printRangeGroups() { |
michael@0 | 469 | RangeDescriptor *rlRange; |
michael@0 | 470 | RangeDescriptor *tRange; |
michael@0 | 471 | int i; |
michael@0 | 472 | int lastPrintedGroupNum = 0; |
michael@0 | 473 | |
michael@0 | 474 | RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); |
michael@0 | 475 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { |
michael@0 | 476 | int groupNum = rlRange->fNum & 0xbfff; |
michael@0 | 477 | if (groupNum > lastPrintedGroupNum) { |
michael@0 | 478 | lastPrintedGroupNum = groupNum; |
michael@0 | 479 | RBBIDebugPrintf("%2i ", groupNum); |
michael@0 | 480 | |
michael@0 | 481 | if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");} |
michael@0 | 482 | |
michael@0 | 483 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { |
michael@0 | 484 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); |
michael@0 | 485 | UnicodeString setName = UNICODE_STRING("anon", 4); |
michael@0 | 486 | RBBINode *setRef = usetNode->fParent; |
michael@0 | 487 | if (setRef != NULL) { |
michael@0 | 488 | RBBINode *varRef = setRef->fParent; |
michael@0 | 489 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { |
michael@0 | 490 | setName = varRef->fText; |
michael@0 | 491 | } |
michael@0 | 492 | } |
michael@0 | 493 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); |
michael@0 | 494 | } |
michael@0 | 495 | |
michael@0 | 496 | i = 0; |
michael@0 | 497 | for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { |
michael@0 | 498 | if (tRange->fNum == rlRange->fNum) { |
michael@0 | 499 | if (i++ % 5 == 0) { |
michael@0 | 500 | RBBIDebugPrintf("\n "); |
michael@0 | 501 | } |
michael@0 | 502 | RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); |
michael@0 | 503 | } |
michael@0 | 504 | } |
michael@0 | 505 | RBBIDebugPrintf("\n"); |
michael@0 | 506 | } |
michael@0 | 507 | } |
michael@0 | 508 | RBBIDebugPrintf("\n"); |
michael@0 | 509 | } |
michael@0 | 510 | #endif |
michael@0 | 511 | |
michael@0 | 512 | |
michael@0 | 513 | //------------------------------------------------------------------------ |
michael@0 | 514 | // |
michael@0 | 515 | // printSets A debugging function. |
michael@0 | 516 | // dump out all of the set definitions. |
michael@0 | 517 | // |
michael@0 | 518 | //------------------------------------------------------------------------ |
michael@0 | 519 | #ifdef RBBI_DEBUG |
michael@0 | 520 | void RBBISetBuilder::printSets() { |
michael@0 | 521 | int i; |
michael@0 | 522 | |
michael@0 | 523 | RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); |
michael@0 | 524 | for (i=0; ; i++) { |
michael@0 | 525 | RBBINode *usetNode; |
michael@0 | 526 | RBBINode *setRef; |
michael@0 | 527 | RBBINode *varRef; |
michael@0 | 528 | UnicodeString setName; |
michael@0 | 529 | |
michael@0 | 530 | usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); |
michael@0 | 531 | if (usetNode == NULL) { |
michael@0 | 532 | break; |
michael@0 | 533 | } |
michael@0 | 534 | |
michael@0 | 535 | RBBIDebugPrintf("%3d ", i); |
michael@0 | 536 | setName = UNICODE_STRING("anonymous", 9); |
michael@0 | 537 | setRef = usetNode->fParent; |
michael@0 | 538 | if (setRef != NULL) { |
michael@0 | 539 | varRef = setRef->fParent; |
michael@0 | 540 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { |
michael@0 | 541 | setName = varRef->fText; |
michael@0 | 542 | } |
michael@0 | 543 | } |
michael@0 | 544 | RBBI_DEBUG_printUnicodeString(setName); |
michael@0 | 545 | RBBIDebugPrintf(" "); |
michael@0 | 546 | RBBI_DEBUG_printUnicodeString(usetNode->fText); |
michael@0 | 547 | RBBIDebugPrintf("\n"); |
michael@0 | 548 | if (usetNode->fLeftChild != NULL) { |
michael@0 | 549 | usetNode->fLeftChild->printTree(TRUE); |
michael@0 | 550 | } |
michael@0 | 551 | } |
michael@0 | 552 | RBBIDebugPrintf("\n"); |
michael@0 | 553 | } |
michael@0 | 554 | #endif |
michael@0 | 555 | |
michael@0 | 556 | |
michael@0 | 557 | |
michael@0 | 558 | //------------------------------------------------------------------------------------- |
michael@0 | 559 | // |
michael@0 | 560 | // RangeDescriptor copy constructor |
michael@0 | 561 | // |
michael@0 | 562 | //------------------------------------------------------------------------------------- |
michael@0 | 563 | |
michael@0 | 564 | RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { |
michael@0 | 565 | int i; |
michael@0 | 566 | |
michael@0 | 567 | this->fStartChar = other.fStartChar; |
michael@0 | 568 | this->fEndChar = other.fEndChar; |
michael@0 | 569 | this->fNum = other.fNum; |
michael@0 | 570 | this->fNext = NULL; |
michael@0 | 571 | UErrorCode oldstatus = status; |
michael@0 | 572 | this->fIncludesSets = new UVector(status); |
michael@0 | 573 | if (U_FAILURE(oldstatus)) { |
michael@0 | 574 | status = oldstatus; |
michael@0 | 575 | } |
michael@0 | 576 | if (U_FAILURE(status)) { |
michael@0 | 577 | return; |
michael@0 | 578 | } |
michael@0 | 579 | /* test for NULL */ |
michael@0 | 580 | if (this->fIncludesSets == 0) { |
michael@0 | 581 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 582 | return; |
michael@0 | 583 | } |
michael@0 | 584 | |
michael@0 | 585 | for (i=0; i<other.fIncludesSets->size(); i++) { |
michael@0 | 586 | this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); |
michael@0 | 587 | } |
michael@0 | 588 | } |
michael@0 | 589 | |
michael@0 | 590 | |
michael@0 | 591 | //------------------------------------------------------------------------------------- |
michael@0 | 592 | // |
michael@0 | 593 | // RangeDesriptor default constructor |
michael@0 | 594 | // |
michael@0 | 595 | //------------------------------------------------------------------------------------- |
michael@0 | 596 | RangeDescriptor::RangeDescriptor(UErrorCode &status) { |
michael@0 | 597 | this->fStartChar = 0; |
michael@0 | 598 | this->fEndChar = 0; |
michael@0 | 599 | this->fNum = 0; |
michael@0 | 600 | this->fNext = NULL; |
michael@0 | 601 | UErrorCode oldstatus = status; |
michael@0 | 602 | this->fIncludesSets = new UVector(status); |
michael@0 | 603 | if (U_FAILURE(oldstatus)) { |
michael@0 | 604 | status = oldstatus; |
michael@0 | 605 | } |
michael@0 | 606 | if (U_FAILURE(status)) { |
michael@0 | 607 | return; |
michael@0 | 608 | } |
michael@0 | 609 | /* test for NULL */ |
michael@0 | 610 | if(this->fIncludesSets == 0) { |
michael@0 | 611 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 612 | return; |
michael@0 | 613 | } |
michael@0 | 614 | |
michael@0 | 615 | } |
michael@0 | 616 | |
michael@0 | 617 | |
michael@0 | 618 | //------------------------------------------------------------------------------------- |
michael@0 | 619 | // |
michael@0 | 620 | // RangeDesriptor Destructor |
michael@0 | 621 | // |
michael@0 | 622 | //------------------------------------------------------------------------------------- |
michael@0 | 623 | RangeDescriptor::~RangeDescriptor() { |
michael@0 | 624 | delete fIncludesSets; |
michael@0 | 625 | fIncludesSets = NULL; |
michael@0 | 626 | } |
michael@0 | 627 | |
michael@0 | 628 | //------------------------------------------------------------------------------------- |
michael@0 | 629 | // |
michael@0 | 630 | // RangeDesriptor::split() |
michael@0 | 631 | // |
michael@0 | 632 | //------------------------------------------------------------------------------------- |
michael@0 | 633 | void RangeDescriptor::split(UChar32 where, UErrorCode &status) { |
michael@0 | 634 | U_ASSERT(where>fStartChar && where<=fEndChar); |
michael@0 | 635 | RangeDescriptor *nr = new RangeDescriptor(*this, status); |
michael@0 | 636 | if(nr == 0) { |
michael@0 | 637 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 638 | return; |
michael@0 | 639 | } |
michael@0 | 640 | if (U_FAILURE(status)) { |
michael@0 | 641 | delete nr; |
michael@0 | 642 | return; |
michael@0 | 643 | } |
michael@0 | 644 | // RangeDescriptor copy constructor copies all fields. |
michael@0 | 645 | // Only need to update those that are different after the split. |
michael@0 | 646 | nr->fStartChar = where; |
michael@0 | 647 | this->fEndChar = where-1; |
michael@0 | 648 | nr->fNext = this->fNext; |
michael@0 | 649 | this->fNext = nr; |
michael@0 | 650 | } |
michael@0 | 651 | |
michael@0 | 652 | |
michael@0 | 653 | //------------------------------------------------------------------------------------- |
michael@0 | 654 | // |
michael@0 | 655 | // RangeDescriptor::setDictionaryFlag |
michael@0 | 656 | // |
michael@0 | 657 | // Character Category Numbers that include characters from |
michael@0 | 658 | // the original Unicode Set named "dictionary" have bit 14 |
michael@0 | 659 | // set to 1. The RBBI runtime engine uses this to trigger |
michael@0 | 660 | // use of the word dictionary. |
michael@0 | 661 | // |
michael@0 | 662 | // This function looks through the Unicode Sets that it |
michael@0 | 663 | // (the range) includes, and sets the bit in fNum when |
michael@0 | 664 | // "dictionary" is among them. |
michael@0 | 665 | // |
michael@0 | 666 | // TODO: a faster way would be to find the set node for |
michael@0 | 667 | // "dictionary" just once, rather than looking it |
michael@0 | 668 | // up by name every time. |
michael@0 | 669 | // |
michael@0 | 670 | //------------------------------------------------------------------------------------- |
michael@0 | 671 | void RangeDescriptor::setDictionaryFlag() { |
michael@0 | 672 | int i; |
michael@0 | 673 | |
michael@0 | 674 | for (i=0; i<this->fIncludesSets->size(); i++) { |
michael@0 | 675 | RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); |
michael@0 | 676 | UnicodeString setName; |
michael@0 | 677 | RBBINode *setRef = usetNode->fParent; |
michael@0 | 678 | if (setRef != NULL) { |
michael@0 | 679 | RBBINode *varRef = setRef->fParent; |
michael@0 | 680 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { |
michael@0 | 681 | setName = varRef->fText; |
michael@0 | 682 | } |
michael@0 | 683 | } |
michael@0 | 684 | if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals. |
michael@0 | 685 | this->fNum |= 0x4000; |
michael@0 | 686 | break; |
michael@0 | 687 | } |
michael@0 | 688 | } |
michael@0 | 689 | } |
michael@0 | 690 | |
michael@0 | 691 | |
michael@0 | 692 | |
michael@0 | 693 | U_NAMESPACE_END |
michael@0 | 694 | |
michael@0 | 695 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |