1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/rbbi.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1891 @@ 1.4 +/* 1.5 +*************************************************************************** 1.6 +* Copyright (C) 1999-2013 International Business Machines Corporation 1.7 +* and others. All rights reserved. 1.8 +*************************************************************************** 1.9 +*/ 1.10 +// 1.11 +// file: rbbi.c Contains the implementation of the rule based break iterator 1.12 +// runtime engine and the API implementation for 1.13 +// class RuleBasedBreakIterator 1.14 +// 1.15 + 1.16 +#include "utypeinfo.h" // for 'typeid' to work 1.17 + 1.18 +#include "unicode/utypes.h" 1.19 + 1.20 +#if !UCONFIG_NO_BREAK_ITERATION 1.21 + 1.22 +#include "unicode/rbbi.h" 1.23 +#include "unicode/schriter.h" 1.24 +#include "unicode/uchriter.h" 1.25 +#include "unicode/udata.h" 1.26 +#include "unicode/uclean.h" 1.27 +#include "rbbidata.h" 1.28 +#include "rbbirb.h" 1.29 +#include "cmemory.h" 1.30 +#include "cstring.h" 1.31 +#include "umutex.h" 1.32 +#include "ucln_cmn.h" 1.33 +#include "brkeng.h" 1.34 + 1.35 +#include "uassert.h" 1.36 +#include "uvector.h" 1.37 + 1.38 +// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 
1.39 +#if U_LOCAL_SERVICE_HOOK 1.40 +#include "localsvc.h" 1.41 +#endif 1.42 + 1.43 +#ifdef RBBI_DEBUG 1.44 +static UBool fTrace = FALSE; 1.45 +#endif 1.46 + 1.47 +U_NAMESPACE_BEGIN 1.48 + 1.49 +// The state number of the starting state 1.50 +#define START_STATE 1 1.51 + 1.52 +// The state-transition value indicating "stop" 1.53 +#define STOP_STATE 0 1.54 + 1.55 + 1.56 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 1.57 + 1.58 + 1.59 +//======================================================================= 1.60 +// constructors 1.61 +//======================================================================= 1.62 + 1.63 +/** 1.64 + * Constructs a RuleBasedBreakIterator that uses the already-created 1.65 + * tables object that is passed in as a parameter. 1.66 + */ 1.67 +RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 1.68 +{ 1.69 + init(); 1.70 + fData = new RBBIDataWrapper(data, status); // status checked in constructor 1.71 + if (U_FAILURE(status)) {return;} 1.72 + if(fData == 0) { 1.73 + status = U_MEMORY_ALLOCATION_ERROR; 1.74 + return; 1.75 + } 1.76 +} 1.77 + 1.78 +/** 1.79 + * Same as above but does not adopt memory 1.80 + */ 1.81 +RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 1.82 +{ 1.83 + init(); 1.84 + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 1.85 + if (U_FAILURE(status)) {return;} 1.86 + if(fData == 0) { 1.87 + status = U_MEMORY_ALLOCATION_ERROR; 1.88 + return; 1.89 + } 1.90 +} 1.91 + 1.92 + 1.93 +// 1.94 +// Construct from precompiled binary rules (tables). This constructor is public API, 1.95 +// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 
1.96 +// 1.97 +RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 1.98 + uint32_t ruleLength, 1.99 + UErrorCode &status) { 1.100 + init(); 1.101 + if (U_FAILURE(status)) { 1.102 + return; 1.103 + } 1.104 + if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { 1.105 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.106 + return; 1.107 + } 1.108 + const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; 1.109 + if (data->fLength > ruleLength) { 1.110 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.111 + return; 1.112 + } 1.113 + fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 1.114 + if (U_FAILURE(status)) {return;} 1.115 + if(fData == 0) { 1.116 + status = U_MEMORY_ALLOCATION_ERROR; 1.117 + return; 1.118 + } 1.119 +} 1.120 + 1.121 + 1.122 +//------------------------------------------------------------------------------- 1.123 +// 1.124 +// Constructor from a UDataMemory handle to precompiled break rules 1.125 +// stored in an ICU data file. 1.126 +// 1.127 +//------------------------------------------------------------------------------- 1.128 +RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 1.129 +{ 1.130 + init(); 1.131 + fData = new RBBIDataWrapper(udm, status); // status checked in constructor 1.132 + if (U_FAILURE(status)) {return;} 1.133 + if(fData == 0) { 1.134 + status = U_MEMORY_ALLOCATION_ERROR; 1.135 + return; 1.136 + } 1.137 +} 1.138 + 1.139 + 1.140 + 1.141 +//------------------------------------------------------------------------------- 1.142 +// 1.143 +// Constructor from a set of rules supplied as a string. 
1.144 +// 1.145 +//------------------------------------------------------------------------------- 1.146 +RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 1.147 + UParseError &parseError, 1.148 + UErrorCode &status) 1.149 +{ 1.150 + init(); 1.151 + if (U_FAILURE(status)) {return;} 1.152 + RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 1.153 + RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 1.154 + // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 1.155 + // creates and returns a complete RBBI. From here, in a constructor, we 1.156 + // can't just return the object created by the builder factory, hence 1.157 + // the assignment of the factory created object to "this". 1.158 + if (U_SUCCESS(status)) { 1.159 + *this = *bi; 1.160 + delete bi; 1.161 + } 1.162 +} 1.163 + 1.164 + 1.165 +//------------------------------------------------------------------------------- 1.166 +// 1.167 +// Default Constructor. Create an empty shell that can be set up later. 1.168 +// Used when creating a RuleBasedBreakIterator from a set 1.169 +// of rules. 1.170 +//------------------------------------------------------------------------------- 1.171 +RuleBasedBreakIterator::RuleBasedBreakIterator() { 1.172 + init(); 1.173 +} 1.174 + 1.175 + 1.176 +//------------------------------------------------------------------------------- 1.177 +// 1.178 +// Copy constructor. Will produce a break iterator with the same behavior, 1.179 +// and which iterates over the same text, as the one passed in. 
1.180 +// 1.181 +//------------------------------------------------------------------------------- 1.182 +RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 1.183 +: BreakIterator(other) 1.184 +{ 1.185 + this->init(); 1.186 + *this = other; 1.187 +} 1.188 + 1.189 + 1.190 +/** 1.191 + * Destructor 1.192 + */ 1.193 +RuleBasedBreakIterator::~RuleBasedBreakIterator() { 1.194 + if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 1.195 + // fCharIter was adopted from the outside. 1.196 + delete fCharIter; 1.197 + } 1.198 + fCharIter = NULL; 1.199 + delete fSCharIter; 1.200 + fCharIter = NULL; 1.201 + delete fDCharIter; 1.202 + fDCharIter = NULL; 1.203 + 1.204 + utext_close(fText); 1.205 + 1.206 + if (fData != NULL) { 1.207 + fData->removeReference(); 1.208 + fData = NULL; 1.209 + } 1.210 + if (fCachedBreakPositions) { 1.211 + uprv_free(fCachedBreakPositions); 1.212 + fCachedBreakPositions = NULL; 1.213 + } 1.214 + if (fLanguageBreakEngines) { 1.215 + delete fLanguageBreakEngines; 1.216 + fLanguageBreakEngines = NULL; 1.217 + } 1.218 + if (fUnhandledBreakEngine) { 1.219 + delete fUnhandledBreakEngine; 1.220 + fUnhandledBreakEngine = NULL; 1.221 + } 1.222 +} 1.223 + 1.224 +/** 1.225 + * Assignment operator. Sets this iterator to have the same behavior, 1.226 + * and iterate over the same text, as the one passed in. 
1.227 + */ 1.228 +RuleBasedBreakIterator& 1.229 +RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 1.230 + if (this == &that) { 1.231 + return *this; 1.232 + } 1.233 + reset(); // Delete break cache information 1.234 + fBreakType = that.fBreakType; 1.235 + if (fLanguageBreakEngines != NULL) { 1.236 + delete fLanguageBreakEngines; 1.237 + fLanguageBreakEngines = NULL; // Just rebuild for now 1.238 + } 1.239 + // TODO: clone fLanguageBreakEngines from "that" 1.240 + UErrorCode status = U_ZERO_ERROR; 1.241 + fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 1.242 + 1.243 + if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 1.244 + delete fCharIter; 1.245 + } 1.246 + fCharIter = NULL; 1.247 + 1.248 + if (that.fCharIter != NULL ) { 1.249 + // This is a little bit tricky - it will intially appear that 1.250 + // this->fCharIter is adopted, even if that->fCharIter was 1.251 + // not adopted. That's ok. 1.252 + fCharIter = that.fCharIter->clone(); 1.253 + } 1.254 + 1.255 + if (fData != NULL) { 1.256 + fData->removeReference(); 1.257 + fData = NULL; 1.258 + } 1.259 + if (that.fData != NULL) { 1.260 + fData = that.fData->addReference(); 1.261 + } 1.262 + 1.263 + return *this; 1.264 +} 1.265 + 1.266 + 1.267 + 1.268 +//----------------------------------------------------------------------------- 1.269 +// 1.270 +// init() Shared initialization routine. Used by all the constructors. 1.271 +// Initializes all fields, leaving the object in a consistent state. 
1.272 +// 1.273 +//----------------------------------------------------------------------------- 1.274 +void RuleBasedBreakIterator::init() { 1.275 + UErrorCode status = U_ZERO_ERROR; 1.276 + fText = utext_openUChars(NULL, NULL, 0, &status); 1.277 + fCharIter = NULL; 1.278 + fSCharIter = NULL; 1.279 + fDCharIter = NULL; 1.280 + fData = NULL; 1.281 + fLastRuleStatusIndex = 0; 1.282 + fLastStatusIndexValid = TRUE; 1.283 + fDictionaryCharCount = 0; 1.284 + fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 1.285 + // dictionary behavior for Break Iterators that are 1.286 + // built from rules. Even better would be the ability to 1.287 + // declare the type in the rules. 1.288 + 1.289 + fCachedBreakPositions = NULL; 1.290 + fLanguageBreakEngines = NULL; 1.291 + fUnhandledBreakEngine = NULL; 1.292 + fNumCachedBreakPositions = 0; 1.293 + fPositionInCache = 0; 1.294 + 1.295 +#ifdef RBBI_DEBUG 1.296 + static UBool debugInitDone = FALSE; 1.297 + if (debugInitDone == FALSE) { 1.298 + char *debugEnv = getenv("U_RBBIDEBUG"); 1.299 + if (debugEnv && uprv_strstr(debugEnv, "trace")) { 1.300 + fTrace = TRUE; 1.301 + } 1.302 + debugInitDone = TRUE; 1.303 + } 1.304 +#endif 1.305 +} 1.306 + 1.307 + 1.308 + 1.309 +//----------------------------------------------------------------------------- 1.310 +// 1.311 +// clone - Returns a newly-constructed RuleBasedBreakIterator with the same 1.312 +// behavior, and iterating over the same text, as this one. 1.313 +// Virtual function: does the right thing with subclasses. 1.314 +// 1.315 +//----------------------------------------------------------------------------- 1.316 +BreakIterator* 1.317 +RuleBasedBreakIterator::clone(void) const { 1.318 + return new RuleBasedBreakIterator(*this); 1.319 +} 1.320 + 1.321 +/** 1.322 + * Equality operator. Returns TRUE if both BreakIterators are of the 1.323 + * same class, have the same behavior, and iterate over the same text. 
1.324 + */ 1.325 +UBool 1.326 +RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 1.327 + if (typeid(*this) != typeid(that)) { 1.328 + return FALSE; 1.329 + } 1.330 + 1.331 + const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 1.332 + 1.333 + if (!utext_equals(fText, that2.fText)) { 1.334 + // The two break iterators are operating on different text, 1.335 + // or have a different interation position. 1.336 + return FALSE; 1.337 + }; 1.338 + 1.339 + // TODO: need a check for when in a dictionary region at different offsets. 1.340 + 1.341 + if (that2.fData == fData || 1.342 + (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 1.343 + // The two break iterators are using the same rules. 1.344 + return TRUE; 1.345 + } 1.346 + return FALSE; 1.347 +} 1.348 + 1.349 +/** 1.350 + * Compute a hash code for this BreakIterator 1.351 + * @return A hash code 1.352 + */ 1.353 +int32_t 1.354 +RuleBasedBreakIterator::hashCode(void) const { 1.355 + int32_t hash = 0; 1.356 + if (fData != NULL) { 1.357 + hash = fData->hashCode(); 1.358 + } 1.359 + return hash; 1.360 +} 1.361 + 1.362 + 1.363 +void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 1.364 + if (U_FAILURE(status)) { 1.365 + return; 1.366 + } 1.367 + reset(); 1.368 + fText = utext_clone(fText, ut, FALSE, TRUE, &status); 1.369 + 1.370 + // Set up a dummy CharacterIterator to be returned if anyone 1.371 + // calls getText(). With input from UText, there is no reasonable 1.372 + // way to return a characterIterator over the actual input text. 1.373 + // Return one over an empty string instead - this is the closest 1.374 + // we can come to signaling a failure. 
1.375 + // (GetText() is obsolete, this failure is sort of OK) 1.376 + if (fDCharIter == NULL) { 1.377 + static const UChar c = 0; 1.378 + fDCharIter = new UCharCharacterIterator(&c, 0); 1.379 + if (fDCharIter == NULL) { 1.380 + status = U_MEMORY_ALLOCATION_ERROR; 1.381 + return; 1.382 + } 1.383 + } 1.384 + 1.385 + if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 1.386 + // existing fCharIter was adopted from the outside. Delete it now. 1.387 + delete fCharIter; 1.388 + } 1.389 + fCharIter = fDCharIter; 1.390 + 1.391 + this->first(); 1.392 +} 1.393 + 1.394 + 1.395 +UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 1.396 + UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 1.397 + return result; 1.398 +} 1.399 + 1.400 + 1.401 + 1.402 +/** 1.403 + * Returns the description used to create this iterator 1.404 + */ 1.405 +const UnicodeString& 1.406 +RuleBasedBreakIterator::getRules() const { 1.407 + if (fData != NULL) { 1.408 + return fData->getRuleSourceString(); 1.409 + } else { 1.410 + static const UnicodeString *s; 1.411 + if (s == NULL) { 1.412 + // TODO: something more elegant here. 1.413 + // perhaps API should return the string by value. 1.414 + // Note: thread unsafe init & leak are semi-ok, better than 1.415 + // what was before. Sould be cleaned up, though. 1.416 + s = new UnicodeString; 1.417 + } 1.418 + return *s; 1.419 + } 1.420 +} 1.421 + 1.422 +//======================================================================= 1.423 +// BreakIterator overrides 1.424 +//======================================================================= 1.425 + 1.426 +/** 1.427 + * Return a CharacterIterator over the text being analyzed. 1.428 + */ 1.429 +CharacterIterator& 1.430 +RuleBasedBreakIterator::getText() const { 1.431 + return *fCharIter; 1.432 +} 1.433 + 1.434 +/** 1.435 + * Set the iterator to analyze a new piece of text. This function resets 1.436 + * the current iteration position to the beginning of the text. 
1.437 + * @param newText An iterator over the text to analyze. 1.438 + */ 1.439 +void 1.440 +RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 1.441 + // If we are holding a CharacterIterator adopted from a 1.442 + // previous call to this function, delete it now. 1.443 + if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 1.444 + delete fCharIter; 1.445 + } 1.446 + 1.447 + fCharIter = newText; 1.448 + UErrorCode status = U_ZERO_ERROR; 1.449 + reset(); 1.450 + if (newText==NULL || newText->startIndex() != 0) { 1.451 + // startIndex !=0 wants to be an error, but there's no way to report it. 1.452 + // Make the iterator text be an empty string. 1.453 + fText = utext_openUChars(fText, NULL, 0, &status); 1.454 + } else { 1.455 + fText = utext_openCharacterIterator(fText, newText, &status); 1.456 + } 1.457 + this->first(); 1.458 +} 1.459 + 1.460 +/** 1.461 + * Set the iterator to analyze a new piece of text. This function resets 1.462 + * the current iteration position to the beginning of the text. 1.463 + * @param newText An iterator over the text to analyze. 1.464 + */ 1.465 +void 1.466 +RuleBasedBreakIterator::setText(const UnicodeString& newText) { 1.467 + UErrorCode status = U_ZERO_ERROR; 1.468 + reset(); 1.469 + fText = utext_openConstUnicodeString(fText, &newText, &status); 1.470 + 1.471 + // Set up a character iterator on the string. 1.472 + // Needed in case someone calls getText(). 1.473 + // Can not, unfortunately, do this lazily on the (probably never) 1.474 + // call to getText(), because getText is const. 1.475 + if (fSCharIter == NULL) { 1.476 + fSCharIter = new StringCharacterIterator(newText); 1.477 + } else { 1.478 + fSCharIter->setText(newText); 1.479 + } 1.480 + 1.481 + if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 1.482 + // old fCharIter was adopted from the outside. Delete it. 
1.483 + delete fCharIter; 1.484 + } 1.485 + fCharIter = fSCharIter; 1.486 + 1.487 + this->first(); 1.488 +} 1.489 + 1.490 + 1.491 +/** 1.492 + * Provide a new UText for the input text. Must reference text with contents identical 1.493 + * to the original. 1.494 + * Intended for use with text data originating in Java (garbage collected) environments 1.495 + * where the data may be moved in memory at arbitrary times. 1.496 + */ 1.497 +RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { 1.498 + if (U_FAILURE(status)) { 1.499 + return *this; 1.500 + } 1.501 + if (input == NULL) { 1.502 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.503 + return *this; 1.504 + } 1.505 + int64_t pos = utext_getNativeIndex(fText); 1.506 + // Shallow read-only clone of the new UText into the existing input UText 1.507 + fText = utext_clone(fText, input, FALSE, TRUE, &status); 1.508 + if (U_FAILURE(status)) { 1.509 + return *this; 1.510 + } 1.511 + utext_setNativeIndex(fText, pos); 1.512 + if (utext_getNativeIndex(fText) != pos) { 1.513 + // Sanity check. The new input utext is supposed to have the exact same 1.514 + // contents as the old. If we can't set to the same position, it doesn't. 1.515 + // The contents underlying the old utext might be invalid at this point, 1.516 + // so it's not safe to check directly. 1.517 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.518 + } 1.519 + return *this; 1.520 +} 1.521 + 1.522 + 1.523 +/** 1.524 + * Sets the current iteration position to the beginning of the text. 1.525 + * @return The offset of the beginning of the text. 1.526 + */ 1.527 +int32_t RuleBasedBreakIterator::first(void) { 1.528 + reset(); 1.529 + fLastRuleStatusIndex = 0; 1.530 + fLastStatusIndexValid = TRUE; 1.531 + //if (fText == NULL) 1.532 + // return BreakIterator::DONE; 1.533 + 1.534 + utext_setNativeIndex(fText, 0); 1.535 + return 0; 1.536 +} 1.537 + 1.538 +/** 1.539 + * Sets the current iteration position to the end of the text. 
1.540 + * @return The text's past-the-end offset. 1.541 + */ 1.542 +int32_t RuleBasedBreakIterator::last(void) { 1.543 + reset(); 1.544 + if (fText == NULL) { 1.545 + fLastRuleStatusIndex = 0; 1.546 + fLastStatusIndexValid = TRUE; 1.547 + return BreakIterator::DONE; 1.548 + } 1.549 + 1.550 + fLastStatusIndexValid = FALSE; 1.551 + int32_t pos = (int32_t)utext_nativeLength(fText); 1.552 + utext_setNativeIndex(fText, pos); 1.553 + return pos; 1.554 +} 1.555 + 1.556 +/** 1.557 + * Advances the iterator either forward or backward the specified number of steps. 1.558 + * Negative values move backward, and positive values move forward. This is 1.559 + * equivalent to repeatedly calling next() or previous(). 1.560 + * @param n The number of steps to move. The sign indicates the direction 1.561 + * (negative is backwards, and positive is forwards). 1.562 + * @return The character offset of the boundary position n boundaries away from 1.563 + * the current one. 1.564 + */ 1.565 +int32_t RuleBasedBreakIterator::next(int32_t n) { 1.566 + int32_t result = current(); 1.567 + while (n > 0) { 1.568 + result = next(); 1.569 + --n; 1.570 + } 1.571 + while (n < 0) { 1.572 + result = previous(); 1.573 + ++n; 1.574 + } 1.575 + return result; 1.576 +} 1.577 + 1.578 +/** 1.579 + * Advances the iterator to the next boundary position. 1.580 + * @return The position of the first boundary after this one. 
*/
int32_t RuleBasedBreakIterator::next(void) {
    // if we have cached break positions and we're still in the range
    // covered by them, just move one step forward in the cache
    if (fCachedBreakPositions != NULL) {
        if (fPositionInCache < fNumCachedBreakPositions - 1) {
            ++fPositionInCache;
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
        }
        else {
            // Already on the last cached position: drop the cache and fall
            // through to the rule state machine.
            reset();
        }
    }

    // Run the forward rules from the current position.  The starting
    // position is remembered because checkDictionary() needs both ends
    // of the range that was just traversed.
    int32_t startPos = current();
    int32_t result = handleNext(fData->fForwardTable);
    if (fDictionaryCharCount > 0) {
        result = checkDictionary(startPos, result, FALSE);
    }
    return result;
}

/**
 * Advances the iterator backwards, to the last boundary preceding this one.
 * @return The position of the last boundary position preceding this one,
 *         or BreakIterator::DONE if there is no text or the iterator is
 *         already at offset 0.
 */
int32_t RuleBasedBreakIterator::previous(void) {
    int32_t result;
    int32_t startPos;

    // if we have cached break positions and we're still in the range
    // covered by them, just move one step backward in the cache
    if (fCachedBreakPositions != NULL) {
        if (fPositionInCache > 0) {
            --fPositionInCache;
            // If we're at the beginning of the cache, need to reevaluate the
            // rule status
            if (fPositionInCache <= 0) {
                fLastStatusIndexValid = FALSE;
            }
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
        }
        else {
            // Already on the first cached position: drop the cache and fall
            // through to the rule state machine.
            reset();
        }
    }

    // if we're already sitting at the beginning of the text, return DONE
    // (note: startPos is assigned inside the condition)
    if (fText == NULL || (startPos = current()) == 0) {
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = TRUE;
        return BreakIterator::DONE;
    }

    // Rules that supply at least one safe table: the reverse table can be
    // run directly from the current position.
    if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
        result = handlePrevious(fData->fReverseTable);
        if (fDictionaryCharCount > 0) {
            result = checkDictionary(result, startPos, TRUE);
        }
        return result;
    }

    // old rule syntax
    // set things up.  handlePrevious() will back us up to some valid
    // break position before the current position (we back our internal
    // iterator up one step to prevent handlePrevious() from returning
    // the current position), but not necessarily the last one before
    // where we started

    int32_t start = current();

    (void)UTEXT_PREVIOUS32(fText);
    int32_t lastResult    = handlePrevious(fData->fReverseTable);
    if (lastResult == UBRK_DONE) {
        // No earlier break found: restart from the beginning of the text.
        lastResult = 0;
        utext_setNativeIndex(fText, 0);
    }
    result = lastResult;
    int32_t lastTag       = 0;
    UBool   breakTagValid = FALSE;

    // iterate forward from the known break position until we pass our
    // starting point.  The last break position before the starting
    // point is our return value

    for (;;) {
        result         = next();
        if (result == BreakIterator::DONE || result >= start) {
            break;
        }
        lastResult     = result;
        lastTag        = fLastRuleStatusIndex;
        breakTagValid  = TRUE;
    }

    // fLastBreakTag wants to have the value for section of text preceding
    // the result position that we are to return (in lastResult.)  If
    // the backwards rules overshot and the above loop had to do two or more
    // next()s to move up to the desired return position, we will have a valid
    // tag value. But, if handlePrevious() took us to exactly the correct result position,
    // we won't have a tag value for that position, which is only set by handleNext().

    // set the current iteration position to be the last break position
    // before where we started, and then return that value
    utext_setNativeIndex(fText, lastResult);
    fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    fLastStatusIndexValid = breakTagValid;

    // No need to check the dictionary; it will have been handled by
    // next()

    return lastResult;
}

/**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
 * @param offset The position from which to begin searching for a break position.
 * @return The position of the first break after the specified position.
 *         An offset below 0 yields first(); an offset at or beyond the end
 *         of the text yields the result of last() followed by next().
 */
int32_t RuleBasedBreakIterator::following(int32_t offset) {
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    // TODO: could use binary search
    // TODO: what if offset is outside range, but break is not?
    if (fCachedBreakPositions != NULL) {
        if (offset >= fCachedBreakPositions[0]
                && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
            fPositionInCache = 0;
            // We are guaranteed not to leave the array due to range test above
            while (offset >= fCachedBreakPositions[fPositionInCache]) {
                ++fPositionInCache;
            }
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
        }
        else {
            reset();
        }
    }

    // if the offset passed in is already past the end of the text,
    // move to the end and return whatever next() reports from there;
    // if it's before the beginning, return the text's starting offset
    fLastRuleStatusIndex  = 0;
    fLastStatusIndexValid = TRUE;
    if (fText == NULL || offset >= utext_nativeLength(fText)) {
        last();
        return next();
    }
    else if (offset < 0) {
        return first();
    }

    // otherwise, set our internal iteration position (temporarily)
    // to the position passed in.  If this is the _beginning_ position,
    // then we can just use next() to get our return value

    // NOTE: the two safe-table branches below each declare their own local
    // "result", shadowing this one; this variable is only used by the
    // old-rule-syntax path at the end of the function.
    int32_t result = 0;

    if (fData->fSafeRevTable != NULL) {
        // new rule syntax
        utext_setNativeIndex(fText, offset);
        // move forward one codepoint to prepare for moving back to a
        // safe point.
        // this handles offset being in the middle of a supplementary character
        (void)UTEXT_NEXT32(fText);
        // handlePrevious will move most of the time to < 1 boundary away
        handlePrevious(fData->fSafeRevTable);
        int32_t result = next();
        while (result <= offset) {
            result = next();
        }
        return result;
    }
    if (fData->fSafeFwdTable != NULL) {
        // backup plan if reverse safe table is not available
        utext_setNativeIndex(fText, offset);
        (void)UTEXT_PREVIOUS32(fText);
        // handle next will give result >= offset
        handleNext(fData->fSafeFwdTable);
        // previous will give result 0 or 1 boundary away from offset,
        // most of the time
        // we have to keep stepping back until we reach a boundary at or
        // before offset; the boundary seen just before that is the answer
        int32_t oldresult = previous();
        while (oldresult > offset) {
            int32_t result = previous();
            if (result <= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        int32_t result = next();
        if (result <= offset) {
            return next();
        }
        return result;
    }
    // otherwise, we have to sync up first.  Use handlePrevious() to back
    // up to a known break position before the specified position (if
    // we can determine that the specified position is a break position,
    // we don't back up at all).  This may or may not be the last break
    // position at or before our starting position.  Advance forward
    // from here until we've passed the starting position.  The position
    // we stop on will be the first break position after the specified one.
    // old rule syntax

    utext_setNativeIndex(fText, offset);
    if (offset==0 ||
        (offset==1  && utext_getNativeIndex(fText)==0)) {
        return next();
    }
    result = previous();

    while (result != BreakIterator::DONE && result <= offset) {
        result = next();
    }

    return result;
}

/**
 * Sets the iterator to refer to the last boundary position before the
 * specified position.
 * @param offset The position to begin searching for a break from.
 * @return The position of the last boundary before the starting position.
 *         An offset below 0 yields first(); an offset beyond the end of
 *         the text yields last().
 */
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    if (fCachedBreakPositions != NULL) {
        // TODO: binary search?
        // TODO: What if offset is outside range, but break is not?
        if (offset > fCachedBreakPositions[0]
                && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
            fPositionInCache = 0;
            // Find the first cached position that is >= offset, then step
            // back one entry to get the last one strictly before offset.
            while (fPositionInCache < fNumCachedBreakPositions
                   && offset > fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            --fPositionInCache;
            // If we're at the beginning of the cache, need to reevaluate the
            // rule status
            if (fPositionInCache <= 0) {
                fLastStatusIndexValid = FALSE;
            }
            utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
            return fCachedBreakPositions[fPositionInCache];
        }
        else {
            reset();
        }
    }

    // if the offset passed in is already past the end of the text,
    // move to the end of the text and return that position (last());
    // if it's before the beginning, return the text's starting offset
    if (fText == NULL || offset > utext_nativeLength(fText)) {
        // return BreakIterator::DONE;
        return last();
    }
    else if (offset < 0) {
        return first();
    }

    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation

    if (fData->fSafeFwdTable != NULL) {
        // new rule syntax
        utext_setNativeIndex(fText, offset);
        int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        if (newOffset != offset) {
            // Will come here if specified offset was not a code point boundary AND
            //   the underlying implementation is using UText, which snaps any non-code-point-boundary
            //   indices to the containing code point.
            // For BreakIterator::preceding only, these non-code-point indices need to be moved
            //   up to refer to the following codepoint.
            (void)UTEXT_NEXT32(fText);
            offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        }

        // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
        //        rather than adjusting the position unconditionally?
        //        (Change would interact with safe rules.)
        // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
        //        affects only preceding(), seems cleaner, but is slightly different.
        (void)UTEXT_PREVIOUS32(fText);
        handleNext(fData->fSafeFwdTable);
        int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        // Walk backwards until we are strictly before offset.
        while (result >= offset) {
            result = previous();
        }
        return result;
    }
    if (fData->fSafeRevTable != NULL) {
        // backup plan if forward safe table is not available
        //  TODO:  check whether this path can be discarded
        //         It's probably OK to say that rules must supply both safe tables
        //            if they use safe tables at all.  We have certainly never described
        //            to anyone how to work with just one safe table.
        utext_setNativeIndex(fText, offset);
        (void)UTEXT_NEXT32(fText);

        // handle previous will give result <= offset
        handlePrevious(fData->fSafeRevTable);

        // next will give result 0 or 1 boundary away from offset,
        // most of the time
        // we have to keep stepping forward until we reach a boundary at or
        // after offset; the boundary seen just before that is the answer
        int32_t oldresult = next();
        while (oldresult < offset) {
            int32_t result = next();
            if (result >= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        int32_t result = previous();
        if (result >= offset) {
            return previous();
        }
        return result;
    }

    // old rule syntax
    utext_setNativeIndex(fText, offset);
    return previous();
}

/**
 * Returns true if the specified position is a boundary position.  As a side
 * effect, leaves the iterator pointing to the first boundary position at
 * or after "offset".
 * @param offset the offset to check.
 * @return True if "offset" is a boundary position.
 */
UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    // the beginning index of the iterator is always a boundary position by definition
    if (offset == 0) {
        first();       // For side effects on current position, tag values.
        return TRUE;
    }

    // so is the end of the text
    if (offset == (int32_t)utext_nativeLength(fText)) {
        last();       // For side effects on current position, tag values.
        return TRUE;
    }

    // out-of-range indexes are never boundary positions
    if (offset < 0) {
        first();       // For side effects on current position, tag values.
        return FALSE;
    }

    if (offset > utext_nativeLength(fText)) {
        last();        // For side effects on current position, tag values.
        return FALSE;
    }

    // otherwise, we can use following() on the position before the specified
    // one and return true if the position we get back is the one the user
    // specified
    utext_previous32From(fText, offset);
    int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    UBool    result  = following(backOne) == offset;
    return result;
}

/**
 * Returns the current iteration position.
 * @return The current iteration position (the native index of the UText).
 */
int32_t RuleBasedBreakIterator::current(void) const {
    int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    return pos;
}

//=======================================================================
// implementation
//=======================================================================

//
// RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
//                 of user text.  A variable with this enum type keeps track of where we
//                 are.  The state machine only fetches user input while in the RUN mode.
//
enum RBBIRunMode {
    RBBI_START,     // state machine processing is before first char of input
    RBBI_RUN,       // state machine processing is in the user text
    RBBI_END        // state machine processing is after end of user text.
};


//-----------------------------------------------------------------------------------
//
//  handleNext(stateTable)
//     This method is the actual implementation of the rbbi next() method.
//     This method initializes the state machine to state 1
//     and advances through the text character by character until we reach the end
//     of the text or the state machine transitions to state 0.  We update our return
//     value every time the state machine passes through an accepting state.
1.989 +// 1.990 +//----------------------------------------------------------------------------------- 1.991 +int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 1.992 + int32_t state; 1.993 + uint16_t category = 0; 1.994 + RBBIRunMode mode; 1.995 + 1.996 + RBBIStateTableRow *row; 1.997 + UChar32 c; 1.998 + int32_t lookaheadStatus = 0; 1.999 + int32_t lookaheadTagIdx = 0; 1.1000 + int32_t result = 0; 1.1001 + int32_t initialPosition = 0; 1.1002 + int32_t lookaheadResult = 0; 1.1003 + UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1.1004 + const char *tableData = statetable->fTableData; 1.1005 + uint32_t tableRowLen = statetable->fRowLen; 1.1006 + 1.1007 + #ifdef RBBI_DEBUG 1.1008 + if (fTrace) { 1.1009 + RBBIDebugPuts("Handle Next pos char state category"); 1.1010 + } 1.1011 + #endif 1.1012 + 1.1013 + // No matter what, handleNext alway correctly sets the break tag value. 1.1014 + fLastStatusIndexValid = TRUE; 1.1015 + fLastRuleStatusIndex = 0; 1.1016 + 1.1017 + // if we're already at the end of the text, return DONE. 1.1018 + initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1019 + result = initialPosition; 1.1020 + c = UTEXT_NEXT32(fText); 1.1021 + if (fData == NULL || c==U_SENTINEL) { 1.1022 + return BreakIterator::DONE; 1.1023 + } 1.1024 + 1.1025 + // Set the initial state for the state machine 1.1026 + state = START_STATE; 1.1027 + row = (RBBIStateTableRow *) 1.1028 + //(statetable->fTableData + (statetable->fRowLen * state)); 1.1029 + (tableData + tableRowLen * state); 1.1030 + 1.1031 + 1.1032 + mode = RBBI_RUN; 1.1033 + if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1.1034 + category = 2; 1.1035 + mode = RBBI_START; 1.1036 + } 1.1037 + 1.1038 + 1.1039 + // loop until we reach the end of the text or transition to state 0 1.1040 + // 1.1041 + for (;;) { 1.1042 + if (c == U_SENTINEL) { 1.1043 + // Reached end of input string. 
1.1044 + if (mode == RBBI_END) { 1.1045 + // We have already run the loop one last time with the 1.1046 + // character set to the psueudo {eof} value. Now it is time 1.1047 + // to unconditionally bail out. 1.1048 + if (lookaheadResult > result) { 1.1049 + // We ran off the end of the string with a pending look-ahead match. 1.1050 + // Treat this as if the look-ahead condition had been met, and return 1.1051 + // the match at the / position from the look-ahead rule. 1.1052 + result = lookaheadResult; 1.1053 + fLastRuleStatusIndex = lookaheadTagIdx; 1.1054 + lookaheadStatus = 0; 1.1055 + } 1.1056 + break; 1.1057 + } 1.1058 + // Run the loop one last time with the fake end-of-input character category. 1.1059 + mode = RBBI_END; 1.1060 + category = 1; 1.1061 + } 1.1062 + 1.1063 + // 1.1064 + // Get the char category. An incoming category of 1 or 2 means that 1.1065 + // we are preset for doing the beginning or end of input, and 1.1066 + // that we shouldn't get a category from an actual text input character. 1.1067 + // 1.1068 + if (mode == RBBI_RUN) { 1.1069 + // look up the current character's character category, which tells us 1.1070 + // which column in the state table to look at. 1.1071 + // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1.1072 + // not the size of the character going in, which is a UChar32. 1.1073 + // 1.1074 + UTRIE_GET16(&fData->fTrie, c, category); 1.1075 + 1.1076 + // Check the dictionary bit in the character's category. 1.1077 + // Counter is only used by dictionary based iterators (subclasses). 1.1078 + // Chars that need to be handled by a dictionary have a flag bit set 1.1079 + // in their category values. 1.1080 + // 1.1081 + if ((category & 0x4000) != 0) { 1.1082 + fDictionaryCharCount++; 1.1083 + // And off the dictionary flag bit. 
1.1084 + category &= ~0x4000; 1.1085 + } 1.1086 + } 1.1087 + 1.1088 + #ifdef RBBI_DEBUG 1.1089 + if (fTrace) { 1.1090 + RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 1.1091 + if (0x20<=c && c<0x7f) { 1.1092 + RBBIDebugPrintf("\"%c\" ", c); 1.1093 + } else { 1.1094 + RBBIDebugPrintf("%5x ", c); 1.1095 + } 1.1096 + RBBIDebugPrintf("%3d %3d\n", state, category); 1.1097 + } 1.1098 + #endif 1.1099 + 1.1100 + // State Transition - move machine to its next state 1.1101 + // 1.1102 + 1.1103 + // Note: fNextState is defined as uint16_t[2], but we are casting 1.1104 + // a generated RBBI table to RBBIStateTableRow and some tables 1.1105 + // actually have more than 2 categories. 1.1106 + U_ASSERT(category<fData->fHeader->fCatCount); 1.1107 + state = row->fNextState[category]; /*Not accessing beyond memory*/ 1.1108 + row = (RBBIStateTableRow *) 1.1109 + // (statetable->fTableData + (statetable->fRowLen * state)); 1.1110 + (tableData + tableRowLen * state); 1.1111 + 1.1112 + 1.1113 + if (row->fAccepting == -1) { 1.1114 + // Match found, common case. 1.1115 + if (mode != RBBI_START) { 1.1116 + result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1117 + } 1.1118 + fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1.1119 + } 1.1120 + 1.1121 + if (row->fLookAhead != 0) { 1.1122 + if (lookaheadStatus != 0 1.1123 + && row->fAccepting == lookaheadStatus) { 1.1124 + // Lookahead match is completed. 1.1125 + result = lookaheadResult; 1.1126 + fLastRuleStatusIndex = lookaheadTagIdx; 1.1127 + lookaheadStatus = 0; 1.1128 + // TODO: make a standalone hard break in a rule work. 1.1129 + if (lookAheadHardBreak) { 1.1130 + UTEXT_SETNATIVEINDEX(fText, result); 1.1131 + return result; 1.1132 + } 1.1133 + // Look-ahead completed, but other rules may match further. Continue on 1.1134 + // TODO: junk this feature? I don't think it's used anywhwere. 
1.1135 + goto continueOn; 1.1136 + } 1.1137 + 1.1138 + int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1139 + lookaheadResult = r; 1.1140 + lookaheadStatus = row->fLookAhead; 1.1141 + lookaheadTagIdx = row->fTagIdx; 1.1142 + goto continueOn; 1.1143 + } 1.1144 + 1.1145 + 1.1146 + if (row->fAccepting != 0) { 1.1147 + // Because this is an accepting state, any in-progress look-ahead match 1.1148 + // is no longer relavant. Clear out the pending lookahead status. 1.1149 + lookaheadStatus = 0; // clear out any pending look-ahead match. 1.1150 + } 1.1151 + 1.1152 +continueOn: 1.1153 + if (state == STOP_STATE) { 1.1154 + // This is the normal exit from the lookup state machine. 1.1155 + // We have advanced through the string until it is certain that no 1.1156 + // longer match is possible, no matter what characters follow. 1.1157 + break; 1.1158 + } 1.1159 + 1.1160 + // Advance to the next character. 1.1161 + // If this is a beginning-of-input loop iteration, don't advance 1.1162 + // the input position. The next iteration will be processing the 1.1163 + // first real input character. 1.1164 + if (mode == RBBI_RUN) { 1.1165 + c = UTEXT_NEXT32(fText); 1.1166 + } else { 1.1167 + if (mode == RBBI_START) { 1.1168 + mode = RBBI_RUN; 1.1169 + } 1.1170 + } 1.1171 + 1.1172 + 1.1173 + } 1.1174 + 1.1175 + // The state machine is done. Check whether it found a match... 1.1176 + 1.1177 + // If the iterator failed to advance in the match engine, force it ahead by one. 1.1178 + // (This really indicates a defect in the break rules. They should always match 1.1179 + // at least one character.) 1.1180 + if (result == initialPosition) { 1.1181 + UTEXT_SETNATIVEINDEX(fText, initialPosition); 1.1182 + UTEXT_NEXT32(fText); 1.1183 + result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1184 + } 1.1185 + 1.1186 + // Leave the iterator at our result position. 
1.1187 + UTEXT_SETNATIVEINDEX(fText, result); 1.1188 + #ifdef RBBI_DEBUG 1.1189 + if (fTrace) { 1.1190 + RBBIDebugPrintf("result = %d\n\n", result); 1.1191 + } 1.1192 + #endif 1.1193 + return result; 1.1194 +} 1.1195 + 1.1196 + 1.1197 + 1.1198 +//----------------------------------------------------------------------------------- 1.1199 +// 1.1200 +// handlePrevious() 1.1201 +// 1.1202 +// Iterate backwards, according to the logic of the reverse rules. 1.1203 +// This version handles the exact style backwards rules. 1.1204 +// 1.1205 +// The logic of this function is very similar to handleNext(), above. 1.1206 +// 1.1207 +//----------------------------------------------------------------------------------- 1.1208 +int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1.1209 + int32_t state; 1.1210 + uint16_t category = 0; 1.1211 + RBBIRunMode mode; 1.1212 + RBBIStateTableRow *row; 1.1213 + UChar32 c; 1.1214 + int32_t lookaheadStatus = 0; 1.1215 + int32_t result = 0; 1.1216 + int32_t initialPosition = 0; 1.1217 + int32_t lookaheadResult = 0; 1.1218 + UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1.1219 + 1.1220 + #ifdef RBBI_DEBUG 1.1221 + if (fTrace) { 1.1222 + RBBIDebugPuts("Handle Previous pos char state category"); 1.1223 + } 1.1224 + #endif 1.1225 + 1.1226 + // handlePrevious() never gets the rule status. 1.1227 + // Flag the status as invalid; if the user ever asks for status, we will need 1.1228 + // to back up, then re-find the break position using handleNext(), which does 1.1229 + // get the status value. 1.1230 + fLastStatusIndexValid = FALSE; 1.1231 + fLastRuleStatusIndex = 0; 1.1232 + 1.1233 + // if we're already at the start of the text, return DONE. 1.1234 + if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1.1235 + return BreakIterator::DONE; 1.1236 + } 1.1237 + 1.1238 + // Set up the starting char. 
1.1239 + initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1240 + result = initialPosition; 1.1241 + c = UTEXT_PREVIOUS32(fText); 1.1242 + 1.1243 + // Set the initial state for the state machine 1.1244 + state = START_STATE; 1.1245 + row = (RBBIStateTableRow *) 1.1246 + (statetable->fTableData + (statetable->fRowLen * state)); 1.1247 + category = 3; 1.1248 + mode = RBBI_RUN; 1.1249 + if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1.1250 + category = 2; 1.1251 + mode = RBBI_START; 1.1252 + } 1.1253 + 1.1254 + 1.1255 + // loop until we reach the start of the text or transition to state 0 1.1256 + // 1.1257 + for (;;) { 1.1258 + if (c == U_SENTINEL) { 1.1259 + // Reached end of input string. 1.1260 + if (mode == RBBI_END) { 1.1261 + // We have already run the loop one last time with the 1.1262 + // character set to the psueudo {eof} value. Now it is time 1.1263 + // to unconditionally bail out. 1.1264 + if (lookaheadResult < result) { 1.1265 + // We ran off the end of the string with a pending look-ahead match. 1.1266 + // Treat this as if the look-ahead condition had been met, and return 1.1267 + // the match at the / position from the look-ahead rule. 1.1268 + result = lookaheadResult; 1.1269 + lookaheadStatus = 0; 1.1270 + } else if (result == initialPosition) { 1.1271 + // Ran off start, no match found. 1.1272 + // move one index one (towards the start, since we are doing a previous()) 1.1273 + UTEXT_SETNATIVEINDEX(fText, initialPosition); 1.1274 + (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1.1275 + } 1.1276 + break; 1.1277 + } 1.1278 + // Run the loop one last time with the fake end-of-input character category. 1.1279 + mode = RBBI_END; 1.1280 + category = 1; 1.1281 + } 1.1282 + 1.1283 + // 1.1284 + // Get the char category. 
An incoming category of 1 or 2 means that 1.1285 + // we are preset for doing the beginning or end of input, and 1.1286 + // that we shouldn't get a category from an actual text input character. 1.1287 + // 1.1288 + if (mode == RBBI_RUN) { 1.1289 + // look up the current character's character category, which tells us 1.1290 + // which column in the state table to look at. 1.1291 + // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1.1292 + // not the size of the character going in, which is a UChar32. 1.1293 + // 1.1294 + UTRIE_GET16(&fData->fTrie, c, category); 1.1295 + 1.1296 + // Check the dictionary bit in the character's category. 1.1297 + // Counter is only used by dictionary based iterators (subclasses). 1.1298 + // Chars that need to be handled by a dictionary have a flag bit set 1.1299 + // in their category values. 1.1300 + // 1.1301 + if ((category & 0x4000) != 0) { 1.1302 + fDictionaryCharCount++; 1.1303 + // And off the dictionary flag bit. 1.1304 + category &= ~0x4000; 1.1305 + } 1.1306 + } 1.1307 + 1.1308 + #ifdef RBBI_DEBUG 1.1309 + if (fTrace) { 1.1310 + RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1.1311 + if (0x20<=c && c<0x7f) { 1.1312 + RBBIDebugPrintf("\"%c\" ", c); 1.1313 + } else { 1.1314 + RBBIDebugPrintf("%5x ", c); 1.1315 + } 1.1316 + RBBIDebugPrintf("%3d %3d\n", state, category); 1.1317 + } 1.1318 + #endif 1.1319 + 1.1320 + // State Transition - move machine to its next state 1.1321 + // 1.1322 + 1.1323 + // Note: fNextState is defined as uint16_t[2], but we are casting 1.1324 + // a generated RBBI table to RBBIStateTableRow and some tables 1.1325 + // actually have more than 2 categories. 
1.1326 + U_ASSERT(category<fData->fHeader->fCatCount); 1.1327 + state = row->fNextState[category]; /*Not accessing beyond memory*/ 1.1328 + row = (RBBIStateTableRow *) 1.1329 + (statetable->fTableData + (statetable->fRowLen * state)); 1.1330 + 1.1331 + if (row->fAccepting == -1) { 1.1332 + // Match found, common case. 1.1333 + result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1334 + } 1.1335 + 1.1336 + if (row->fLookAhead != 0) { 1.1337 + if (lookaheadStatus != 0 1.1338 + && row->fAccepting == lookaheadStatus) { 1.1339 + // Lookahead match is completed. 1.1340 + result = lookaheadResult; 1.1341 + lookaheadStatus = 0; 1.1342 + // TODO: make a standalone hard break in a rule work. 1.1343 + if (lookAheadHardBreak) { 1.1344 + UTEXT_SETNATIVEINDEX(fText, result); 1.1345 + return result; 1.1346 + } 1.1347 + // Look-ahead completed, but other rules may match further. Continue on 1.1348 + // TODO: junk this feature? I don't think it's used anywhwere. 1.1349 + goto continueOn; 1.1350 + } 1.1351 + 1.1352 + int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1353 + lookaheadResult = r; 1.1354 + lookaheadStatus = row->fLookAhead; 1.1355 + goto continueOn; 1.1356 + } 1.1357 + 1.1358 + 1.1359 + if (row->fAccepting != 0) { 1.1360 + // Because this is an accepting state, any in-progress look-ahead match 1.1361 + // is no longer relavant. Clear out the pending lookahead status. 1.1362 + lookaheadStatus = 0; 1.1363 + } 1.1364 + 1.1365 +continueOn: 1.1366 + if (state == STOP_STATE) { 1.1367 + // This is the normal exit from the lookup state machine. 1.1368 + // We have advanced through the string until it is certain that no 1.1369 + // longer match is possible, no matter what characters follow. 1.1370 + break; 1.1371 + } 1.1372 + 1.1373 + // Move (backwards) to the next character to process. 1.1374 + // If this is a beginning-of-input loop iteration, don't advance 1.1375 + // the input position. The next iteration will be processing the 1.1376 + // first real input character. 
1.1377 + if (mode == RBBI_RUN) { 1.1378 + c = UTEXT_PREVIOUS32(fText); 1.1379 + } else { 1.1380 + if (mode == RBBI_START) { 1.1381 + mode = RBBI_RUN; 1.1382 + } 1.1383 + } 1.1384 + } 1.1385 + 1.1386 + // The state machine is done. Check whether it found a match... 1.1387 + 1.1388 + // If the iterator failed to advance in the match engine, force it ahead by one. 1.1389 + // (This really indicates a defect in the break rules. They should always match 1.1390 + // at least one character.) 1.1391 + if (result == initialPosition) { 1.1392 + UTEXT_SETNATIVEINDEX(fText, initialPosition); 1.1393 + UTEXT_PREVIOUS32(fText); 1.1394 + result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1.1395 + } 1.1396 + 1.1397 + // Leave the iterator at our result position. 1.1398 + UTEXT_SETNATIVEINDEX(fText, result); 1.1399 + #ifdef RBBI_DEBUG 1.1400 + if (fTrace) { 1.1401 + RBBIDebugPrintf("result = %d\n\n", result); 1.1402 + } 1.1403 + #endif 1.1404 + return result; 1.1405 +} 1.1406 + 1.1407 + 1.1408 +void 1.1409 +RuleBasedBreakIterator::reset() 1.1410 +{ 1.1411 + if (fCachedBreakPositions) { 1.1412 + uprv_free(fCachedBreakPositions); 1.1413 + } 1.1414 + fCachedBreakPositions = NULL; 1.1415 + fNumCachedBreakPositions = 0; 1.1416 + fDictionaryCharCount = 0; 1.1417 + fPositionInCache = 0; 1.1418 +} 1.1419 + 1.1420 + 1.1421 + 1.1422 +//------------------------------------------------------------------------------- 1.1423 +// 1.1424 +// getRuleStatus() Return the break rule tag associated with the current 1.1425 +// iterator position. If the iterator arrived at its current 1.1426 +// position by iterating forwards, the value will have been 1.1427 +// cached by the handleNext() function. 1.1428 +// 1.1429 +// If no cached status value is available, the status is 1.1430 +// found by doing a previous() followed by a next(), which 1.1431 +// leaves the iterator where it started, and computes the 1.1432 +// status while doing the next(). 
1.1433 +// 1.1434 +//------------------------------------------------------------------------------- 1.1435 +void RuleBasedBreakIterator::makeRuleStatusValid() { 1.1436 + if (fLastStatusIndexValid == FALSE) { 1.1437 + // No cached status is available. 1.1438 + if (fText == NULL || current() == 0) { 1.1439 + // At start of text, or there is no text. Status is always zero. 1.1440 + fLastRuleStatusIndex = 0; 1.1441 + fLastStatusIndexValid = TRUE; 1.1442 + } else { 1.1443 + // Not at start of text. Find status the tedious way. 1.1444 + int32_t pa = current(); 1.1445 + previous(); 1.1446 + if (fNumCachedBreakPositions > 0) { 1.1447 + reset(); // Blow off the dictionary cache 1.1448 + } 1.1449 + int32_t pb = next(); 1.1450 + if (pa != pb) { 1.1451 + // note: the if (pa != pb) test is here only to eliminate warnings for 1.1452 + // unused local variables on gcc. Logically, it isn't needed. 1.1453 + U_ASSERT(pa == pb); 1.1454 + } 1.1455 + } 1.1456 + } 1.1457 + U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1.1458 +} 1.1459 + 1.1460 + 1.1461 +int32_t RuleBasedBreakIterator::getRuleStatus() const { 1.1462 + RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1.1463 + nonConstThis->makeRuleStatusValid(); 1.1464 + 1.1465 + // fLastRuleStatusIndex indexes to the start of the appropriate status record 1.1466 + // (the number of status values.) 1.1467 + // This function returns the last (largest) of the array of status values. 
1.1468 + int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1.1469 + int32_t tagVal = fData->fRuleStatusTable[idx]; 1.1470 + 1.1471 + return tagVal; 1.1472 +} 1.1473 + 1.1474 + 1.1475 + 1.1476 + 1.1477 +int32_t RuleBasedBreakIterator::getRuleStatusVec( 1.1478 + int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1.1479 +{ 1.1480 + if (U_FAILURE(status)) { 1.1481 + return 0; 1.1482 + } 1.1483 + 1.1484 + RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1.1485 + nonConstThis->makeRuleStatusValid(); 1.1486 + int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1.1487 + int32_t numValsToCopy = numVals; 1.1488 + if (numVals > capacity) { 1.1489 + status = U_BUFFER_OVERFLOW_ERROR; 1.1490 + numValsToCopy = capacity; 1.1491 + } 1.1492 + int i; 1.1493 + for (i=0; i<numValsToCopy; i++) { 1.1494 + fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1.1495 + } 1.1496 + return numVals; 1.1497 +} 1.1498 + 1.1499 + 1.1500 + 1.1501 +//------------------------------------------------------------------------------- 1.1502 +// 1.1503 +// getBinaryRules Access to the compiled form of the rules, 1.1504 +// for use by build system tools that save the data 1.1505 +// for standard iterator types. 
1.1506 +// 1.1507 +//------------------------------------------------------------------------------- 1.1508 +const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1.1509 + const uint8_t *retPtr = NULL; 1.1510 + length = 0; 1.1511 + 1.1512 + if (fData != NULL) { 1.1513 + retPtr = (const uint8_t *)fData->fHeader; 1.1514 + length = fData->fHeader->fLength; 1.1515 + } 1.1516 + return retPtr; 1.1517 +} 1.1518 + 1.1519 + 1.1520 +BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, 1.1521 + int32_t &bufferSize, 1.1522 + UErrorCode &status) 1.1523 +{ 1.1524 + if (U_FAILURE(status)){ 1.1525 + return NULL; 1.1526 + } 1.1527 + 1.1528 + if (bufferSize == 0) { 1.1529 + bufferSize = 1; // preflighting for deprecated functionality 1.1530 + return NULL; 1.1531 + } 1.1532 + 1.1533 + BreakIterator *clonedBI = clone(); 1.1534 + if (clonedBI == NULL) { 1.1535 + status = U_MEMORY_ALLOCATION_ERROR; 1.1536 + } else { 1.1537 + status = U_SAFECLONE_ALLOCATED_WARNING; 1.1538 + } 1.1539 + return (RuleBasedBreakIterator *)clonedBI; 1.1540 +} 1.1541 + 1.1542 + 1.1543 +//------------------------------------------------------------------------------- 1.1544 +// 1.1545 +// isDictionaryChar Return true if the category lookup for this char 1.1546 +// indicates that it is in the set of dictionary lookup 1.1547 +// chars. 1.1548 +// 1.1549 +// This function is intended for use by dictionary based 1.1550 +// break iterators. 
//
//                        (Currently disabled: the implementation below is
//                        commented out.)
//
//-------------------------------------------------------------------------------
/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
    if (fData == NULL) {
        return FALSE;
    }
    uint16_t category;
    UTRIE_GET16(&fData->fTrie, c, category);
    return (category & 0x4000) != 0;
}*/


//-------------------------------------------------------------------------------
//
//  checkDictionary       This function handles all processing of characters in
//                        the "dictionary" set. It will determine the appropriate
//                        course of action, and possibly set up a cache in the
//                        process.
//
//                        @param startPos  start of the range proposed by the rules.
//                        @param endPos    end of the range proposed by the rules.
//                        @param reverse   TRUE when called while iterating backwards;
//                                         the proposed break is then startPos rather
//                                         than endPos.
//                        @return          the proposed break (startPos or endPos) when
//                                         the range is at most one unit long, the text
//                                         is UTF-8, no dictionary breaks were found, or
//                                         the cache could not be allocated; otherwise
//                                         the result of preceding(endPos) or
//                                         following(startPos) run against the newly
//                                         built break cache.
//
//-------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
                            int32_t endPos,
                            UBool reverse) {
    // Reset the old break cache first.
    reset();

    // note: code segment below assumes that dictionary chars are in the
    // startPos-endPos range
    // value returned should be next character in sequence
    if ((endPos - startPos) <= 1) {
        return (reverse ? startPos : endPos);
    }

    // Bug 5532.  The dictionary code will crash if the input text is UTF-8
    //      because native indexes are different from UTF-16 indexes.
    //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
    //      It won't give the right breaks, but it's better than a crash.
    //
    //      Check the type of the UText by checking its pFuncs field, which
    //      is UText's function dispatch table.  It will be the same for all
    //      UTF-8 UTexts and different for any other UText type.
    //
    //      We have no other type of UText available with non-UTF-16 native indexing.
    //      This whole check will go away once the dictionary code is fixed.
    //
    // NOTE(review): this function-local static is filled in lazily with a plain
    //      NULL test and no lock or init-once visible here -- confirm whether
    //      concurrent first calls are a concern.
    static const void *utext_utf8Funcs;
    if (utext_utf8Funcs == NULL) {
        // Cache the UTF-8 UText function pointer value.
        UErrorCode status = U_ZERO_ERROR;
        UText tempUText = UTEXT_INITIALIZER;
        utext_openUTF8(&tempUText, NULL, 0, &status);
        utext_utf8Funcs = tempUText.pFuncs;
        utext_close(&tempUText);
    }
    if (fText->pFuncs == utext_utf8Funcs) {
        return (reverse ? startPos : endPos);
    }

    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
    utext_setNativeIndex(fText, reverse ? endPos : startPos);
    if (reverse) {
        UTEXT_PREVIOUS32(fText);
    }

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t    category;
    int32_t     current;
    UErrorCode  status = U_ZERO_ERROR;
    UStack      breaks(status);
    int32_t     foundBreakCount = 0;
    UChar32     c = utext_current32(fText);

    UTRIE_GET16(&fData->fTrie, c, category);

    // Is the character we're starting on a dictionary character? If so, we
    // need to back up to include the entire run; otherwise the results of
    // the break algorithm will differ depending on where we start. Since
    // the result is cached and there is typically a non-dictionary break
    // within a small number of words, there should be little performance impact.
    if (category & 0x4000) {
        if (reverse) {
            // Going backwards: extend rangeEnd forward to the end of the run.
            do {
                utext_next32(fText);          // TODO:  recast to work directly with postincrement.
                c = utext_current32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            } while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            // Both branches below currently do the same thing.
            if (c == U_SENTINEL) {
                // c = fText->last32();
                //   TODO:  why was this if needed?
                c = UTEXT_PREVIOUS32(fText);
            }
            else {
                c = UTEXT_PREVIOUS32(fText);
            }
        }
        else {
            // Going forwards: extend rangeStart backward to the start of the run.
            do {
                c = UTEXT_PREVIOUS32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            if (c == U_SENTINEL) {
                // c = fText->first32();
                c = utext_current32(fText);
            }
            else {
                utext_next32(fText);
                c = utext_current32(fText);
            }
            rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
        }
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
    // Note: we always do this in the forward direction, so that the break
    // cache is built in the right order.
    if (reverse) {
        utext_setNativeIndex(fText, rangeStart);
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }
    while(U_SUCCESS(status)) {
        // Skip forward over non-dictionary characters.
        while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
            utext_next32(fText);           // TODO:  tweak for post-increment operation
            c = utext_current32(fText);
            UTRIE_GET16(&fData->fTrie, c, category);
        }
        if (current >= rangeEnd) {
            break;
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        // NOTE(review): when lbe is NULL nothing in this loop body appears to move
        // the text position or change status -- confirm getLanguageBreakEngine()
        // cannot return NULL here, otherwise this loop may not terminate.
        if (lbe != NULL) {
            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
        }

        // Reload the loop variables for the next go-round
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // If we found breaks, build a new break cache. The first and last entries must
    // be the original starting and ending position.
    if (foundBreakCount > 0) {
        // Reserve an extra slot for startPos and/or endPos when they are not
        // already the first / last break found.
        int32_t totalBreaks = foundBreakCount;
        if (startPos < breaks.elementAti(0)) {
            totalBreaks += 1;
        }
        if (endPos > breaks.peeki()) {
            totalBreaks += 1;
        }
        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
        if (fCachedBreakPositions != NULL) {
            int32_t out = 0;
            fNumCachedBreakPositions = totalBreaks;
            if (startPos < breaks.elementAti(0)) {
                fCachedBreakPositions[out++] = startPos;
            }
            for (int32_t i = 0; i < foundBreakCount; ++i) {
                fCachedBreakPositions[out++] = breaks.elementAti(i);
            }
            if (endPos > fCachedBreakPositions[out-1]) {
                fCachedBreakPositions[out] = endPos;
            }
            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos);
            }
            else {
                return following(startPos);
            }
        }
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

    // If we get here, there were no language-based breaks. Set the text pointer
    // to the original proposed break.
    utext_setNativeIndex(fText, reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
}

// defined in ucln_cmn.h

U_NAMESPACE_END


// Stack of language break factories, created by initLanguageFactories()
// and deleted by breakiterator_cleanup_dict().
static icu::UStack *gLanguageBreakFactories = NULL;
// Init-once guard for the stack above.
static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;

/**
 * Release all static memory held by breakiterator.
1.1758 + */ 1.1759 +U_CDECL_BEGIN 1.1760 +static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1.1761 + if (gLanguageBreakFactories) { 1.1762 + delete gLanguageBreakFactories; 1.1763 + gLanguageBreakFactories = NULL; 1.1764 + } 1.1765 + gLanguageBreakFactoriesInitOnce.reset(); 1.1766 + return TRUE; 1.1767 +} 1.1768 +U_CDECL_END 1.1769 + 1.1770 +U_CDECL_BEGIN 1.1771 +static void U_CALLCONV _deleteFactory(void *obj) { 1.1772 + delete (icu::LanguageBreakFactory *) obj; 1.1773 +} 1.1774 +U_CDECL_END 1.1775 +U_NAMESPACE_BEGIN 1.1776 + 1.1777 +static void U_CALLCONV initLanguageFactories() { 1.1778 + UErrorCode status = U_ZERO_ERROR; 1.1779 + U_ASSERT(gLanguageBreakFactories == NULL); 1.1780 + gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); 1.1781 + if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { 1.1782 + ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1.1783 + gLanguageBreakFactories->push(builtIn, status); 1.1784 +#ifdef U_LOCAL_SERVICE_HOOK 1.1785 + LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1.1786 + if (extra != NULL) { 1.1787 + gLanguageBreakFactories->push(extra, status); 1.1788 + } 1.1789 +#endif 1.1790 + } 1.1791 + ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1.1792 +} 1.1793 + 1.1794 + 1.1795 +static const LanguageBreakEngine* 1.1796 +getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1.1797 +{ 1.1798 + umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); 1.1799 + if (gLanguageBreakFactories == NULL) { 1.1800 + return NULL; 1.1801 + } 1.1802 + 1.1803 + int32_t i = gLanguageBreakFactories->size(); 1.1804 + const LanguageBreakEngine *lbe = NULL; 1.1805 + while (--i >= 0) { 1.1806 + LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1.1807 + lbe = factory->getEngineFor(c, breakType); 1.1808 + if (lbe != NULL) { 1.1809 + break; 
1.1810 + } 1.1811 + } 1.1812 + return lbe; 1.1813 +} 1.1814 + 1.1815 + 1.1816 +//------------------------------------------------------------------------------- 1.1817 +// 1.1818 +// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1.1819 +// the character c. 1.1820 +// 1.1821 +//------------------------------------------------------------------------------- 1.1822 +const LanguageBreakEngine * 1.1823 +RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1.1824 + const LanguageBreakEngine *lbe = NULL; 1.1825 + UErrorCode status = U_ZERO_ERROR; 1.1826 + 1.1827 + if (fLanguageBreakEngines == NULL) { 1.1828 + fLanguageBreakEngines = new UStack(status); 1.1829 + if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1.1830 + delete fLanguageBreakEngines; 1.1831 + fLanguageBreakEngines = 0; 1.1832 + return NULL; 1.1833 + } 1.1834 + } 1.1835 + 1.1836 + int32_t i = fLanguageBreakEngines->size(); 1.1837 + while (--i >= 0) { 1.1838 + lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1.1839 + if (lbe->handles(c, fBreakType)) { 1.1840 + return lbe; 1.1841 + } 1.1842 + } 1.1843 + 1.1844 + // No existing dictionary took the character. See if a factory wants to 1.1845 + // give us a new LanguageBreakEngine for this character. 1.1846 + lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1.1847 + 1.1848 + // If we got one, use it and push it on our stack. 1.1849 + if (lbe != NULL) { 1.1850 + fLanguageBreakEngines->push((void *)lbe, status); 1.1851 + // Even if we can't remember it, we can keep looking it up, so 1.1852 + // return it even if the push fails. 1.1853 + return lbe; 1.1854 + } 1.1855 + 1.1856 + // No engine is forthcoming for this character. Add it to the 1.1857 + // reject set. Create the reject break engine if needed. 
1.1858 + if (fUnhandledBreakEngine == NULL) { 1.1859 + fUnhandledBreakEngine = new UnhandledEngine(status); 1.1860 + if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1.1861 + status = U_MEMORY_ALLOCATION_ERROR; 1.1862 + } 1.1863 + // Put it last so that scripts for which we have an engine get tried 1.1864 + // first. 1.1865 + fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1.1866 + // If we can't insert it, or creation failed, get rid of it 1.1867 + if (U_FAILURE(status)) { 1.1868 + delete fUnhandledBreakEngine; 1.1869 + fUnhandledBreakEngine = 0; 1.1870 + return NULL; 1.1871 + } 1.1872 + } 1.1873 + 1.1874 + // Tell the reject engine about the character; at its discretion, it may 1.1875 + // add more than just the one character. 1.1876 + fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1.1877 + 1.1878 + return fUnhandledBreakEngine; 1.1879 +} 1.1880 + 1.1881 + 1.1882 + 1.1883 +/*int32_t RuleBasedBreakIterator::getBreakType() const { 1.1884 + return fBreakType; 1.1885 +}*/ 1.1886 + 1.1887 +void RuleBasedBreakIterator::setBreakType(int32_t type) { 1.1888 + fBreakType = type; 1.1889 + reset(); 1.1890 +} 1.1891 + 1.1892 +U_NAMESPACE_END 1.1893 + 1.1894 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */