Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | *************************************************************************** |
michael@0 | 3 | * Copyright (C) 1999-2013 International Business Machines Corporation |
michael@0 | 4 | * and others. All rights reserved. |
michael@0 | 5 | *************************************************************************** |
michael@0 | 6 | */ |
michael@0 | 7 | // |
michael@0 | 8 | // file: rbbi.cpp Contains the implementation of the rule based break iterator |
michael@0 | 9 | // runtime engine and the API implementation for |
michael@0 | 10 | // class RuleBasedBreakIterator |
michael@0 | 11 | // |
michael@0 | 12 | |
michael@0 | 13 | #include "utypeinfo.h" // for 'typeid' to work |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/utypes.h" |
michael@0 | 16 | |
michael@0 | 17 | #if !UCONFIG_NO_BREAK_ITERATION |
michael@0 | 18 | |
michael@0 | 19 | #include "unicode/rbbi.h" |
michael@0 | 20 | #include "unicode/schriter.h" |
michael@0 | 21 | #include "unicode/uchriter.h" |
michael@0 | 22 | #include "unicode/udata.h" |
michael@0 | 23 | #include "unicode/uclean.h" |
michael@0 | 24 | #include "rbbidata.h" |
michael@0 | 25 | #include "rbbirb.h" |
michael@0 | 26 | #include "cmemory.h" |
michael@0 | 27 | #include "cstring.h" |
michael@0 | 28 | #include "umutex.h" |
michael@0 | 29 | #include "ucln_cmn.h" |
michael@0 | 30 | #include "brkeng.h" |
michael@0 | 31 | |
michael@0 | 32 | #include "uassert.h" |
michael@0 | 33 | #include "uvector.h" |
michael@0 | 34 | |
michael@0 | 35 | // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. |
michael@0 | 36 | #if U_LOCAL_SERVICE_HOOK |
michael@0 | 37 | #include "localsvc.h" |
michael@0 | 38 | #endif |
michael@0 | 39 | |
michael@0 | 40 | #ifdef RBBI_DEBUG |
michael@0 | 41 | static UBool fTrace = FALSE; |
michael@0 | 42 | #endif |
michael@0 | 43 | |
michael@0 | 44 | U_NAMESPACE_BEGIN |
michael@0 | 45 | |
michael@0 | 46 | // The state number of the starting state |
michael@0 | 47 | #define START_STATE 1 |
michael@0 | 48 | |
michael@0 | 49 | // The state-transition value indicating "stop" |
michael@0 | 50 | #define STOP_STATE 0 |
michael@0 | 51 | |
michael@0 | 52 | |
michael@0 | 53 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) |
michael@0 | 54 | |
michael@0 | 55 | |
michael@0 | 56 | //======================================================================= |
michael@0 | 57 | // constructors |
michael@0 | 58 | //======================================================================= |
michael@0 | 59 | |
michael@0 | 60 | /** |
michael@0 | 61 | * Constructs a RuleBasedBreakIterator that uses the already-created |
michael@0 | 62 | * tables object that is passed in as a parameter. |
michael@0 | 63 | */ |
michael@0 | 64 | RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) |
michael@0 | 65 | { |
michael@0 | 66 | init(); |
michael@0 | 67 | fData = new RBBIDataWrapper(data, status); // status checked in constructor |
michael@0 | 68 | if (U_FAILURE(status)) {return;} |
michael@0 | 69 | if(fData == 0) { |
michael@0 | 70 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 71 | return; |
michael@0 | 72 | } |
michael@0 | 73 | } |
michael@0 | 74 | |
michael@0 | 75 | /** |
michael@0 | 76 | * Same as above but does not adopt memory |
michael@0 | 77 | */ |
michael@0 | 78 | RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) |
michael@0 | 79 | { |
michael@0 | 80 | init(); |
michael@0 | 81 | fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor |
michael@0 | 82 | if (U_FAILURE(status)) {return;} |
michael@0 | 83 | if(fData == 0) { |
michael@0 | 84 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 85 | return; |
michael@0 | 86 | } |
michael@0 | 87 | } |
michael@0 | 88 | |
michael@0 | 89 | |
michael@0 | 90 | // |
michael@0 | 91 | // Construct from precompiled binary rules (tables). This constructor is public API, |
michael@0 | 92 | // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). |
michael@0 | 93 | // |
michael@0 | 94 | RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, |
michael@0 | 95 | uint32_t ruleLength, |
michael@0 | 96 | UErrorCode &status) { |
michael@0 | 97 | init(); |
michael@0 | 98 | if (U_FAILURE(status)) { |
michael@0 | 99 | return; |
michael@0 | 100 | } |
michael@0 | 101 | if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { |
michael@0 | 102 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 103 | return; |
michael@0 | 104 | } |
michael@0 | 105 | const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; |
michael@0 | 106 | if (data->fLength > ruleLength) { |
michael@0 | 107 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 108 | return; |
michael@0 | 109 | } |
michael@0 | 110 | fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); |
michael@0 | 111 | if (U_FAILURE(status)) {return;} |
michael@0 | 112 | if(fData == 0) { |
michael@0 | 113 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 114 | return; |
michael@0 | 115 | } |
michael@0 | 116 | } |
michael@0 | 117 | |
michael@0 | 118 | |
michael@0 | 119 | //------------------------------------------------------------------------------- |
michael@0 | 120 | // |
michael@0 | 121 | // Constructor from a UDataMemory handle to precompiled break rules |
michael@0 | 122 | // stored in an ICU data file. |
michael@0 | 123 | // |
michael@0 | 124 | //------------------------------------------------------------------------------- |
michael@0 | 125 | RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) |
michael@0 | 126 | { |
michael@0 | 127 | init(); |
michael@0 | 128 | fData = new RBBIDataWrapper(udm, status); // status checked in constructor |
michael@0 | 129 | if (U_FAILURE(status)) {return;} |
michael@0 | 130 | if(fData == 0) { |
michael@0 | 131 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 132 | return; |
michael@0 | 133 | } |
michael@0 | 134 | } |
michael@0 | 135 | |
michael@0 | 136 | |
michael@0 | 137 | |
michael@0 | 138 | //------------------------------------------------------------------------------- |
michael@0 | 139 | // |
michael@0 | 140 | // Constructor from a set of rules supplied as a string. |
michael@0 | 141 | // |
michael@0 | 142 | //------------------------------------------------------------------------------- |
michael@0 | 143 | RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, |
michael@0 | 144 | UParseError &parseError, |
michael@0 | 145 | UErrorCode &status) |
michael@0 | 146 | { |
michael@0 | 147 | init(); |
michael@0 | 148 | if (U_FAILURE(status)) {return;} |
michael@0 | 149 | RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) |
michael@0 | 150 | RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); |
michael@0 | 151 | // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that |
michael@0 | 152 | // creates and returns a complete RBBI. From here, in a constructor, we |
michael@0 | 153 | // can't just return the object created by the builder factory, hence |
michael@0 | 154 | // the assignment of the factory created object to "this". |
michael@0 | 155 | if (U_SUCCESS(status)) { |
michael@0 | 156 | *this = *bi; |
michael@0 | 157 | delete bi; |
michael@0 | 158 | } |
michael@0 | 159 | } |
michael@0 | 160 | |
michael@0 | 161 | |
michael@0 | 162 | //------------------------------------------------------------------------------- |
michael@0 | 163 | // |
michael@0 | 164 | // Default Constructor. Create an empty shell that can be set up later. |
michael@0 | 165 | // Used when creating a RuleBasedBreakIterator from a set |
michael@0 | 166 | // of rules. |
michael@0 | 167 | //------------------------------------------------------------------------------- |
michael@0 | 168 | RuleBasedBreakIterator::RuleBasedBreakIterator() { |
michael@0 | 169 | init(); |
michael@0 | 170 | } |
michael@0 | 171 | |
michael@0 | 172 | |
michael@0 | 173 | //------------------------------------------------------------------------------- |
michael@0 | 174 | // |
michael@0 | 175 | // Copy constructor. Will produce a break iterator with the same behavior, |
michael@0 | 176 | // and which iterates over the same text, as the one passed in. |
michael@0 | 177 | // |
michael@0 | 178 | //------------------------------------------------------------------------------- |
michael@0 | 179 | RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) |
michael@0 | 180 | : BreakIterator(other) |
michael@0 | 181 | { |
michael@0 | 182 | this->init(); |
michael@0 | 183 | *this = other; |
michael@0 | 184 | } |
michael@0 | 185 | |
michael@0 | 186 | |
michael@0 | 187 | /** |
michael@0 | 188 | * Destructor |
michael@0 | 189 | */ |
michael@0 | 190 | RuleBasedBreakIterator::~RuleBasedBreakIterator() { |
michael@0 | 191 | if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
michael@0 | 192 | // fCharIter was adopted from the outside. |
michael@0 | 193 | delete fCharIter; |
michael@0 | 194 | } |
michael@0 | 195 | fCharIter = NULL; |
michael@0 | 196 | delete fSCharIter; |
michael@0 | 197 | fCharIter = NULL; |
michael@0 | 198 | delete fDCharIter; |
michael@0 | 199 | fDCharIter = NULL; |
michael@0 | 200 | |
michael@0 | 201 | utext_close(fText); |
michael@0 | 202 | |
michael@0 | 203 | if (fData != NULL) { |
michael@0 | 204 | fData->removeReference(); |
michael@0 | 205 | fData = NULL; |
michael@0 | 206 | } |
michael@0 | 207 | if (fCachedBreakPositions) { |
michael@0 | 208 | uprv_free(fCachedBreakPositions); |
michael@0 | 209 | fCachedBreakPositions = NULL; |
michael@0 | 210 | } |
michael@0 | 211 | if (fLanguageBreakEngines) { |
michael@0 | 212 | delete fLanguageBreakEngines; |
michael@0 | 213 | fLanguageBreakEngines = NULL; |
michael@0 | 214 | } |
michael@0 | 215 | if (fUnhandledBreakEngine) { |
michael@0 | 216 | delete fUnhandledBreakEngine; |
michael@0 | 217 | fUnhandledBreakEngine = NULL; |
michael@0 | 218 | } |
michael@0 | 219 | } |
michael@0 | 220 | |
michael@0 | 221 | /** |
michael@0 | 222 | * Assignment operator. Sets this iterator to have the same behavior, |
michael@0 | 223 | * and iterate over the same text, as the one passed in. |
michael@0 | 224 | */ |
michael@0 | 225 | RuleBasedBreakIterator& |
michael@0 | 226 | RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { |
michael@0 | 227 | if (this == &that) { |
michael@0 | 228 | return *this; |
michael@0 | 229 | } |
michael@0 | 230 | reset(); // Delete break cache information |
michael@0 | 231 | fBreakType = that.fBreakType; |
michael@0 | 232 | if (fLanguageBreakEngines != NULL) { |
michael@0 | 233 | delete fLanguageBreakEngines; |
michael@0 | 234 | fLanguageBreakEngines = NULL; // Just rebuild for now |
michael@0 | 235 | } |
michael@0 | 236 | // TODO: clone fLanguageBreakEngines from "that" |
michael@0 | 237 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 238 | fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); |
michael@0 | 239 | |
michael@0 | 240 | if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
michael@0 | 241 | delete fCharIter; |
michael@0 | 242 | } |
michael@0 | 243 | fCharIter = NULL; |
michael@0 | 244 | |
michael@0 | 245 | if (that.fCharIter != NULL ) { |
michael@0 | 246 | // This is a little bit tricky - it will intially appear that |
michael@0 | 247 | // this->fCharIter is adopted, even if that->fCharIter was |
michael@0 | 248 | // not adopted. That's ok. |
michael@0 | 249 | fCharIter = that.fCharIter->clone(); |
michael@0 | 250 | } |
michael@0 | 251 | |
michael@0 | 252 | if (fData != NULL) { |
michael@0 | 253 | fData->removeReference(); |
michael@0 | 254 | fData = NULL; |
michael@0 | 255 | } |
michael@0 | 256 | if (that.fData != NULL) { |
michael@0 | 257 | fData = that.fData->addReference(); |
michael@0 | 258 | } |
michael@0 | 259 | |
michael@0 | 260 | return *this; |
michael@0 | 261 | } |
michael@0 | 262 | |
michael@0 | 263 | |
michael@0 | 264 | |
michael@0 | 265 | //----------------------------------------------------------------------------- |
michael@0 | 266 | // |
michael@0 | 267 | // init() Shared initialization routine. Used by all the constructors. |
michael@0 | 268 | // Initializes all fields, leaving the object in a consistent state. |
michael@0 | 269 | // |
michael@0 | 270 | //----------------------------------------------------------------------------- |
michael@0 | 271 | void RuleBasedBreakIterator::init() { |
michael@0 | 272 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 273 | fText = utext_openUChars(NULL, NULL, 0, &status); |
michael@0 | 274 | fCharIter = NULL; |
michael@0 | 275 | fSCharIter = NULL; |
michael@0 | 276 | fDCharIter = NULL; |
michael@0 | 277 | fData = NULL; |
michael@0 | 278 | fLastRuleStatusIndex = 0; |
michael@0 | 279 | fLastStatusIndexValid = TRUE; |
michael@0 | 280 | fDictionaryCharCount = 0; |
michael@0 | 281 | fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable |
michael@0 | 282 | // dictionary behavior for Break Iterators that are |
michael@0 | 283 | // built from rules. Even better would be the ability to |
michael@0 | 284 | // declare the type in the rules. |
michael@0 | 285 | |
michael@0 | 286 | fCachedBreakPositions = NULL; |
michael@0 | 287 | fLanguageBreakEngines = NULL; |
michael@0 | 288 | fUnhandledBreakEngine = NULL; |
michael@0 | 289 | fNumCachedBreakPositions = 0; |
michael@0 | 290 | fPositionInCache = 0; |
michael@0 | 291 | |
michael@0 | 292 | #ifdef RBBI_DEBUG |
michael@0 | 293 | static UBool debugInitDone = FALSE; |
michael@0 | 294 | if (debugInitDone == FALSE) { |
michael@0 | 295 | char *debugEnv = getenv("U_RBBIDEBUG"); |
michael@0 | 296 | if (debugEnv && uprv_strstr(debugEnv, "trace")) { |
michael@0 | 297 | fTrace = TRUE; |
michael@0 | 298 | } |
michael@0 | 299 | debugInitDone = TRUE; |
michael@0 | 300 | } |
michael@0 | 301 | #endif |
michael@0 | 302 | } |
michael@0 | 303 | |
michael@0 | 304 | |
michael@0 | 305 | |
michael@0 | 306 | //----------------------------------------------------------------------------- |
michael@0 | 307 | // |
michael@0 | 308 | // clone - Returns a newly-constructed RuleBasedBreakIterator with the same |
michael@0 | 309 | // behavior, and iterating over the same text, as this one. |
michael@0 | 310 | // Virtual function: does the right thing with subclasses. |
michael@0 | 311 | // |
michael@0 | 312 | //----------------------------------------------------------------------------- |
michael@0 | 313 | BreakIterator* |
michael@0 | 314 | RuleBasedBreakIterator::clone(void) const { |
michael@0 | 315 | return new RuleBasedBreakIterator(*this); |
michael@0 | 316 | } |
michael@0 | 317 | |
michael@0 | 318 | /** |
michael@0 | 319 | * Equality operator. Returns TRUE if both BreakIterators are of the |
michael@0 | 320 | * same class, have the same behavior, and iterate over the same text. |
michael@0 | 321 | */ |
michael@0 | 322 | UBool |
michael@0 | 323 | RuleBasedBreakIterator::operator==(const BreakIterator& that) const { |
michael@0 | 324 | if (typeid(*this) != typeid(that)) { |
michael@0 | 325 | return FALSE; |
michael@0 | 326 | } |
michael@0 | 327 | |
michael@0 | 328 | const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; |
michael@0 | 329 | |
michael@0 | 330 | if (!utext_equals(fText, that2.fText)) { |
michael@0 | 331 | // The two break iterators are operating on different text, |
michael@0 | 332 | // or have a different interation position. |
michael@0 | 333 | return FALSE; |
michael@0 | 334 | }; |
michael@0 | 335 | |
michael@0 | 336 | // TODO: need a check for when in a dictionary region at different offsets. |
michael@0 | 337 | |
michael@0 | 338 | if (that2.fData == fData || |
michael@0 | 339 | (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { |
michael@0 | 340 | // The two break iterators are using the same rules. |
michael@0 | 341 | return TRUE; |
michael@0 | 342 | } |
michael@0 | 343 | return FALSE; |
michael@0 | 344 | } |
michael@0 | 345 | |
michael@0 | 346 | /** |
michael@0 | 347 | * Compute a hash code for this BreakIterator |
michael@0 | 348 | * @return A hash code |
michael@0 | 349 | */ |
michael@0 | 350 | int32_t |
michael@0 | 351 | RuleBasedBreakIterator::hashCode(void) const { |
michael@0 | 352 | int32_t hash = 0; |
michael@0 | 353 | if (fData != NULL) { |
michael@0 | 354 | hash = fData->hashCode(); |
michael@0 | 355 | } |
michael@0 | 356 | return hash; |
michael@0 | 357 | } |
michael@0 | 358 | |
michael@0 | 359 | |
michael@0 | 360 | void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { |
michael@0 | 361 | if (U_FAILURE(status)) { |
michael@0 | 362 | return; |
michael@0 | 363 | } |
michael@0 | 364 | reset(); |
michael@0 | 365 | fText = utext_clone(fText, ut, FALSE, TRUE, &status); |
michael@0 | 366 | |
michael@0 | 367 | // Set up a dummy CharacterIterator to be returned if anyone |
michael@0 | 368 | // calls getText(). With input from UText, there is no reasonable |
michael@0 | 369 | // way to return a characterIterator over the actual input text. |
michael@0 | 370 | // Return one over an empty string instead - this is the closest |
michael@0 | 371 | // we can come to signaling a failure. |
michael@0 | 372 | // (GetText() is obsolete, this failure is sort of OK) |
michael@0 | 373 | if (fDCharIter == NULL) { |
michael@0 | 374 | static const UChar c = 0; |
michael@0 | 375 | fDCharIter = new UCharCharacterIterator(&c, 0); |
michael@0 | 376 | if (fDCharIter == NULL) { |
michael@0 | 377 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 378 | return; |
michael@0 | 379 | } |
michael@0 | 380 | } |
michael@0 | 381 | |
michael@0 | 382 | if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
michael@0 | 383 | // existing fCharIter was adopted from the outside. Delete it now. |
michael@0 | 384 | delete fCharIter; |
michael@0 | 385 | } |
michael@0 | 386 | fCharIter = fDCharIter; |
michael@0 | 387 | |
michael@0 | 388 | this->first(); |
michael@0 | 389 | } |
michael@0 | 390 | |
michael@0 | 391 | |
michael@0 | 392 | UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { |
michael@0 | 393 | UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); |
michael@0 | 394 | return result; |
michael@0 | 395 | } |
michael@0 | 396 | |
michael@0 | 397 | |
michael@0 | 398 | |
michael@0 | 399 | /** |
michael@0 | 400 | * Returns the description used to create this iterator |
michael@0 | 401 | */ |
michael@0 | 402 | const UnicodeString& |
michael@0 | 403 | RuleBasedBreakIterator::getRules() const { |
michael@0 | 404 | if (fData != NULL) { |
michael@0 | 405 | return fData->getRuleSourceString(); |
michael@0 | 406 | } else { |
michael@0 | 407 | static const UnicodeString *s; |
michael@0 | 408 | if (s == NULL) { |
michael@0 | 409 | // TODO: something more elegant here. |
michael@0 | 410 | // perhaps API should return the string by value. |
michael@0 | 411 | // Note: thread unsafe init & leak are semi-ok, better than |
michael@0 | 412 | // what was before. Sould be cleaned up, though. |
michael@0 | 413 | s = new UnicodeString; |
michael@0 | 414 | } |
michael@0 | 415 | return *s; |
michael@0 | 416 | } |
michael@0 | 417 | } |
michael@0 | 418 | |
michael@0 | 419 | //======================================================================= |
michael@0 | 420 | // BreakIterator overrides |
michael@0 | 421 | //======================================================================= |
michael@0 | 422 | |
michael@0 | 423 | /** |
michael@0 | 424 | * Return a CharacterIterator over the text being analyzed. |
michael@0 | 425 | */ |
michael@0 | 426 | CharacterIterator& |
michael@0 | 427 | RuleBasedBreakIterator::getText() const { |
michael@0 | 428 | return *fCharIter; |
michael@0 | 429 | } |
michael@0 | 430 | |
michael@0 | 431 | /** |
michael@0 | 432 | * Set the iterator to analyze a new piece of text. This function resets |
michael@0 | 433 | * the current iteration position to the beginning of the text. |
michael@0 | 434 | * @param newText An iterator over the text to analyze. |
michael@0 | 435 | */ |
michael@0 | 436 | void |
michael@0 | 437 | RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { |
michael@0 | 438 | // If we are holding a CharacterIterator adopted from a |
michael@0 | 439 | // previous call to this function, delete it now. |
michael@0 | 440 | if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
michael@0 | 441 | delete fCharIter; |
michael@0 | 442 | } |
michael@0 | 443 | |
michael@0 | 444 | fCharIter = newText; |
michael@0 | 445 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 446 | reset(); |
michael@0 | 447 | if (newText==NULL || newText->startIndex() != 0) { |
michael@0 | 448 | // startIndex !=0 wants to be an error, but there's no way to report it. |
michael@0 | 449 | // Make the iterator text be an empty string. |
michael@0 | 450 | fText = utext_openUChars(fText, NULL, 0, &status); |
michael@0 | 451 | } else { |
michael@0 | 452 | fText = utext_openCharacterIterator(fText, newText, &status); |
michael@0 | 453 | } |
michael@0 | 454 | this->first(); |
michael@0 | 455 | } |
michael@0 | 456 | |
michael@0 | 457 | /** |
michael@0 | 458 | * Set the iterator to analyze a new piece of text. This function resets |
michael@0 | 459 | * the current iteration position to the beginning of the text. |
michael@0 | 460 | * @param newText An iterator over the text to analyze. |
michael@0 | 461 | */ |
michael@0 | 462 | void |
michael@0 | 463 | RuleBasedBreakIterator::setText(const UnicodeString& newText) { |
michael@0 | 464 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 465 | reset(); |
michael@0 | 466 | fText = utext_openConstUnicodeString(fText, &newText, &status); |
michael@0 | 467 | |
michael@0 | 468 | // Set up a character iterator on the string. |
michael@0 | 469 | // Needed in case someone calls getText(). |
michael@0 | 470 | // Can not, unfortunately, do this lazily on the (probably never) |
michael@0 | 471 | // call to getText(), because getText is const. |
michael@0 | 472 | if (fSCharIter == NULL) { |
michael@0 | 473 | fSCharIter = new StringCharacterIterator(newText); |
michael@0 | 474 | } else { |
michael@0 | 475 | fSCharIter->setText(newText); |
michael@0 | 476 | } |
michael@0 | 477 | |
michael@0 | 478 | if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { |
michael@0 | 479 | // old fCharIter was adopted from the outside. Delete it. |
michael@0 | 480 | delete fCharIter; |
michael@0 | 481 | } |
michael@0 | 482 | fCharIter = fSCharIter; |
michael@0 | 483 | |
michael@0 | 484 | this->first(); |
michael@0 | 485 | } |
michael@0 | 486 | |
michael@0 | 487 | |
michael@0 | 488 | /** |
michael@0 | 489 | * Provide a new UText for the input text. Must reference text with contents identical |
michael@0 | 490 | * to the original. |
michael@0 | 491 | * Intended for use with text data originating in Java (garbage collected) environments |
michael@0 | 492 | * where the data may be moved in memory at arbitrary times. |
michael@0 | 493 | */ |
michael@0 | 494 | RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { |
michael@0 | 495 | if (U_FAILURE(status)) { |
michael@0 | 496 | return *this; |
michael@0 | 497 | } |
michael@0 | 498 | if (input == NULL) { |
michael@0 | 499 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 500 | return *this; |
michael@0 | 501 | } |
michael@0 | 502 | int64_t pos = utext_getNativeIndex(fText); |
michael@0 | 503 | // Shallow read-only clone of the new UText into the existing input UText |
michael@0 | 504 | fText = utext_clone(fText, input, FALSE, TRUE, &status); |
michael@0 | 505 | if (U_FAILURE(status)) { |
michael@0 | 506 | return *this; |
michael@0 | 507 | } |
michael@0 | 508 | utext_setNativeIndex(fText, pos); |
michael@0 | 509 | if (utext_getNativeIndex(fText) != pos) { |
michael@0 | 510 | // Sanity check. The new input utext is supposed to have the exact same |
michael@0 | 511 | // contents as the old. If we can't set to the same position, it doesn't. |
michael@0 | 512 | // The contents underlying the old utext might be invalid at this point, |
michael@0 | 513 | // so it's not safe to check directly. |
michael@0 | 514 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 515 | } |
michael@0 | 516 | return *this; |
michael@0 | 517 | } |
michael@0 | 518 | |
michael@0 | 519 | |
michael@0 | 520 | /** |
michael@0 | 521 | * Sets the current iteration position to the beginning of the text. |
michael@0 | 522 | * @return The offset of the beginning of the text. |
michael@0 | 523 | */ |
michael@0 | 524 | int32_t RuleBasedBreakIterator::first(void) { |
michael@0 | 525 | reset(); |
michael@0 | 526 | fLastRuleStatusIndex = 0; |
michael@0 | 527 | fLastStatusIndexValid = TRUE; |
michael@0 | 528 | //if (fText == NULL) |
michael@0 | 529 | // return BreakIterator::DONE; |
michael@0 | 530 | |
michael@0 | 531 | utext_setNativeIndex(fText, 0); |
michael@0 | 532 | return 0; |
michael@0 | 533 | } |
michael@0 | 534 | |
michael@0 | 535 | /** |
michael@0 | 536 | * Sets the current iteration position to the end of the text. |
michael@0 | 537 | * @return The text's past-the-end offset. |
michael@0 | 538 | */ |
michael@0 | 539 | int32_t RuleBasedBreakIterator::last(void) { |
michael@0 | 540 | reset(); |
michael@0 | 541 | if (fText == NULL) { |
michael@0 | 542 | fLastRuleStatusIndex = 0; |
michael@0 | 543 | fLastStatusIndexValid = TRUE; |
michael@0 | 544 | return BreakIterator::DONE; |
michael@0 | 545 | } |
michael@0 | 546 | |
michael@0 | 547 | fLastStatusIndexValid = FALSE; |
michael@0 | 548 | int32_t pos = (int32_t)utext_nativeLength(fText); |
michael@0 | 549 | utext_setNativeIndex(fText, pos); |
michael@0 | 550 | return pos; |
michael@0 | 551 | } |
michael@0 | 552 | |
michael@0 | 553 | /** |
michael@0 | 554 | * Advances the iterator either forward or backward the specified number of steps. |
michael@0 | 555 | * Negative values move backward, and positive values move forward. This is |
michael@0 | 556 | * equivalent to repeatedly calling next() or previous(). |
michael@0 | 557 | * @param n The number of steps to move. The sign indicates the direction |
michael@0 | 558 | * (negative is backwards, and positive is forwards). |
michael@0 | 559 | * @return The character offset of the boundary position n boundaries away from |
michael@0 | 560 | * the current one. |
michael@0 | 561 | */ |
michael@0 | 562 | int32_t RuleBasedBreakIterator::next(int32_t n) { |
michael@0 | 563 | int32_t result = current(); |
michael@0 | 564 | while (n > 0) { |
michael@0 | 565 | result = next(); |
michael@0 | 566 | --n; |
michael@0 | 567 | } |
michael@0 | 568 | while (n < 0) { |
michael@0 | 569 | result = previous(); |
michael@0 | 570 | ++n; |
michael@0 | 571 | } |
michael@0 | 572 | return result; |
michael@0 | 573 | } |
michael@0 | 574 | |
michael@0 | 575 | /** |
michael@0 | 576 | * Advances the iterator to the next boundary position. |
michael@0 | 577 | * @return The position of the first boundary after this one. |
michael@0 | 578 | */ |
michael@0 | 579 | int32_t RuleBasedBreakIterator::next(void) { |
michael@0 | 580 | // if we have cached break positions and we're still in the range |
michael@0 | 581 | // covered by them, just move one step forward in the cache |
michael@0 | 582 | if (fCachedBreakPositions != NULL) { |
michael@0 | 583 | if (fPositionInCache < fNumCachedBreakPositions - 1) { |
michael@0 | 584 | ++fPositionInCache; |
michael@0 | 585 | int32_t pos = fCachedBreakPositions[fPositionInCache]; |
michael@0 | 586 | utext_setNativeIndex(fText, pos); |
michael@0 | 587 | return pos; |
michael@0 | 588 | } |
michael@0 | 589 | else { |
michael@0 | 590 | reset(); |
michael@0 | 591 | } |
michael@0 | 592 | } |
michael@0 | 593 | |
michael@0 | 594 | int32_t startPos = current(); |
michael@0 | 595 | int32_t result = handleNext(fData->fForwardTable); |
michael@0 | 596 | if (fDictionaryCharCount > 0) { |
michael@0 | 597 | result = checkDictionary(startPos, result, FALSE); |
michael@0 | 598 | } |
michael@0 | 599 | return result; |
michael@0 | 600 | } |
michael@0 | 601 | |
michael@0 | 602 | /** |
michael@0 | 603 | * Advances the iterator backwards, to the last boundary preceding this one. |
michael@0 | 604 | * @return The position of the last boundary position preceding this one. |
michael@0 | 605 | */ |
michael@0 | 606 | int32_t RuleBasedBreakIterator::previous(void) { |
michael@0 | 607 | int32_t result; |
michael@0 | 608 | int32_t startPos; |
michael@0 | 609 | |
michael@0 | 610 | // if we have cached break positions and we're still in the range |
michael@0 | 611 | // covered by them, just move one step backward in the cache |
michael@0 | 612 | if (fCachedBreakPositions != NULL) { |
michael@0 | 613 | if (fPositionInCache > 0) { |
michael@0 | 614 | --fPositionInCache; |
michael@0 | 615 | // If we're at the beginning of the cache, need to reevaluate the |
michael@0 | 616 | // rule status |
michael@0 | 617 | if (fPositionInCache <= 0) { |
michael@0 | 618 | fLastStatusIndexValid = FALSE; |
michael@0 | 619 | } |
michael@0 | 620 | int32_t pos = fCachedBreakPositions[fPositionInCache]; |
michael@0 | 621 | utext_setNativeIndex(fText, pos); |
michael@0 | 622 | return pos; |
michael@0 | 623 | } |
michael@0 | 624 | else { |
michael@0 | 625 | reset(); |
michael@0 | 626 | } |
michael@0 | 627 | } |
michael@0 | 628 | |
michael@0 | 629 | // if we're already sitting at the beginning of the text, return DONE |
michael@0 | 630 | if (fText == NULL || (startPos = current()) == 0) { |
michael@0 | 631 | fLastRuleStatusIndex = 0; |
michael@0 | 632 | fLastStatusIndexValid = TRUE; |
michael@0 | 633 | return BreakIterator::DONE; |
michael@0 | 634 | } |
michael@0 | 635 | |
michael@0 | 636 | if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { |
michael@0 | 637 | result = handlePrevious(fData->fReverseTable); |
michael@0 | 638 | if (fDictionaryCharCount > 0) { |
michael@0 | 639 | result = checkDictionary(result, startPos, TRUE); |
michael@0 | 640 | } |
michael@0 | 641 | return result; |
michael@0 | 642 | } |
michael@0 | 643 | |
michael@0 | 644 | // old rule syntax |
michael@0 | 645 | // set things up. handlePrevious() will back us up to some valid |
michael@0 | 646 | // break position before the current position (we back our internal |
michael@0 | 647 | // iterator up one step to prevent handlePrevious() from returning |
michael@0 | 648 | // the current position), but not necessarily the last one before |
michael@0 | 649 | |
michael@0 | 650 | // where we started |
michael@0 | 651 | |
michael@0 | 652 | int32_t start = current(); |
michael@0 | 653 | |
michael@0 | 654 | (void)UTEXT_PREVIOUS32(fText); |
michael@0 | 655 | int32_t lastResult = handlePrevious(fData->fReverseTable); |
michael@0 | 656 | if (lastResult == UBRK_DONE) { |
michael@0 | 657 | lastResult = 0; |
michael@0 | 658 | utext_setNativeIndex(fText, 0); |
michael@0 | 659 | } |
michael@0 | 660 | result = lastResult; |
michael@0 | 661 | int32_t lastTag = 0; |
michael@0 | 662 | UBool breakTagValid = FALSE; |
michael@0 | 663 | |
michael@0 | 664 | // iterate forward from the known break position until we pass our |
michael@0 | 665 | // starting point. The last break position before the starting |
michael@0 | 666 | // point is our return value |
michael@0 | 667 | |
michael@0 | 668 | for (;;) { |
michael@0 | 669 | result = next(); |
michael@0 | 670 | if (result == BreakIterator::DONE || result >= start) { |
michael@0 | 671 | break; |
michael@0 | 672 | } |
michael@0 | 673 | lastResult = result; |
michael@0 | 674 | lastTag = fLastRuleStatusIndex; |
michael@0 | 675 | breakTagValid = TRUE; |
michael@0 | 676 | } |
michael@0 | 677 | |
michael@0 | 678 | // fLastBreakTag wants to have the value for section of text preceding |
michael@0 | 679 | // the result position that we are to return (in lastResult.) If |
michael@0 | 680 | // the backwards rules overshot and the above loop had to do two or more |
michael@0 | 681 | // next()s to move up to the desired return position, we will have a valid |
michael@0 | 682 | // tag value. But, if handlePrevious() took us to exactly the correct result positon, |
michael@0 | 683 | // we wont have a tag value for that position, which is only set by handleNext(). |
michael@0 | 684 | |
michael@0 | 685 | // set the current iteration position to be the last break position |
michael@0 | 686 | // before where we started, and then return that value |
michael@0 | 687 | utext_setNativeIndex(fText, lastResult); |
michael@0 | 688 | fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() |
michael@0 | 689 | fLastStatusIndexValid = breakTagValid; |
michael@0 | 690 | |
michael@0 | 691 | // No need to check the dictionary; it will have been handled by |
michael@0 | 692 | // next() |
michael@0 | 693 | |
michael@0 | 694 | return lastResult; |
michael@0 | 695 | } |
michael@0 | 696 | |
michael@0 | 697 | /** |
michael@0 | 698 | * Sets the iterator to refer to the first boundary position following |
michael@0 | 699 | * the specified position. |
michael@0 | 700 | * @offset The position from which to begin searching for a break position. |
michael@0 | 701 | * @return The position of the first break after the current position. |
michael@0 | 702 | */ |
michael@0 | 703 | int32_t RuleBasedBreakIterator::following(int32_t offset) { |
michael@0 | 704 | // if we have cached break positions and offset is in the range |
michael@0 | 705 | // covered by them, use them |
michael@0 | 706 | // TODO: could use binary search |
michael@0 | 707 | // TODO: what if offset is outside range, but break is not? |
michael@0 | 708 | if (fCachedBreakPositions != NULL) { |
michael@0 | 709 | if (offset >= fCachedBreakPositions[0] |
michael@0 | 710 | && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { |
michael@0 | 711 | fPositionInCache = 0; |
michael@0 | 712 | // We are guaranteed not to leave the array due to range test above |
michael@0 | 713 | while (offset >= fCachedBreakPositions[fPositionInCache]) { |
michael@0 | 714 | ++fPositionInCache; |
michael@0 | 715 | } |
michael@0 | 716 | int32_t pos = fCachedBreakPositions[fPositionInCache]; |
michael@0 | 717 | utext_setNativeIndex(fText, pos); |
michael@0 | 718 | return pos; |
michael@0 | 719 | } |
michael@0 | 720 | else { |
michael@0 | 721 | reset(); |
michael@0 | 722 | } |
michael@0 | 723 | } |
michael@0 | 724 | |
michael@0 | 725 | // if the offset passed in is already past the end of the text, |
michael@0 | 726 | // just return DONE; if it's before the beginning, return the |
michael@0 | 727 | // text's starting offset |
michael@0 | 728 | fLastRuleStatusIndex = 0; |
michael@0 | 729 | fLastStatusIndexValid = TRUE; |
michael@0 | 730 | if (fText == NULL || offset >= utext_nativeLength(fText)) { |
michael@0 | 731 | last(); |
michael@0 | 732 | return next(); |
michael@0 | 733 | } |
michael@0 | 734 | else if (offset < 0) { |
michael@0 | 735 | return first(); |
michael@0 | 736 | } |
michael@0 | 737 | |
michael@0 | 738 | // otherwise, set our internal iteration position (temporarily) |
michael@0 | 739 | // to the position passed in. If this is the _beginning_ position, |
michael@0 | 740 | // then we can just use next() to get our return value |
michael@0 | 741 | |
michael@0 | 742 | int32_t result = 0; |
michael@0 | 743 | |
michael@0 | 744 | if (fData->fSafeRevTable != NULL) { |
michael@0 | 745 | // new rule syntax |
michael@0 | 746 | utext_setNativeIndex(fText, offset); |
michael@0 | 747 | // move forward one codepoint to prepare for moving back to a |
michael@0 | 748 | // safe point. |
michael@0 | 749 | // this handles offset being between a supplementary character |
michael@0 | 750 | (void)UTEXT_NEXT32(fText); |
michael@0 | 751 | // handlePrevious will move most of the time to < 1 boundary away |
michael@0 | 752 | handlePrevious(fData->fSafeRevTable); |
michael@0 | 753 | int32_t result = next(); |
michael@0 | 754 | while (result <= offset) { |
michael@0 | 755 | result = next(); |
michael@0 | 756 | } |
michael@0 | 757 | return result; |
michael@0 | 758 | } |
michael@0 | 759 | if (fData->fSafeFwdTable != NULL) { |
michael@0 | 760 | // backup plan if forward safe table is not available |
michael@0 | 761 | utext_setNativeIndex(fText, offset); |
michael@0 | 762 | (void)UTEXT_PREVIOUS32(fText); |
michael@0 | 763 | // handle next will give result >= offset |
michael@0 | 764 | handleNext(fData->fSafeFwdTable); |
michael@0 | 765 | // previous will give result 0 or 1 boundary away from offset, |
michael@0 | 766 | // most of the time |
michael@0 | 767 | // we have to |
michael@0 | 768 | int32_t oldresult = previous(); |
michael@0 | 769 | while (oldresult > offset) { |
michael@0 | 770 | int32_t result = previous(); |
michael@0 | 771 | if (result <= offset) { |
michael@0 | 772 | return oldresult; |
michael@0 | 773 | } |
michael@0 | 774 | oldresult = result; |
michael@0 | 775 | } |
michael@0 | 776 | int32_t result = next(); |
michael@0 | 777 | if (result <= offset) { |
michael@0 | 778 | return next(); |
michael@0 | 779 | } |
michael@0 | 780 | return result; |
michael@0 | 781 | } |
michael@0 | 782 | // otherwise, we have to sync up first. Use handlePrevious() to back |
michael@0 | 783 | // up to a known break position before the specified position (if |
michael@0 | 784 | // we can determine that the specified position is a break position, |
michael@0 | 785 | // we don't back up at all). This may or may not be the last break |
michael@0 | 786 | // position at or before our starting position. Advance forward |
michael@0 | 787 | // from here until we've passed the starting position. The position |
michael@0 | 788 | // we stop on will be the first break position after the specified one. |
michael@0 | 789 | // old rule syntax |
michael@0 | 790 | |
michael@0 | 791 | utext_setNativeIndex(fText, offset); |
michael@0 | 792 | if (offset==0 || |
michael@0 | 793 | (offset==1 && utext_getNativeIndex(fText)==0)) { |
michael@0 | 794 | return next(); |
michael@0 | 795 | } |
michael@0 | 796 | result = previous(); |
michael@0 | 797 | |
michael@0 | 798 | while (result != BreakIterator::DONE && result <= offset) { |
michael@0 | 799 | result = next(); |
michael@0 | 800 | } |
michael@0 | 801 | |
michael@0 | 802 | return result; |
michael@0 | 803 | } |
michael@0 | 804 | |
michael@0 | 805 | /** |
michael@0 | 806 | * Sets the iterator to refer to the last boundary position before the |
michael@0 | 807 | * specified position. |
michael@0 | 808 | * @offset The position to begin searching for a break from. |
michael@0 | 809 | * @return The position of the last boundary before the starting position. |
michael@0 | 810 | */ |
michael@0 | 811 | int32_t RuleBasedBreakIterator::preceding(int32_t offset) { |
michael@0 | 812 | // if we have cached break positions and offset is in the range |
michael@0 | 813 | // covered by them, use them |
michael@0 | 814 | if (fCachedBreakPositions != NULL) { |
michael@0 | 815 | // TODO: binary search? |
michael@0 | 816 | // TODO: What if offset is outside range, but break is not? |
michael@0 | 817 | if (offset > fCachedBreakPositions[0] |
michael@0 | 818 | && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { |
michael@0 | 819 | fPositionInCache = 0; |
michael@0 | 820 | while (fPositionInCache < fNumCachedBreakPositions |
michael@0 | 821 | && offset > fCachedBreakPositions[fPositionInCache]) |
michael@0 | 822 | ++fPositionInCache; |
michael@0 | 823 | --fPositionInCache; |
michael@0 | 824 | // If we're at the beginning of the cache, need to reevaluate the |
michael@0 | 825 | // rule status |
michael@0 | 826 | if (fPositionInCache <= 0) { |
michael@0 | 827 | fLastStatusIndexValid = FALSE; |
michael@0 | 828 | } |
michael@0 | 829 | utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); |
michael@0 | 830 | return fCachedBreakPositions[fPositionInCache]; |
michael@0 | 831 | } |
michael@0 | 832 | else { |
michael@0 | 833 | reset(); |
michael@0 | 834 | } |
michael@0 | 835 | } |
michael@0 | 836 | |
michael@0 | 837 | // if the offset passed in is already past the end of the text, |
michael@0 | 838 | // just return DONE; if it's before the beginning, return the |
michael@0 | 839 | // text's starting offset |
michael@0 | 840 | if (fText == NULL || offset > utext_nativeLength(fText)) { |
michael@0 | 841 | // return BreakIterator::DONE; |
michael@0 | 842 | return last(); |
michael@0 | 843 | } |
michael@0 | 844 | else if (offset < 0) { |
michael@0 | 845 | return first(); |
michael@0 | 846 | } |
michael@0 | 847 | |
michael@0 | 848 | // if we start by updating the current iteration position to the |
michael@0 | 849 | // position specified by the caller, we can just use previous() |
michael@0 | 850 | // to carry out this operation |
michael@0 | 851 | |
michael@0 | 852 | if (fData->fSafeFwdTable != NULL) { |
michael@0 | 853 | // new rule syntax |
michael@0 | 854 | utext_setNativeIndex(fText, offset); |
michael@0 | 855 | int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 856 | if (newOffset != offset) { |
michael@0 | 857 | // Will come here if specified offset was not a code point boundary AND |
michael@0 | 858 | // the underlying implmentation is using UText, which snaps any non-code-point-boundary |
michael@0 | 859 | // indices to the containing code point. |
michael@0 | 860 | // For breakitereator::preceding only, these non-code-point indices need to be moved |
michael@0 | 861 | // up to refer to the following codepoint. |
michael@0 | 862 | (void)UTEXT_NEXT32(fText); |
michael@0 | 863 | offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 864 | } |
michael@0 | 865 | |
michael@0 | 866 | // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, |
michael@0 | 867 | // rather than adjusting the position unconditionally? |
michael@0 | 868 | // (Change would interact with safe rules.) |
michael@0 | 869 | // TODO: change RBBI behavior for off-boundary indices to match that of UText? |
michael@0 | 870 | // affects only preceding(), seems cleaner, but is slightly different. |
michael@0 | 871 | (void)UTEXT_PREVIOUS32(fText); |
michael@0 | 872 | handleNext(fData->fSafeFwdTable); |
michael@0 | 873 | int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 874 | while (result >= offset) { |
michael@0 | 875 | result = previous(); |
michael@0 | 876 | } |
michael@0 | 877 | return result; |
michael@0 | 878 | } |
michael@0 | 879 | if (fData->fSafeRevTable != NULL) { |
michael@0 | 880 | // backup plan if forward safe table is not available |
michael@0 | 881 | // TODO: check whether this path can be discarded |
michael@0 | 882 | // It's probably OK to say that rules must supply both safe tables |
michael@0 | 883 | // if they use safe tables at all. We have certainly never described |
michael@0 | 884 | // to anyone how to work with just one safe table. |
michael@0 | 885 | utext_setNativeIndex(fText, offset); |
michael@0 | 886 | (void)UTEXT_NEXT32(fText); |
michael@0 | 887 | |
michael@0 | 888 | // handle previous will give result <= offset |
michael@0 | 889 | handlePrevious(fData->fSafeRevTable); |
michael@0 | 890 | |
michael@0 | 891 | // next will give result 0 or 1 boundary away from offset, |
michael@0 | 892 | // most of the time |
michael@0 | 893 | // we have to |
michael@0 | 894 | int32_t oldresult = next(); |
michael@0 | 895 | while (oldresult < offset) { |
michael@0 | 896 | int32_t result = next(); |
michael@0 | 897 | if (result >= offset) { |
michael@0 | 898 | return oldresult; |
michael@0 | 899 | } |
michael@0 | 900 | oldresult = result; |
michael@0 | 901 | } |
michael@0 | 902 | int32_t result = previous(); |
michael@0 | 903 | if (result >= offset) { |
michael@0 | 904 | return previous(); |
michael@0 | 905 | } |
michael@0 | 906 | return result; |
michael@0 | 907 | } |
michael@0 | 908 | |
michael@0 | 909 | // old rule syntax |
michael@0 | 910 | utext_setNativeIndex(fText, offset); |
michael@0 | 911 | return previous(); |
michael@0 | 912 | } |
michael@0 | 913 | |
michael@0 | 914 | /** |
michael@0 | 915 | * Returns true if the specfied position is a boundary position. As a side |
michael@0 | 916 | * effect, leaves the iterator pointing to the first boundary position at |
michael@0 | 917 | * or after "offset". |
michael@0 | 918 | * @param offset the offset to check. |
michael@0 | 919 | * @return True if "offset" is a boundary position. |
michael@0 | 920 | */ |
michael@0 | 921 | UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { |
michael@0 | 922 | // the beginning index of the iterator is always a boundary position by definition |
michael@0 | 923 | if (offset == 0) { |
michael@0 | 924 | first(); // For side effects on current position, tag values. |
michael@0 | 925 | return TRUE; |
michael@0 | 926 | } |
michael@0 | 927 | |
michael@0 | 928 | if (offset == (int32_t)utext_nativeLength(fText)) { |
michael@0 | 929 | last(); // For side effects on current position, tag values. |
michael@0 | 930 | return TRUE; |
michael@0 | 931 | } |
michael@0 | 932 | |
michael@0 | 933 | // out-of-range indexes are never boundary positions |
michael@0 | 934 | if (offset < 0) { |
michael@0 | 935 | first(); // For side effects on current position, tag values. |
michael@0 | 936 | return FALSE; |
michael@0 | 937 | } |
michael@0 | 938 | |
michael@0 | 939 | if (offset > utext_nativeLength(fText)) { |
michael@0 | 940 | last(); // For side effects on current position, tag values. |
michael@0 | 941 | return FALSE; |
michael@0 | 942 | } |
michael@0 | 943 | |
michael@0 | 944 | // otherwise, we can use following() on the position before the specified |
michael@0 | 945 | // one and return true if the position we get back is the one the user |
michael@0 | 946 | // specified |
michael@0 | 947 | utext_previous32From(fText, offset); |
michael@0 | 948 | int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 949 | UBool result = following(backOne) == offset; |
michael@0 | 950 | return result; |
michael@0 | 951 | } |
michael@0 | 952 | |
michael@0 | 953 | /** |
michael@0 | 954 | * Returns the current iteration position. |
michael@0 | 955 | * @return The current iteration position. |
michael@0 | 956 | */ |
michael@0 | 957 | int32_t RuleBasedBreakIterator::current(void) const { |
michael@0 | 958 | int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 959 | return pos; |
michael@0 | 960 | } |
michael@0 | 961 | |
michael@0 | 962 | //======================================================================= |
michael@0 | 963 | // implementation |
michael@0 | 964 | //======================================================================= |
michael@0 | 965 | |
//
//  RBBIRunMode - phase of the break state machine.
//                The machine runs one extra iteration before the first character
//                and one after the last character of the user text; a variable of
//                this type records which phase is active. User input is fetched
//                only while in the RUN phase.
//
enum RBBIRunMode {
    RBBI_START = 0,   // processing is before the first char of input
    RBBI_RUN   = 1,   // processing is inside the user text
    RBBI_END   = 2    // processing is after the end of the user text
};
michael@0 | 976 | |
michael@0 | 977 | |
michael@0 | 978 | //----------------------------------------------------------------------------------- |
michael@0 | 979 | // |
michael@0 | 980 | // handleNext(stateTable) |
michael@0 | 981 | // This method is the actual implementation of the rbbi next() method. |
michael@0 | 982 | // This method initializes the state machine to state 1 |
michael@0 | 983 | // and advances through the text character by character until we reach the end |
michael@0 | 984 | // of the text or the state machine transitions to state 0. We update our return |
michael@0 | 985 | // value every time the state machine passes through an accepting state. |
michael@0 | 986 | // |
michael@0 | 987 | //----------------------------------------------------------------------------------- |
michael@0 | 988 | int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { |
michael@0 | 989 | int32_t state; |
michael@0 | 990 | uint16_t category = 0; |
michael@0 | 991 | RBBIRunMode mode; |
michael@0 | 992 | |
michael@0 | 993 | RBBIStateTableRow *row; |
michael@0 | 994 | UChar32 c; |
michael@0 | 995 | int32_t lookaheadStatus = 0; |
michael@0 | 996 | int32_t lookaheadTagIdx = 0; |
michael@0 | 997 | int32_t result = 0; |
michael@0 | 998 | int32_t initialPosition = 0; |
michael@0 | 999 | int32_t lookaheadResult = 0; |
michael@0 | 1000 | UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; |
michael@0 | 1001 | const char *tableData = statetable->fTableData; |
michael@0 | 1002 | uint32_t tableRowLen = statetable->fRowLen; |
michael@0 | 1003 | |
michael@0 | 1004 | #ifdef RBBI_DEBUG |
michael@0 | 1005 | if (fTrace) { |
michael@0 | 1006 | RBBIDebugPuts("Handle Next pos char state category"); |
michael@0 | 1007 | } |
michael@0 | 1008 | #endif |
michael@0 | 1009 | |
michael@0 | 1010 | // No matter what, handleNext alway correctly sets the break tag value. |
michael@0 | 1011 | fLastStatusIndexValid = TRUE; |
michael@0 | 1012 | fLastRuleStatusIndex = 0; |
michael@0 | 1013 | |
michael@0 | 1014 | // if we're already at the end of the text, return DONE. |
michael@0 | 1015 | initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1016 | result = initialPosition; |
michael@0 | 1017 | c = UTEXT_NEXT32(fText); |
michael@0 | 1018 | if (fData == NULL || c==U_SENTINEL) { |
michael@0 | 1019 | return BreakIterator::DONE; |
michael@0 | 1020 | } |
michael@0 | 1021 | |
michael@0 | 1022 | // Set the initial state for the state machine |
michael@0 | 1023 | state = START_STATE; |
michael@0 | 1024 | row = (RBBIStateTableRow *) |
michael@0 | 1025 | //(statetable->fTableData + (statetable->fRowLen * state)); |
michael@0 | 1026 | (tableData + tableRowLen * state); |
michael@0 | 1027 | |
michael@0 | 1028 | |
michael@0 | 1029 | mode = RBBI_RUN; |
michael@0 | 1030 | if (statetable->fFlags & RBBI_BOF_REQUIRED) { |
michael@0 | 1031 | category = 2; |
michael@0 | 1032 | mode = RBBI_START; |
michael@0 | 1033 | } |
michael@0 | 1034 | |
michael@0 | 1035 | |
michael@0 | 1036 | // loop until we reach the end of the text or transition to state 0 |
michael@0 | 1037 | // |
michael@0 | 1038 | for (;;) { |
michael@0 | 1039 | if (c == U_SENTINEL) { |
michael@0 | 1040 | // Reached end of input string. |
michael@0 | 1041 | if (mode == RBBI_END) { |
michael@0 | 1042 | // We have already run the loop one last time with the |
michael@0 | 1043 | // character set to the psueudo {eof} value. Now it is time |
michael@0 | 1044 | // to unconditionally bail out. |
michael@0 | 1045 | if (lookaheadResult > result) { |
michael@0 | 1046 | // We ran off the end of the string with a pending look-ahead match. |
michael@0 | 1047 | // Treat this as if the look-ahead condition had been met, and return |
michael@0 | 1048 | // the match at the / position from the look-ahead rule. |
michael@0 | 1049 | result = lookaheadResult; |
michael@0 | 1050 | fLastRuleStatusIndex = lookaheadTagIdx; |
michael@0 | 1051 | lookaheadStatus = 0; |
michael@0 | 1052 | } |
michael@0 | 1053 | break; |
michael@0 | 1054 | } |
michael@0 | 1055 | // Run the loop one last time with the fake end-of-input character category. |
michael@0 | 1056 | mode = RBBI_END; |
michael@0 | 1057 | category = 1; |
michael@0 | 1058 | } |
michael@0 | 1059 | |
michael@0 | 1060 | // |
michael@0 | 1061 | // Get the char category. An incoming category of 1 or 2 means that |
michael@0 | 1062 | // we are preset for doing the beginning or end of input, and |
michael@0 | 1063 | // that we shouldn't get a category from an actual text input character. |
michael@0 | 1064 | // |
michael@0 | 1065 | if (mode == RBBI_RUN) { |
michael@0 | 1066 | // look up the current character's character category, which tells us |
michael@0 | 1067 | // which column in the state table to look at. |
michael@0 | 1068 | // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, |
michael@0 | 1069 | // not the size of the character going in, which is a UChar32. |
michael@0 | 1070 | // |
michael@0 | 1071 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1072 | |
michael@0 | 1073 | // Check the dictionary bit in the character's category. |
michael@0 | 1074 | // Counter is only used by dictionary based iterators (subclasses). |
michael@0 | 1075 | // Chars that need to be handled by a dictionary have a flag bit set |
michael@0 | 1076 | // in their category values. |
michael@0 | 1077 | // |
michael@0 | 1078 | if ((category & 0x4000) != 0) { |
michael@0 | 1079 | fDictionaryCharCount++; |
michael@0 | 1080 | // And off the dictionary flag bit. |
michael@0 | 1081 | category &= ~0x4000; |
michael@0 | 1082 | } |
michael@0 | 1083 | } |
michael@0 | 1084 | |
michael@0 | 1085 | #ifdef RBBI_DEBUG |
michael@0 | 1086 | if (fTrace) { |
michael@0 | 1087 | RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); |
michael@0 | 1088 | if (0x20<=c && c<0x7f) { |
michael@0 | 1089 | RBBIDebugPrintf("\"%c\" ", c); |
michael@0 | 1090 | } else { |
michael@0 | 1091 | RBBIDebugPrintf("%5x ", c); |
michael@0 | 1092 | } |
michael@0 | 1093 | RBBIDebugPrintf("%3d %3d\n", state, category); |
michael@0 | 1094 | } |
michael@0 | 1095 | #endif |
michael@0 | 1096 | |
michael@0 | 1097 | // State Transition - move machine to its next state |
michael@0 | 1098 | // |
michael@0 | 1099 | |
michael@0 | 1100 | // Note: fNextState is defined as uint16_t[2], but we are casting |
michael@0 | 1101 | // a generated RBBI table to RBBIStateTableRow and some tables |
michael@0 | 1102 | // actually have more than 2 categories. |
michael@0 | 1103 | U_ASSERT(category<fData->fHeader->fCatCount); |
michael@0 | 1104 | state = row->fNextState[category]; /*Not accessing beyond memory*/ |
michael@0 | 1105 | row = (RBBIStateTableRow *) |
michael@0 | 1106 | // (statetable->fTableData + (statetable->fRowLen * state)); |
michael@0 | 1107 | (tableData + tableRowLen * state); |
michael@0 | 1108 | |
michael@0 | 1109 | |
michael@0 | 1110 | if (row->fAccepting == -1) { |
michael@0 | 1111 | // Match found, common case. |
michael@0 | 1112 | if (mode != RBBI_START) { |
michael@0 | 1113 | result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1114 | } |
michael@0 | 1115 | fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. |
michael@0 | 1116 | } |
michael@0 | 1117 | |
michael@0 | 1118 | if (row->fLookAhead != 0) { |
michael@0 | 1119 | if (lookaheadStatus != 0 |
michael@0 | 1120 | && row->fAccepting == lookaheadStatus) { |
michael@0 | 1121 | // Lookahead match is completed. |
michael@0 | 1122 | result = lookaheadResult; |
michael@0 | 1123 | fLastRuleStatusIndex = lookaheadTagIdx; |
michael@0 | 1124 | lookaheadStatus = 0; |
michael@0 | 1125 | // TODO: make a standalone hard break in a rule work. |
michael@0 | 1126 | if (lookAheadHardBreak) { |
michael@0 | 1127 | UTEXT_SETNATIVEINDEX(fText, result); |
michael@0 | 1128 | return result; |
michael@0 | 1129 | } |
michael@0 | 1130 | // Look-ahead completed, but other rules may match further. Continue on |
michael@0 | 1131 | // TODO: junk this feature? I don't think it's used anywhwere. |
michael@0 | 1132 | goto continueOn; |
michael@0 | 1133 | } |
michael@0 | 1134 | |
michael@0 | 1135 | int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1136 | lookaheadResult = r; |
michael@0 | 1137 | lookaheadStatus = row->fLookAhead; |
michael@0 | 1138 | lookaheadTagIdx = row->fTagIdx; |
michael@0 | 1139 | goto continueOn; |
michael@0 | 1140 | } |
michael@0 | 1141 | |
michael@0 | 1142 | |
michael@0 | 1143 | if (row->fAccepting != 0) { |
michael@0 | 1144 | // Because this is an accepting state, any in-progress look-ahead match |
michael@0 | 1145 | // is no longer relavant. Clear out the pending lookahead status. |
michael@0 | 1146 | lookaheadStatus = 0; // clear out any pending look-ahead match. |
michael@0 | 1147 | } |
michael@0 | 1148 | |
michael@0 | 1149 | continueOn: |
michael@0 | 1150 | if (state == STOP_STATE) { |
michael@0 | 1151 | // This is the normal exit from the lookup state machine. |
michael@0 | 1152 | // We have advanced through the string until it is certain that no |
michael@0 | 1153 | // longer match is possible, no matter what characters follow. |
michael@0 | 1154 | break; |
michael@0 | 1155 | } |
michael@0 | 1156 | |
michael@0 | 1157 | // Advance to the next character. |
michael@0 | 1158 | // If this is a beginning-of-input loop iteration, don't advance |
michael@0 | 1159 | // the input position. The next iteration will be processing the |
michael@0 | 1160 | // first real input character. |
michael@0 | 1161 | if (mode == RBBI_RUN) { |
michael@0 | 1162 | c = UTEXT_NEXT32(fText); |
michael@0 | 1163 | } else { |
michael@0 | 1164 | if (mode == RBBI_START) { |
michael@0 | 1165 | mode = RBBI_RUN; |
michael@0 | 1166 | } |
michael@0 | 1167 | } |
michael@0 | 1168 | |
michael@0 | 1169 | |
michael@0 | 1170 | } |
michael@0 | 1171 | |
michael@0 | 1172 | // The state machine is done. Check whether it found a match... |
michael@0 | 1173 | |
michael@0 | 1174 | // If the iterator failed to advance in the match engine, force it ahead by one. |
michael@0 | 1175 | // (This really indicates a defect in the break rules. They should always match |
michael@0 | 1176 | // at least one character.) |
michael@0 | 1177 | if (result == initialPosition) { |
michael@0 | 1178 | UTEXT_SETNATIVEINDEX(fText, initialPosition); |
michael@0 | 1179 | UTEXT_NEXT32(fText); |
michael@0 | 1180 | result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1181 | } |
michael@0 | 1182 | |
michael@0 | 1183 | // Leave the iterator at our result position. |
michael@0 | 1184 | UTEXT_SETNATIVEINDEX(fText, result); |
michael@0 | 1185 | #ifdef RBBI_DEBUG |
michael@0 | 1186 | if (fTrace) { |
michael@0 | 1187 | RBBIDebugPrintf("result = %d\n\n", result); |
michael@0 | 1188 | } |
michael@0 | 1189 | #endif |
michael@0 | 1190 | return result; |
michael@0 | 1191 | } |
michael@0 | 1192 | |
michael@0 | 1193 | |
michael@0 | 1194 | |
michael@0 | 1195 | //----------------------------------------------------------------------------------- |
michael@0 | 1196 | // |
michael@0 | 1197 | // handlePrevious() |
michael@0 | 1198 | // |
michael@0 | 1199 | // Iterate backwards, according to the logic of the reverse rules. |
michael@0 | 1200 | // This version handles the exact style backwards rules. |
michael@0 | 1201 | // |
michael@0 | 1202 | // The logic of this function is very similar to handleNext(), above. |
michael@0 | 1203 | // |
michael@0 | 1204 | //----------------------------------------------------------------------------------- |
michael@0 | 1205 | int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { |
michael@0 | 1206 | int32_t state; |
michael@0 | 1207 | uint16_t category = 0; |
michael@0 | 1208 | RBBIRunMode mode; |
michael@0 | 1209 | RBBIStateTableRow *row; |
michael@0 | 1210 | UChar32 c; |
michael@0 | 1211 | int32_t lookaheadStatus = 0; |
michael@0 | 1212 | int32_t result = 0; |
michael@0 | 1213 | int32_t initialPosition = 0; |
michael@0 | 1214 | int32_t lookaheadResult = 0; |
michael@0 | 1215 | UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; |
michael@0 | 1216 | |
michael@0 | 1217 | #ifdef RBBI_DEBUG |
michael@0 | 1218 | if (fTrace) { |
michael@0 | 1219 | RBBIDebugPuts("Handle Previous pos char state category"); |
michael@0 | 1220 | } |
michael@0 | 1221 | #endif |
michael@0 | 1222 | |
michael@0 | 1223 | // handlePrevious() never gets the rule status. |
michael@0 | 1224 | // Flag the status as invalid; if the user ever asks for status, we will need |
michael@0 | 1225 | // to back up, then re-find the break position using handleNext(), which does |
michael@0 | 1226 | // get the status value. |
michael@0 | 1227 | fLastStatusIndexValid = FALSE; |
michael@0 | 1228 | fLastRuleStatusIndex = 0; |
michael@0 | 1229 | |
michael@0 | 1230 | // if we're already at the start of the text, return DONE. |
michael@0 | 1231 | if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { |
michael@0 | 1232 | return BreakIterator::DONE; |
michael@0 | 1233 | } |
michael@0 | 1234 | |
michael@0 | 1235 | // Set up the starting char. |
michael@0 | 1236 | initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1237 | result = initialPosition; |
michael@0 | 1238 | c = UTEXT_PREVIOUS32(fText); |
michael@0 | 1239 | |
michael@0 | 1240 | // Set the initial state for the state machine |
michael@0 | 1241 | state = START_STATE; |
michael@0 | 1242 | row = (RBBIStateTableRow *) |
michael@0 | 1243 | (statetable->fTableData + (statetable->fRowLen * state)); |
michael@0 | 1244 | category = 3; |
michael@0 | 1245 | mode = RBBI_RUN; |
michael@0 | 1246 | if (statetable->fFlags & RBBI_BOF_REQUIRED) { |
michael@0 | 1247 | category = 2; |
michael@0 | 1248 | mode = RBBI_START; |
michael@0 | 1249 | } |
michael@0 | 1250 | |
michael@0 | 1251 | |
michael@0 | 1252 | // loop until we reach the start of the text or transition to state 0 |
michael@0 | 1253 | // |
michael@0 | 1254 | for (;;) { |
michael@0 | 1255 | if (c == U_SENTINEL) { |
michael@0 | 1256 | // Reached end of input string. |
michael@0 | 1257 | if (mode == RBBI_END) { |
michael@0 | 1258 | // We have already run the loop one last time with the |
michael@0 | 1259 | // character set to the psueudo {eof} value. Now it is time |
michael@0 | 1260 | // to unconditionally bail out. |
michael@0 | 1261 | if (lookaheadResult < result) { |
michael@0 | 1262 | // We ran off the end of the string with a pending look-ahead match. |
michael@0 | 1263 | // Treat this as if the look-ahead condition had been met, and return |
michael@0 | 1264 | // the match at the / position from the look-ahead rule. |
michael@0 | 1265 | result = lookaheadResult; |
michael@0 | 1266 | lookaheadStatus = 0; |
michael@0 | 1267 | } else if (result == initialPosition) { |
michael@0 | 1268 | // Ran off start, no match found. |
michael@0 | 1269 | // move one index one (towards the start, since we are doing a previous()) |
michael@0 | 1270 | UTEXT_SETNATIVEINDEX(fText, initialPosition); |
michael@0 | 1271 | (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. |
michael@0 | 1272 | } |
michael@0 | 1273 | break; |
michael@0 | 1274 | } |
michael@0 | 1275 | // Run the loop one last time with the fake end-of-input character category. |
michael@0 | 1276 | mode = RBBI_END; |
michael@0 | 1277 | category = 1; |
michael@0 | 1278 | } |
michael@0 | 1279 | |
michael@0 | 1280 | // |
michael@0 | 1281 | // Get the char category. An incoming category of 1 or 2 means that |
michael@0 | 1282 | // we are preset for doing the beginning or end of input, and |
michael@0 | 1283 | // that we shouldn't get a category from an actual text input character. |
michael@0 | 1284 | // |
michael@0 | 1285 | if (mode == RBBI_RUN) { |
michael@0 | 1286 | // look up the current character's character category, which tells us |
michael@0 | 1287 | // which column in the state table to look at. |
michael@0 | 1288 | // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, |
michael@0 | 1289 | // not the size of the character going in, which is a UChar32. |
michael@0 | 1290 | // |
michael@0 | 1291 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1292 | |
michael@0 | 1293 | // Check the dictionary bit in the character's category. |
michael@0 | 1294 | // Counter is only used by dictionary based iterators (subclasses). |
michael@0 | 1295 | // Chars that need to be handled by a dictionary have a flag bit set |
michael@0 | 1296 | // in their category values. |
michael@0 | 1297 | // |
michael@0 | 1298 | if ((category & 0x4000) != 0) { |
michael@0 | 1299 | fDictionaryCharCount++; |
michael@0 | 1300 | // And off the dictionary flag bit. |
michael@0 | 1301 | category &= ~0x4000; |
michael@0 | 1302 | } |
michael@0 | 1303 | } |
michael@0 | 1304 | |
michael@0 | 1305 | #ifdef RBBI_DEBUG |
michael@0 | 1306 | if (fTrace) { |
michael@0 | 1307 | RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); |
michael@0 | 1308 | if (0x20<=c && c<0x7f) { |
michael@0 | 1309 | RBBIDebugPrintf("\"%c\" ", c); |
michael@0 | 1310 | } else { |
michael@0 | 1311 | RBBIDebugPrintf("%5x ", c); |
michael@0 | 1312 | } |
michael@0 | 1313 | RBBIDebugPrintf("%3d %3d\n", state, category); |
michael@0 | 1314 | } |
michael@0 | 1315 | #endif |
michael@0 | 1316 | |
michael@0 | 1317 | // State Transition - move machine to its next state |
michael@0 | 1318 | // |
michael@0 | 1319 | |
michael@0 | 1320 | // Note: fNextState is defined as uint16_t[2], but we are casting |
michael@0 | 1321 | // a generated RBBI table to RBBIStateTableRow and some tables |
michael@0 | 1322 | // actually have more than 2 categories. |
michael@0 | 1323 | U_ASSERT(category<fData->fHeader->fCatCount); |
michael@0 | 1324 | state = row->fNextState[category]; /*Not accessing beyond memory*/ |
michael@0 | 1325 | row = (RBBIStateTableRow *) |
michael@0 | 1326 | (statetable->fTableData + (statetable->fRowLen * state)); |
michael@0 | 1327 | |
michael@0 | 1328 | if (row->fAccepting == -1) { |
michael@0 | 1329 | // Match found, common case. |
michael@0 | 1330 | result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1331 | } |
michael@0 | 1332 | |
michael@0 | 1333 | if (row->fLookAhead != 0) { |
michael@0 | 1334 | if (lookaheadStatus != 0 |
michael@0 | 1335 | && row->fAccepting == lookaheadStatus) { |
michael@0 | 1336 | // Lookahead match is completed. |
michael@0 | 1337 | result = lookaheadResult; |
michael@0 | 1338 | lookaheadStatus = 0; |
michael@0 | 1339 | // TODO: make a standalone hard break in a rule work. |
michael@0 | 1340 | if (lookAheadHardBreak) { |
michael@0 | 1341 | UTEXT_SETNATIVEINDEX(fText, result); |
michael@0 | 1342 | return result; |
michael@0 | 1343 | } |
michael@0 | 1344 | // Look-ahead completed, but other rules may match further. Continue on |
michael@0 | 1345 | // TODO: junk this feature? I don't think it's used anywhere. |
michael@0 | 1346 | goto continueOn; |
michael@0 | 1347 | } |
michael@0 | 1348 | |
michael@0 | 1349 | int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1350 | lookaheadResult = r; |
michael@0 | 1351 | lookaheadStatus = row->fLookAhead; |
michael@0 | 1352 | goto continueOn; |
michael@0 | 1353 | } |
michael@0 | 1354 | |
michael@0 | 1355 | |
michael@0 | 1356 | if (row->fAccepting != 0) { |
michael@0 | 1357 | // Because this is an accepting state, any in-progress look-ahead match |
michael@0 | 1358 | // is no longer relevant. Clear out the pending lookahead status. |
michael@0 | 1359 | lookaheadStatus = 0; |
michael@0 | 1360 | } |
michael@0 | 1361 | |
michael@0 | 1362 | continueOn: |
michael@0 | 1363 | if (state == STOP_STATE) { |
michael@0 | 1364 | // This is the normal exit from the lookup state machine. |
michael@0 | 1365 | // We have advanced through the string until it is certain that no |
michael@0 | 1366 | // longer match is possible, no matter what characters follow. |
michael@0 | 1367 | break; |
michael@0 | 1368 | } |
michael@0 | 1369 | |
michael@0 | 1370 | // Move (backwards) to the next character to process. |
michael@0 | 1371 | // If this is a beginning-of-input loop iteration, don't advance |
michael@0 | 1372 | // the input position. The next iteration will be processing the |
michael@0 | 1373 | // first real input character. |
michael@0 | 1374 | if (mode == RBBI_RUN) { |
michael@0 | 1375 | c = UTEXT_PREVIOUS32(fText); |
michael@0 | 1376 | } else { |
michael@0 | 1377 | if (mode == RBBI_START) { |
michael@0 | 1378 | mode = RBBI_RUN; |
michael@0 | 1379 | } |
michael@0 | 1380 | } |
michael@0 | 1381 | } |
michael@0 | 1382 | |
michael@0 | 1383 | // The state machine is done. Check whether it found a match... |
michael@0 | 1384 | |
michael@0 | 1385 | // If the iterator failed to advance in the match engine, force it ahead by one. |
michael@0 | 1386 | // (This really indicates a defect in the break rules. They should always match |
michael@0 | 1387 | // at least one character.) |
michael@0 | 1388 | if (result == initialPosition) { |
michael@0 | 1389 | UTEXT_SETNATIVEINDEX(fText, initialPosition); |
michael@0 | 1390 | UTEXT_PREVIOUS32(fText); |
michael@0 | 1391 | result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1392 | } |
michael@0 | 1393 | |
michael@0 | 1394 | // Leave the iterator at our result position. |
michael@0 | 1395 | UTEXT_SETNATIVEINDEX(fText, result); |
michael@0 | 1396 | #ifdef RBBI_DEBUG |
michael@0 | 1397 | if (fTrace) { |
michael@0 | 1398 | RBBIDebugPrintf("result = %d\n\n", result); |
michael@0 | 1399 | } |
michael@0 | 1400 | #endif |
michael@0 | 1401 | return result; |
michael@0 | 1402 | } |
michael@0 | 1403 | |
michael@0 | 1404 | |
michael@0 | 1405 | void |
michael@0 | 1406 | RuleBasedBreakIterator::reset() |
michael@0 | 1407 | { |
michael@0 | 1408 | if (fCachedBreakPositions) { |
michael@0 | 1409 | uprv_free(fCachedBreakPositions); |
michael@0 | 1410 | } |
michael@0 | 1411 | fCachedBreakPositions = NULL; |
michael@0 | 1412 | fNumCachedBreakPositions = 0; |
michael@0 | 1413 | fDictionaryCharCount = 0; |
michael@0 | 1414 | fPositionInCache = 0; |
michael@0 | 1415 | } |
michael@0 | 1416 | |
michael@0 | 1417 | |
michael@0 | 1418 | |
michael@0 | 1419 | //------------------------------------------------------------------------------- |
michael@0 | 1420 | // |
michael@0 | 1421 | // getRuleStatus() Return the break rule tag associated with the current |
michael@0 | 1422 | // iterator position. If the iterator arrived at its current |
michael@0 | 1423 | // position by iterating forwards, the value will have been |
michael@0 | 1424 | // cached by the handleNext() function. |
michael@0 | 1425 | // |
michael@0 | 1426 | // If no cached status value is available, the status is |
michael@0 | 1427 | // found by doing a previous() followed by a next(), which |
michael@0 | 1428 | // leaves the iterator where it started, and computes the |
michael@0 | 1429 | // status while doing the next(). |
michael@0 | 1430 | // |
michael@0 | 1431 | //------------------------------------------------------------------------------- |
michael@0 | 1432 | void RuleBasedBreakIterator::makeRuleStatusValid() { |
michael@0 | 1433 | if (fLastStatusIndexValid == FALSE) { |
michael@0 | 1434 | // No cached status is available. |
michael@0 | 1435 | if (fText == NULL || current() == 0) { |
michael@0 | 1436 | // At start of text, or there is no text. Status is always zero. |
michael@0 | 1437 | fLastRuleStatusIndex = 0; |
michael@0 | 1438 | fLastStatusIndexValid = TRUE; |
michael@0 | 1439 | } else { |
michael@0 | 1440 | // Not at start of text. Find status the tedious way. |
michael@0 | 1441 | int32_t pa = current(); |
michael@0 | 1442 | previous(); |
michael@0 | 1443 | if (fNumCachedBreakPositions > 0) { |
michael@0 | 1444 | reset(); // Blow off the dictionary cache |
michael@0 | 1445 | } |
michael@0 | 1446 | int32_t pb = next(); |
michael@0 | 1447 | if (pa != pb) { |
michael@0 | 1448 | // note: the if (pa != pb) test is here only to eliminate warnings for |
michael@0 | 1449 | // unused local variables on gcc. Logically, it isn't needed. |
michael@0 | 1450 | U_ASSERT(pa == pb); |
michael@0 | 1451 | } |
michael@0 | 1452 | } |
michael@0 | 1453 | } |
michael@0 | 1454 | U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); |
michael@0 | 1455 | } |
michael@0 | 1456 | |
michael@0 | 1457 | |
michael@0 | 1458 | int32_t RuleBasedBreakIterator::getRuleStatus() const { |
michael@0 | 1459 | RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; |
michael@0 | 1460 | nonConstThis->makeRuleStatusValid(); |
michael@0 | 1461 | |
michael@0 | 1462 | // fLastRuleStatusIndex indexes to the start of the appropriate status record |
michael@0 | 1463 | // (the number of status values.) |
michael@0 | 1464 | // This function returns the last (largest) of the array of status values. |
michael@0 | 1465 | int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; |
michael@0 | 1466 | int32_t tagVal = fData->fRuleStatusTable[idx]; |
michael@0 | 1467 | |
michael@0 | 1468 | return tagVal; |
michael@0 | 1469 | } |
michael@0 | 1470 | |
michael@0 | 1471 | |
michael@0 | 1472 | |
michael@0 | 1473 | |
michael@0 | 1474 | int32_t RuleBasedBreakIterator::getRuleStatusVec( |
michael@0 | 1475 | int32_t *fillInVec, int32_t capacity, UErrorCode &status) |
michael@0 | 1476 | { |
michael@0 | 1477 | if (U_FAILURE(status)) { |
michael@0 | 1478 | return 0; |
michael@0 | 1479 | } |
michael@0 | 1480 | |
michael@0 | 1481 | RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; |
michael@0 | 1482 | nonConstThis->makeRuleStatusValid(); |
michael@0 | 1483 | int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; |
michael@0 | 1484 | int32_t numValsToCopy = numVals; |
michael@0 | 1485 | if (numVals > capacity) { |
michael@0 | 1486 | status = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1487 | numValsToCopy = capacity; |
michael@0 | 1488 | } |
michael@0 | 1489 | int i; |
michael@0 | 1490 | for (i=0; i<numValsToCopy; i++) { |
michael@0 | 1491 | fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; |
michael@0 | 1492 | } |
michael@0 | 1493 | return numVals; |
michael@0 | 1494 | } |
michael@0 | 1495 | |
michael@0 | 1496 | |
michael@0 | 1497 | |
michael@0 | 1498 | //------------------------------------------------------------------------------- |
michael@0 | 1499 | // |
michael@0 | 1500 | // getBinaryRules Access to the compiled form of the rules, |
michael@0 | 1501 | // for use by build system tools that save the data |
michael@0 | 1502 | // for standard iterator types. |
michael@0 | 1503 | // |
michael@0 | 1504 | //------------------------------------------------------------------------------- |
michael@0 | 1505 | const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { |
michael@0 | 1506 | const uint8_t *retPtr = NULL; |
michael@0 | 1507 | length = 0; |
michael@0 | 1508 | |
michael@0 | 1509 | if (fData != NULL) { |
michael@0 | 1510 | retPtr = (const uint8_t *)fData->fHeader; |
michael@0 | 1511 | length = fData->fHeader->fLength; |
michael@0 | 1512 | } |
michael@0 | 1513 | return retPtr; |
michael@0 | 1514 | } |
michael@0 | 1515 | |
michael@0 | 1516 | |
michael@0 | 1517 | BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, |
michael@0 | 1518 | int32_t &bufferSize, |
michael@0 | 1519 | UErrorCode &status) |
michael@0 | 1520 | { |
michael@0 | 1521 | if (U_FAILURE(status)){ |
michael@0 | 1522 | return NULL; |
michael@0 | 1523 | } |
michael@0 | 1524 | |
michael@0 | 1525 | if (bufferSize == 0) { |
michael@0 | 1526 | bufferSize = 1; // preflighting for deprecated functionality |
michael@0 | 1527 | return NULL; |
michael@0 | 1528 | } |
michael@0 | 1529 | |
michael@0 | 1530 | BreakIterator *clonedBI = clone(); |
michael@0 | 1531 | if (clonedBI == NULL) { |
michael@0 | 1532 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1533 | } else { |
michael@0 | 1534 | status = U_SAFECLONE_ALLOCATED_WARNING; |
michael@0 | 1535 | } |
michael@0 | 1536 | return (RuleBasedBreakIterator *)clonedBI; |
michael@0 | 1537 | } |
michael@0 | 1538 | |
michael@0 | 1539 | |
michael@0 | 1540 | //------------------------------------------------------------------------------- |
michael@0 | 1541 | // |
michael@0 | 1542 | // isDictionaryChar Return true if the category lookup for this char |
michael@0 | 1543 | // indicates that it is in the set of dictionary lookup |
michael@0 | 1544 | // chars. |
michael@0 | 1545 | // |
michael@0 | 1546 | // This function is intended for use by dictionary based |
michael@0 | 1547 | // break iterators. |
michael@0 | 1548 | // |
michael@0 | 1549 | //------------------------------------------------------------------------------- |
michael@0 | 1550 | /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { |
michael@0 | 1551 | if (fData == NULL) { |
michael@0 | 1552 | return FALSE; |
michael@0 | 1553 | } |
michael@0 | 1554 | uint16_t category; |
michael@0 | 1555 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1556 | return (category & 0x4000) != 0; |
michael@0 | 1557 | }*/ |
michael@0 | 1558 | |
michael@0 | 1559 | |
michael@0 | 1560 | //------------------------------------------------------------------------------- |
michael@0 | 1561 | // |
michael@0 | 1562 | // checkDictionary This function handles all processing of characters in |
michael@0 | 1563 | // the "dictionary" set. It will determine the appropriate |
michael@0 | 1564 | // course of action, and possibly set up a cache in the |
michael@0 | 1565 | // process. |
michael@0 | 1566 | // |
michael@0 | 1567 | //------------------------------------------------------------------------------- |
michael@0 | 1568 | int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, |
michael@0 | 1569 | int32_t endPos, |
michael@0 | 1570 | UBool reverse) { |
michael@0 | 1571 | // Reset the old break cache first. |
michael@0 | 1572 | reset(); |
michael@0 | 1573 | |
michael@0 | 1574 | // note: code segment below assumes that dictionary chars are in the |
michael@0 | 1575 | // startPos-endPos range |
michael@0 | 1576 | // value returned should be next character in sequence |
michael@0 | 1577 | if ((endPos - startPos) <= 1) { |
michael@0 | 1578 | return (reverse ? startPos : endPos); |
michael@0 | 1579 | } |
michael@0 | 1580 | |
michael@0 | 1581 | // Bug 5532. The dictionary code will crash if the input text is UTF-8 |
michael@0 | 1582 | // because native indexes are different from UTF-16 indexes. |
michael@0 | 1583 | // Temporary hack: skip dictionary lookup for UTF-8 encoded text. |
michael@0 | 1584 | // It wont give the right breaks, but it's better than a crash. |
michael@0 | 1585 | // |
michael@0 | 1586 | // Check the type of the UText by checking its pFuncs field, which |
michael@0 | 1587 | // is UText's function dispatch table. It will be the same for all |
michael@0 | 1588 | // UTF-8 UTexts and different for any other UText type. |
michael@0 | 1589 | // |
michael@0 | 1590 | // We have no other type of UText available with non-UTF-16 native indexing. |
michael@0 | 1591 | // This whole check will go away once the dictionary code is fixed. |
michael@0 | 1592 | static const void *utext_utf8Funcs; |
michael@0 | 1593 | if (utext_utf8Funcs == NULL) { |
michael@0 | 1594 | // Cache the UTF-8 UText function pointer value. |
michael@0 | 1595 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1596 | UText tempUText = UTEXT_INITIALIZER; |
michael@0 | 1597 | utext_openUTF8(&tempUText, NULL, 0, &status); |
michael@0 | 1598 | utext_utf8Funcs = tempUText.pFuncs; |
michael@0 | 1599 | utext_close(&tempUText); |
michael@0 | 1600 | } |
michael@0 | 1601 | if (fText->pFuncs == utext_utf8Funcs) { |
michael@0 | 1602 | return (reverse ? startPos : endPos); |
michael@0 | 1603 | } |
michael@0 | 1604 | |
michael@0 | 1605 | // Starting from the starting point, scan towards the proposed result, |
michael@0 | 1606 | // looking for the first dictionary character (which may be the one |
michael@0 | 1607 | // we're on, if we're starting in the middle of a range). |
michael@0 | 1608 | utext_setNativeIndex(fText, reverse ? endPos : startPos); |
michael@0 | 1609 | if (reverse) { |
michael@0 | 1610 | UTEXT_PREVIOUS32(fText); |
michael@0 | 1611 | } |
michael@0 | 1612 | |
michael@0 | 1613 | int32_t rangeStart = startPos; |
michael@0 | 1614 | int32_t rangeEnd = endPos; |
michael@0 | 1615 | |
michael@0 | 1616 | uint16_t category; |
michael@0 | 1617 | int32_t current; |
michael@0 | 1618 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1619 | UStack breaks(status); |
michael@0 | 1620 | int32_t foundBreakCount = 0; |
michael@0 | 1621 | UChar32 c = utext_current32(fText); |
michael@0 | 1622 | |
michael@0 | 1623 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1624 | |
michael@0 | 1625 | // Is the character we're starting on a dictionary character? If so, we |
michael@0 | 1626 | // need to back up to include the entire run; otherwise the results of |
michael@0 | 1627 | // the break algorithm will differ depending on where we start. Since |
michael@0 | 1628 | // the result is cached and there is typically a non-dictionary break |
michael@0 | 1629 | // within a small number of words, there should be little performance impact. |
michael@0 | 1630 | if (category & 0x4000) { |
michael@0 | 1631 | if (reverse) { |
michael@0 | 1632 | do { |
michael@0 | 1633 | utext_next32(fText); // TODO: recast to work directly with postincrement. |
michael@0 | 1634 | c = utext_current32(fText); |
michael@0 | 1635 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1636 | } while (c != U_SENTINEL && (category & 0x4000)); |
michael@0 | 1637 | // Back up to the last dictionary character |
michael@0 | 1638 | rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
michael@0 | 1639 | if (c == U_SENTINEL) { |
michael@0 | 1640 | // c = fText->last32(); |
michael@0 | 1641 | // TODO: why was this if needed? |
michael@0 | 1642 | c = UTEXT_PREVIOUS32(fText); |
michael@0 | 1643 | } |
michael@0 | 1644 | else { |
michael@0 | 1645 | c = UTEXT_PREVIOUS32(fText); |
michael@0 | 1646 | } |
michael@0 | 1647 | } |
michael@0 | 1648 | else { |
michael@0 | 1649 | do { |
michael@0 | 1650 | c = UTEXT_PREVIOUS32(fText); |
michael@0 | 1651 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1652 | } |
michael@0 | 1653 | while (c != U_SENTINEL && (category & 0x4000)); |
michael@0 | 1654 | // Back up to the last dictionary character |
michael@0 | 1655 | if (c == U_SENTINEL) { |
michael@0 | 1656 | // c = fText->first32(); |
michael@0 | 1657 | c = utext_current32(fText); |
michael@0 | 1658 | } |
michael@0 | 1659 | else { |
michael@0 | 1660 | utext_next32(fText); |
michael@0 | 1661 | c = utext_current32(fText); |
michael@0 | 1662 | } |
michael@0 | 1663 | rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; |
michael@0 | 1664 | } |
michael@0 | 1665 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1666 | } |
michael@0 | 1667 | |
michael@0 | 1668 | // Loop through the text, looking for ranges of dictionary characters. |
michael@0 | 1669 | // For each span, find the appropriate break engine, and ask it to find |
michael@0 | 1670 | // any breaks within the span. |
michael@0 | 1671 | // Note: we always do this in the forward direction, so that the break |
michael@0 | 1672 | // cache is built in the right order. |
michael@0 | 1673 | if (reverse) { |
michael@0 | 1674 | utext_setNativeIndex(fText, rangeStart); |
michael@0 | 1675 | c = utext_current32(fText); |
michael@0 | 1676 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1677 | } |
michael@0 | 1678 | while(U_SUCCESS(status)) { |
michael@0 | 1679 | while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { |
michael@0 | 1680 | utext_next32(fText); // TODO: tweak for post-increment operation |
michael@0 | 1681 | c = utext_current32(fText); |
michael@0 | 1682 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1683 | } |
michael@0 | 1684 | if (current >= rangeEnd) { |
michael@0 | 1685 | break; |
michael@0 | 1686 | } |
michael@0 | 1687 | |
michael@0 | 1688 | // We now have a dictionary character. Get the appropriate language object |
michael@0 | 1689 | // to deal with it. |
michael@0 | 1690 | const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); |
michael@0 | 1691 | |
michael@0 | 1692 | // Ask the language object if there are any breaks. It will leave the text |
michael@0 | 1693 | // pointer on the other side of its range, ready to search for the next one. |
michael@0 | 1694 | if (lbe != NULL) { |
michael@0 | 1695 | foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); |
michael@0 | 1696 | } |
michael@0 | 1697 | |
michael@0 | 1698 | // Reload the loop variables for the next go-round |
michael@0 | 1699 | c = utext_current32(fText); |
michael@0 | 1700 | UTRIE_GET16(&fData->fTrie, c, category); |
michael@0 | 1701 | } |
michael@0 | 1702 | |
michael@0 | 1703 | // If we found breaks, build a new break cache. The first and last entries must |
michael@0 | 1704 | // be the original starting and ending position. |
michael@0 | 1705 | if (foundBreakCount > 0) { |
michael@0 | 1706 | int32_t totalBreaks = foundBreakCount; |
michael@0 | 1707 | if (startPos < breaks.elementAti(0)) { |
michael@0 | 1708 | totalBreaks += 1; |
michael@0 | 1709 | } |
michael@0 | 1710 | if (endPos > breaks.peeki()) { |
michael@0 | 1711 | totalBreaks += 1; |
michael@0 | 1712 | } |
michael@0 | 1713 | fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); |
michael@0 | 1714 | if (fCachedBreakPositions != NULL) { |
michael@0 | 1715 | int32_t out = 0; |
michael@0 | 1716 | fNumCachedBreakPositions = totalBreaks; |
michael@0 | 1717 | if (startPos < breaks.elementAti(0)) { |
michael@0 | 1718 | fCachedBreakPositions[out++] = startPos; |
michael@0 | 1719 | } |
michael@0 | 1720 | for (int32_t i = 0; i < foundBreakCount; ++i) { |
michael@0 | 1721 | fCachedBreakPositions[out++] = breaks.elementAti(i); |
michael@0 | 1722 | } |
michael@0 | 1723 | if (endPos > fCachedBreakPositions[out-1]) { |
michael@0 | 1724 | fCachedBreakPositions[out] = endPos; |
michael@0 | 1725 | } |
michael@0 | 1726 | // If there are breaks, then by definition, we are replacing the original |
michael@0 | 1727 | // proposed break by one of the breaks we found. Use following() and |
michael@0 | 1728 | // preceding() to do the work. They should never recurse in this case. |
michael@0 | 1729 | if (reverse) { |
michael@0 | 1730 | return preceding(endPos); |
michael@0 | 1731 | } |
michael@0 | 1732 | else { |
michael@0 | 1733 | return following(startPos); |
michael@0 | 1734 | } |
michael@0 | 1735 | } |
michael@0 | 1736 | // If the allocation failed, just fall through to the "no breaks found" case. |
michael@0 | 1737 | } |
michael@0 | 1738 | |
michael@0 | 1739 | // If we get here, there were no language-based breaks. Set the text pointer |
michael@0 | 1740 | // to the original proposed break. |
michael@0 | 1741 | utext_setNativeIndex(fText, reverse ? startPos : endPos); |
michael@0 | 1742 | return (reverse ? startPos : endPos); |
michael@0 | 1743 | } |
michael@0 | 1744 | |
michael@0 | 1745 | // defined in ucln_cmn.h |
michael@0 | 1746 | |
michael@0 | 1747 | U_NAMESPACE_END |
michael@0 | 1748 | |
michael@0 | 1749 | |
michael@0 | 1750 | static icu::UStack *gLanguageBreakFactories = NULL; |
michael@0 | 1751 | static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; |
michael@0 | 1752 | |
michael@0 | 1753 | /** |
michael@0 | 1754 | * Release all static memory held by breakiterator. |
michael@0 | 1755 | */ |
michael@0 | 1756 | U_CDECL_BEGIN |
michael@0 | 1757 | static UBool U_CALLCONV breakiterator_cleanup_dict(void) { |
michael@0 | 1758 | if (gLanguageBreakFactories) { |
michael@0 | 1759 | delete gLanguageBreakFactories; |
michael@0 | 1760 | gLanguageBreakFactories = NULL; |
michael@0 | 1761 | } |
michael@0 | 1762 | gLanguageBreakFactoriesInitOnce.reset(); |
michael@0 | 1763 | return TRUE; |
michael@0 | 1764 | } |
michael@0 | 1765 | U_CDECL_END |
michael@0 | 1766 | |
michael@0 | 1767 | U_CDECL_BEGIN |
michael@0 | 1768 | static void U_CALLCONV _deleteFactory(void *obj) { |
michael@0 | 1769 | delete (icu::LanguageBreakFactory *) obj; |
michael@0 | 1770 | } |
michael@0 | 1771 | U_CDECL_END |
michael@0 | 1772 | U_NAMESPACE_BEGIN |
michael@0 | 1773 | |
michael@0 | 1774 | static void U_CALLCONV initLanguageFactories() { |
michael@0 | 1775 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1776 | U_ASSERT(gLanguageBreakFactories == NULL); |
michael@0 | 1777 | gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); |
michael@0 | 1778 | if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { |
michael@0 | 1779 | ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); |
michael@0 | 1780 | gLanguageBreakFactories->push(builtIn, status); |
michael@0 | 1781 | #ifdef U_LOCAL_SERVICE_HOOK |
michael@0 | 1782 | LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); |
michael@0 | 1783 | if (extra != NULL) { |
michael@0 | 1784 | gLanguageBreakFactories->push(extra, status); |
michael@0 | 1785 | } |
michael@0 | 1786 | #endif |
michael@0 | 1787 | } |
michael@0 | 1788 | ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); |
michael@0 | 1789 | } |
michael@0 | 1790 | |
michael@0 | 1791 | |
michael@0 | 1792 | static const LanguageBreakEngine* |
michael@0 | 1793 | getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) |
michael@0 | 1794 | { |
michael@0 | 1795 | umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); |
michael@0 | 1796 | if (gLanguageBreakFactories == NULL) { |
michael@0 | 1797 | return NULL; |
michael@0 | 1798 | } |
michael@0 | 1799 | |
michael@0 | 1800 | int32_t i = gLanguageBreakFactories->size(); |
michael@0 | 1801 | const LanguageBreakEngine *lbe = NULL; |
michael@0 | 1802 | while (--i >= 0) { |
michael@0 | 1803 | LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); |
michael@0 | 1804 | lbe = factory->getEngineFor(c, breakType); |
michael@0 | 1805 | if (lbe != NULL) { |
michael@0 | 1806 | break; |
michael@0 | 1807 | } |
michael@0 | 1808 | } |
michael@0 | 1809 | return lbe; |
michael@0 | 1810 | } |
michael@0 | 1811 | |
michael@0 | 1812 | |
michael@0 | 1813 | //------------------------------------------------------------------------------- |
michael@0 | 1814 | // |
michael@0 | 1815 | // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the |
michael@0 | 1816 | // the character c. |
michael@0 | 1817 | // |
michael@0 | 1818 | //------------------------------------------------------------------------------- |
michael@0 | 1819 | const LanguageBreakEngine * |
michael@0 | 1820 | RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { |
michael@0 | 1821 | const LanguageBreakEngine *lbe = NULL; |
michael@0 | 1822 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1823 | |
michael@0 | 1824 | if (fLanguageBreakEngines == NULL) { |
michael@0 | 1825 | fLanguageBreakEngines = new UStack(status); |
michael@0 | 1826 | if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { |
michael@0 | 1827 | delete fLanguageBreakEngines; |
michael@0 | 1828 | fLanguageBreakEngines = 0; |
michael@0 | 1829 | return NULL; |
michael@0 | 1830 | } |
michael@0 | 1831 | } |
michael@0 | 1832 | |
michael@0 | 1833 | int32_t i = fLanguageBreakEngines->size(); |
michael@0 | 1834 | while (--i >= 0) { |
michael@0 | 1835 | lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); |
michael@0 | 1836 | if (lbe->handles(c, fBreakType)) { |
michael@0 | 1837 | return lbe; |
michael@0 | 1838 | } |
michael@0 | 1839 | } |
michael@0 | 1840 | |
michael@0 | 1841 | // No existing dictionary took the character. See if a factory wants to |
michael@0 | 1842 | // give us a new LanguageBreakEngine for this character. |
michael@0 | 1843 | lbe = getLanguageBreakEngineFromFactory(c, fBreakType); |
michael@0 | 1844 | |
michael@0 | 1845 | // If we got one, use it and push it on our stack. |
michael@0 | 1846 | if (lbe != NULL) { |
michael@0 | 1847 | fLanguageBreakEngines->push((void *)lbe, status); |
michael@0 | 1848 | // Even if we can't remember it, we can keep looking it up, so |
michael@0 | 1849 | // return it even if the push fails. |
michael@0 | 1850 | return lbe; |
michael@0 | 1851 | } |
michael@0 | 1852 | |
michael@0 | 1853 | // No engine is forthcoming for this character. Add it to the |
michael@0 | 1854 | // reject set. Create the reject break engine if needed. |
michael@0 | 1855 | if (fUnhandledBreakEngine == NULL) { |
michael@0 | 1856 | fUnhandledBreakEngine = new UnhandledEngine(status); |
michael@0 | 1857 | if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { |
michael@0 | 1858 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1859 | } |
michael@0 | 1860 | // Put it last so that scripts for which we have an engine get tried |
michael@0 | 1861 | // first. |
michael@0 | 1862 | fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); |
michael@0 | 1863 | // If we can't insert it, or creation failed, get rid of it |
michael@0 | 1864 | if (U_FAILURE(status)) { |
michael@0 | 1865 | delete fUnhandledBreakEngine; |
michael@0 | 1866 | fUnhandledBreakEngine = 0; |
michael@0 | 1867 | return NULL; |
michael@0 | 1868 | } |
michael@0 | 1869 | } |
michael@0 | 1870 | |
michael@0 | 1871 | // Tell the reject engine about the character; at its discretion, it may |
michael@0 | 1872 | // add more than just the one character. |
michael@0 | 1873 | fUnhandledBreakEngine->handleCharacter(c, fBreakType); |
michael@0 | 1874 | |
michael@0 | 1875 | return fUnhandledBreakEngine; |
michael@0 | 1876 | } |
michael@0 | 1877 | |
michael@0 | 1878 | |
michael@0 | 1879 | |
michael@0 | 1880 | /*int32_t RuleBasedBreakIterator::getBreakType() const { |
michael@0 | 1881 | return fBreakType; |
michael@0 | 1882 | }*/ |
michael@0 | 1883 | |
michael@0 | 1884 | void RuleBasedBreakIterator::setBreakType(int32_t type) { |
michael@0 | 1885 | fBreakType = type; |
michael@0 | 1886 | reset(); |
michael@0 | 1887 | } |
michael@0 | 1888 | |
michael@0 | 1889 | U_NAMESPACE_END |
michael@0 | 1890 | |
michael@0 | 1891 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |