intl/icu/source/common/rbbi.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 ***************************************************************************
michael@0 3 * Copyright (C) 1999-2013 International Business Machines Corporation
michael@0 4 * and others. All rights reserved.
michael@0 5 ***************************************************************************
michael@0 6 */
michael@0 7 //
michael@0 8 // file: rbbi.cpp Contains the implementation of the rule based break iterator
michael@0 9 // runtime engine and the API implementation for
michael@0 10 // class RuleBasedBreakIterator
michael@0 11 //
michael@0 12
michael@0 13 #include "utypeinfo.h" // for 'typeid' to work
michael@0 14
michael@0 15 #include "unicode/utypes.h"
michael@0 16
michael@0 17 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 18
michael@0 19 #include "unicode/rbbi.h"
michael@0 20 #include "unicode/schriter.h"
michael@0 21 #include "unicode/uchriter.h"
michael@0 22 #include "unicode/udata.h"
michael@0 23 #include "unicode/uclean.h"
michael@0 24 #include "rbbidata.h"
michael@0 25 #include "rbbirb.h"
michael@0 26 #include "cmemory.h"
michael@0 27 #include "cstring.h"
michael@0 28 #include "umutex.h"
michael@0 29 #include "ucln_cmn.h"
michael@0 30 #include "brkeng.h"
michael@0 31
michael@0 32 #include "uassert.h"
michael@0 33 #include "uvector.h"
michael@0 34
michael@0 35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
michael@0 36 #if U_LOCAL_SERVICE_HOOK
michael@0 37 #include "localsvc.h"
michael@0 38 #endif
michael@0 39
michael@0 40 #ifdef RBBI_DEBUG
michael@0 41 static UBool fTrace = FALSE;
michael@0 42 #endif
michael@0 43
michael@0 44 U_NAMESPACE_BEGIN
michael@0 45
michael@0 46 // The state number of the starting state
michael@0 47 #define START_STATE 1
michael@0 48
michael@0 49 // The state-transition value indicating "stop"
michael@0 50 #define STOP_STATE 0
michael@0 51
michael@0 52
michael@0 53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
michael@0 54
michael@0 55
michael@0 56 //=======================================================================
michael@0 57 // constructors
michael@0 58 //=======================================================================
michael@0 59
michael@0 60 /**
michael@0 61 * Constructs a RuleBasedBreakIterator that uses the already-created
michael@0 62 * tables object that is passed in as a parameter.
michael@0 63 */
michael@0 64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
michael@0 65 {
michael@0 66 init();
michael@0 67 fData = new RBBIDataWrapper(data, status); // status checked in constructor
michael@0 68 if (U_FAILURE(status)) {return;}
michael@0 69 if(fData == 0) {
michael@0 70 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 71 return;
michael@0 72 }
michael@0 73 }
michael@0 74
michael@0 75 /**
michael@0 76 * Same as above but does not adopt memory
michael@0 77 */
michael@0 78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
michael@0 79 {
michael@0 80 init();
michael@0 81 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
michael@0 82 if (U_FAILURE(status)) {return;}
michael@0 83 if(fData == 0) {
michael@0 84 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 85 return;
michael@0 86 }
michael@0 87 }
michael@0 88
michael@0 89
michael@0 90 //
michael@0 91 // Construct from precompiled binary rules (tables). This constructor is public API,
michael@0 92 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
michael@0 93 //
michael@0 94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
michael@0 95 uint32_t ruleLength,
michael@0 96 UErrorCode &status) {
michael@0 97 init();
michael@0 98 if (U_FAILURE(status)) {
michael@0 99 return;
michael@0 100 }
michael@0 101 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
michael@0 102 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 103 return;
michael@0 104 }
michael@0 105 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
michael@0 106 if (data->fLength > ruleLength) {
michael@0 107 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 108 return;
michael@0 109 }
michael@0 110 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
michael@0 111 if (U_FAILURE(status)) {return;}
michael@0 112 if(fData == 0) {
michael@0 113 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 114 return;
michael@0 115 }
michael@0 116 }
michael@0 117
michael@0 118
michael@0 119 //-------------------------------------------------------------------------------
michael@0 120 //
michael@0 121 // Constructor from a UDataMemory handle to precompiled break rules
michael@0 122 // stored in an ICU data file.
michael@0 123 //
michael@0 124 //-------------------------------------------------------------------------------
michael@0 125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
michael@0 126 {
michael@0 127 init();
michael@0 128 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
michael@0 129 if (U_FAILURE(status)) {return;}
michael@0 130 if(fData == 0) {
michael@0 131 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 132 return;
michael@0 133 }
michael@0 134 }
michael@0 135
michael@0 136
michael@0 137
michael@0 138 //-------------------------------------------------------------------------------
michael@0 139 //
michael@0 140 // Constructor from a set of rules supplied as a string.
michael@0 141 //
michael@0 142 //-------------------------------------------------------------------------------
michael@0 143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
michael@0 144 UParseError &parseError,
michael@0 145 UErrorCode &status)
michael@0 146 {
michael@0 147 init();
michael@0 148 if (U_FAILURE(status)) {return;}
michael@0 149 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
michael@0 150 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
michael@0 151 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
michael@0 152 // creates and returns a complete RBBI. From here, in a constructor, we
michael@0 153 // can't just return the object created by the builder factory, hence
michael@0 154 // the assignment of the factory created object to "this".
michael@0 155 if (U_SUCCESS(status)) {
michael@0 156 *this = *bi;
michael@0 157 delete bi;
michael@0 158 }
michael@0 159 }
michael@0 160
michael@0 161
michael@0 162 //-------------------------------------------------------------------------------
michael@0 163 //
michael@0 164 // Default Constructor. Create an empty shell that can be set up later.
michael@0 165 // Used when creating a RuleBasedBreakIterator from a set
michael@0 166 // of rules.
michael@0 167 //-------------------------------------------------------------------------------
michael@0 168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
michael@0 169 init();
michael@0 170 }
michael@0 171
michael@0 172
michael@0 173 //-------------------------------------------------------------------------------
michael@0 174 //
michael@0 175 // Copy constructor. Will produce a break iterator with the same behavior,
michael@0 176 // and which iterates over the same text, as the one passed in.
michael@0 177 //
michael@0 178 //-------------------------------------------------------------------------------
michael@0 179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
michael@0 180 : BreakIterator(other)
michael@0 181 {
michael@0 182 this->init();
michael@0 183 *this = other;
michael@0 184 }
michael@0 185
michael@0 186
michael@0 187 /**
michael@0 188 * Destructor
michael@0 189 */
michael@0 190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
michael@0 191 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
michael@0 192 // fCharIter was adopted from the outside.
michael@0 193 delete fCharIter;
michael@0 194 }
michael@0 195 fCharIter = NULL;
michael@0 196 delete fSCharIter;
michael@0 197 fCharIter = NULL;
michael@0 198 delete fDCharIter;
michael@0 199 fDCharIter = NULL;
michael@0 200
michael@0 201 utext_close(fText);
michael@0 202
michael@0 203 if (fData != NULL) {
michael@0 204 fData->removeReference();
michael@0 205 fData = NULL;
michael@0 206 }
michael@0 207 if (fCachedBreakPositions) {
michael@0 208 uprv_free(fCachedBreakPositions);
michael@0 209 fCachedBreakPositions = NULL;
michael@0 210 }
michael@0 211 if (fLanguageBreakEngines) {
michael@0 212 delete fLanguageBreakEngines;
michael@0 213 fLanguageBreakEngines = NULL;
michael@0 214 }
michael@0 215 if (fUnhandledBreakEngine) {
michael@0 216 delete fUnhandledBreakEngine;
michael@0 217 fUnhandledBreakEngine = NULL;
michael@0 218 }
michael@0 219 }
michael@0 220
michael@0 221 /**
michael@0 222 * Assignment operator. Sets this iterator to have the same behavior,
michael@0 223 * and iterate over the same text, as the one passed in.
michael@0 224 */
michael@0 225 RuleBasedBreakIterator&
michael@0 226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
michael@0 227 if (this == &that) {
michael@0 228 return *this;
michael@0 229 }
michael@0 230 reset(); // Delete break cache information
michael@0 231 fBreakType = that.fBreakType;
michael@0 232 if (fLanguageBreakEngines != NULL) {
michael@0 233 delete fLanguageBreakEngines;
michael@0 234 fLanguageBreakEngines = NULL; // Just rebuild for now
michael@0 235 }
michael@0 236 // TODO: clone fLanguageBreakEngines from "that"
michael@0 237 UErrorCode status = U_ZERO_ERROR;
michael@0 238 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
michael@0 239
michael@0 240 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
michael@0 241 delete fCharIter;
michael@0 242 }
michael@0 243 fCharIter = NULL;
michael@0 244
michael@0 245 if (that.fCharIter != NULL ) {
michael@0 246 // This is a little bit tricky - it will intially appear that
michael@0 247 // this->fCharIter is adopted, even if that->fCharIter was
michael@0 248 // not adopted. That's ok.
michael@0 249 fCharIter = that.fCharIter->clone();
michael@0 250 }
michael@0 251
michael@0 252 if (fData != NULL) {
michael@0 253 fData->removeReference();
michael@0 254 fData = NULL;
michael@0 255 }
michael@0 256 if (that.fData != NULL) {
michael@0 257 fData = that.fData->addReference();
michael@0 258 }
michael@0 259
michael@0 260 return *this;
michael@0 261 }
michael@0 262
michael@0 263
michael@0 264
michael@0 265 //-----------------------------------------------------------------------------
michael@0 266 //
michael@0 267 // init() Shared initialization routine. Used by all the constructors.
michael@0 268 // Initializes all fields, leaving the object in a consistent state.
michael@0 269 //
michael@0 270 //-----------------------------------------------------------------------------
michael@0 271 void RuleBasedBreakIterator::init() {
michael@0 272 UErrorCode status = U_ZERO_ERROR;
michael@0 273 fText = utext_openUChars(NULL, NULL, 0, &status);
michael@0 274 fCharIter = NULL;
michael@0 275 fSCharIter = NULL;
michael@0 276 fDCharIter = NULL;
michael@0 277 fData = NULL;
michael@0 278 fLastRuleStatusIndex = 0;
michael@0 279 fLastStatusIndexValid = TRUE;
michael@0 280 fDictionaryCharCount = 0;
michael@0 281 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
michael@0 282 // dictionary behavior for Break Iterators that are
michael@0 283 // built from rules. Even better would be the ability to
michael@0 284 // declare the type in the rules.
michael@0 285
michael@0 286 fCachedBreakPositions = NULL;
michael@0 287 fLanguageBreakEngines = NULL;
michael@0 288 fUnhandledBreakEngine = NULL;
michael@0 289 fNumCachedBreakPositions = 0;
michael@0 290 fPositionInCache = 0;
michael@0 291
michael@0 292 #ifdef RBBI_DEBUG
michael@0 293 static UBool debugInitDone = FALSE;
michael@0 294 if (debugInitDone == FALSE) {
michael@0 295 char *debugEnv = getenv("U_RBBIDEBUG");
michael@0 296 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
michael@0 297 fTrace = TRUE;
michael@0 298 }
michael@0 299 debugInitDone = TRUE;
michael@0 300 }
michael@0 301 #endif
michael@0 302 }
michael@0 303
michael@0 304
michael@0 305
michael@0 306 //-----------------------------------------------------------------------------
michael@0 307 //
michael@0 308 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
michael@0 309 // behavior, and iterating over the same text, as this one.
michael@0 310 // Virtual function: does the right thing with subclasses.
michael@0 311 //
michael@0 312 //-----------------------------------------------------------------------------
michael@0 313 BreakIterator*
michael@0 314 RuleBasedBreakIterator::clone(void) const {
michael@0 315 return new RuleBasedBreakIterator(*this);
michael@0 316 }
michael@0 317
michael@0 318 /**
michael@0 319 * Equality operator. Returns TRUE if both BreakIterators are of the
michael@0 320 * same class, have the same behavior, and iterate over the same text.
michael@0 321 */
michael@0 322 UBool
michael@0 323 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
michael@0 324 if (typeid(*this) != typeid(that)) {
michael@0 325 return FALSE;
michael@0 326 }
michael@0 327
michael@0 328 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
michael@0 329
michael@0 330 if (!utext_equals(fText, that2.fText)) {
michael@0 331 // The two break iterators are operating on different text,
michael@0 332 // or have a different interation position.
michael@0 333 return FALSE;
michael@0 334 };
michael@0 335
michael@0 336 // TODO: need a check for when in a dictionary region at different offsets.
michael@0 337
michael@0 338 if (that2.fData == fData ||
michael@0 339 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
michael@0 340 // The two break iterators are using the same rules.
michael@0 341 return TRUE;
michael@0 342 }
michael@0 343 return FALSE;
michael@0 344 }
michael@0 345
michael@0 346 /**
michael@0 347 * Compute a hash code for this BreakIterator
michael@0 348 * @return A hash code
michael@0 349 */
michael@0 350 int32_t
michael@0 351 RuleBasedBreakIterator::hashCode(void) const {
michael@0 352 int32_t hash = 0;
michael@0 353 if (fData != NULL) {
michael@0 354 hash = fData->hashCode();
michael@0 355 }
michael@0 356 return hash;
michael@0 357 }
michael@0 358
michael@0 359
michael@0 360 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
michael@0 361 if (U_FAILURE(status)) {
michael@0 362 return;
michael@0 363 }
michael@0 364 reset();
michael@0 365 fText = utext_clone(fText, ut, FALSE, TRUE, &status);
michael@0 366
michael@0 367 // Set up a dummy CharacterIterator to be returned if anyone
michael@0 368 // calls getText(). With input from UText, there is no reasonable
michael@0 369 // way to return a characterIterator over the actual input text.
michael@0 370 // Return one over an empty string instead - this is the closest
michael@0 371 // we can come to signaling a failure.
michael@0 372 // (GetText() is obsolete, this failure is sort of OK)
michael@0 373 if (fDCharIter == NULL) {
michael@0 374 static const UChar c = 0;
michael@0 375 fDCharIter = new UCharCharacterIterator(&c, 0);
michael@0 376 if (fDCharIter == NULL) {
michael@0 377 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 378 return;
michael@0 379 }
michael@0 380 }
michael@0 381
michael@0 382 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
michael@0 383 // existing fCharIter was adopted from the outside. Delete it now.
michael@0 384 delete fCharIter;
michael@0 385 }
michael@0 386 fCharIter = fDCharIter;
michael@0 387
michael@0 388 this->first();
michael@0 389 }
michael@0 390
michael@0 391
michael@0 392 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
michael@0 393 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
michael@0 394 return result;
michael@0 395 }
michael@0 396
michael@0 397
michael@0 398
michael@0 399 /**
michael@0 400 * Returns the description used to create this iterator
michael@0 401 */
michael@0 402 const UnicodeString&
michael@0 403 RuleBasedBreakIterator::getRules() const {
michael@0 404 if (fData != NULL) {
michael@0 405 return fData->getRuleSourceString();
michael@0 406 } else {
michael@0 407 static const UnicodeString *s;
michael@0 408 if (s == NULL) {
michael@0 409 // TODO: something more elegant here.
michael@0 410 // perhaps API should return the string by value.
michael@0 411 // Note: thread unsafe init & leak are semi-ok, better than
michael@0 412 // what was before. Sould be cleaned up, though.
michael@0 413 s = new UnicodeString;
michael@0 414 }
michael@0 415 return *s;
michael@0 416 }
michael@0 417 }
michael@0 418
michael@0 419 //=======================================================================
michael@0 420 // BreakIterator overrides
michael@0 421 //=======================================================================
michael@0 422
michael@0 423 /**
michael@0 424 * Return a CharacterIterator over the text being analyzed.
michael@0 425 */
michael@0 426 CharacterIterator&
michael@0 427 RuleBasedBreakIterator::getText() const {
michael@0 428 return *fCharIter;
michael@0 429 }
michael@0 430
michael@0 431 /**
michael@0 432 * Set the iterator to analyze a new piece of text. This function resets
michael@0 433 * the current iteration position to the beginning of the text.
michael@0 434 * @param newText An iterator over the text to analyze.
michael@0 435 */
michael@0 436 void
michael@0 437 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
michael@0 438 // If we are holding a CharacterIterator adopted from a
michael@0 439 // previous call to this function, delete it now.
michael@0 440 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
michael@0 441 delete fCharIter;
michael@0 442 }
michael@0 443
michael@0 444 fCharIter = newText;
michael@0 445 UErrorCode status = U_ZERO_ERROR;
michael@0 446 reset();
michael@0 447 if (newText==NULL || newText->startIndex() != 0) {
michael@0 448 // startIndex !=0 wants to be an error, but there's no way to report it.
michael@0 449 // Make the iterator text be an empty string.
michael@0 450 fText = utext_openUChars(fText, NULL, 0, &status);
michael@0 451 } else {
michael@0 452 fText = utext_openCharacterIterator(fText, newText, &status);
michael@0 453 }
michael@0 454 this->first();
michael@0 455 }
michael@0 456
michael@0 457 /**
michael@0 458 * Set the iterator to analyze a new piece of text. This function resets
michael@0 459 * the current iteration position to the beginning of the text.
michael@0 460 * @param newText An iterator over the text to analyze.
michael@0 461 */
michael@0 462 void
michael@0 463 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
michael@0 464 UErrorCode status = U_ZERO_ERROR;
michael@0 465 reset();
michael@0 466 fText = utext_openConstUnicodeString(fText, &newText, &status);
michael@0 467
michael@0 468 // Set up a character iterator on the string.
michael@0 469 // Needed in case someone calls getText().
michael@0 470 // Can not, unfortunately, do this lazily on the (probably never)
michael@0 471 // call to getText(), because getText is const.
michael@0 472 if (fSCharIter == NULL) {
michael@0 473 fSCharIter = new StringCharacterIterator(newText);
michael@0 474 } else {
michael@0 475 fSCharIter->setText(newText);
michael@0 476 }
michael@0 477
michael@0 478 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
michael@0 479 // old fCharIter was adopted from the outside. Delete it.
michael@0 480 delete fCharIter;
michael@0 481 }
michael@0 482 fCharIter = fSCharIter;
michael@0 483
michael@0 484 this->first();
michael@0 485 }
michael@0 486
michael@0 487
michael@0 488 /**
michael@0 489 * Provide a new UText for the input text. Must reference text with contents identical
michael@0 490 * to the original.
michael@0 491 * Intended for use with text data originating in Java (garbage collected) environments
michael@0 492 * where the data may be moved in memory at arbitrary times.
michael@0 493 */
michael@0 494 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
michael@0 495 if (U_FAILURE(status)) {
michael@0 496 return *this;
michael@0 497 }
michael@0 498 if (input == NULL) {
michael@0 499 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 500 return *this;
michael@0 501 }
michael@0 502 int64_t pos = utext_getNativeIndex(fText);
michael@0 503 // Shallow read-only clone of the new UText into the existing input UText
michael@0 504 fText = utext_clone(fText, input, FALSE, TRUE, &status);
michael@0 505 if (U_FAILURE(status)) {
michael@0 506 return *this;
michael@0 507 }
michael@0 508 utext_setNativeIndex(fText, pos);
michael@0 509 if (utext_getNativeIndex(fText) != pos) {
michael@0 510 // Sanity check. The new input utext is supposed to have the exact same
michael@0 511 // contents as the old. If we can't set to the same position, it doesn't.
michael@0 512 // The contents underlying the old utext might be invalid at this point,
michael@0 513 // so it's not safe to check directly.
michael@0 514 status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 515 }
michael@0 516 return *this;
michael@0 517 }
michael@0 518
michael@0 519
michael@0 520 /**
michael@0 521 * Sets the current iteration position to the beginning of the text.
michael@0 522 * @return The offset of the beginning of the text.
michael@0 523 */
michael@0 524 int32_t RuleBasedBreakIterator::first(void) {
michael@0 525 reset();
michael@0 526 fLastRuleStatusIndex = 0;
michael@0 527 fLastStatusIndexValid = TRUE;
michael@0 528 //if (fText == NULL)
michael@0 529 // return BreakIterator::DONE;
michael@0 530
michael@0 531 utext_setNativeIndex(fText, 0);
michael@0 532 return 0;
michael@0 533 }
michael@0 534
michael@0 535 /**
michael@0 536 * Sets the current iteration position to the end of the text.
michael@0 537 * @return The text's past-the-end offset.
michael@0 538 */
michael@0 539 int32_t RuleBasedBreakIterator::last(void) {
michael@0 540 reset();
michael@0 541 if (fText == NULL) {
michael@0 542 fLastRuleStatusIndex = 0;
michael@0 543 fLastStatusIndexValid = TRUE;
michael@0 544 return BreakIterator::DONE;
michael@0 545 }
michael@0 546
michael@0 547 fLastStatusIndexValid = FALSE;
michael@0 548 int32_t pos = (int32_t)utext_nativeLength(fText);
michael@0 549 utext_setNativeIndex(fText, pos);
michael@0 550 return pos;
michael@0 551 }
michael@0 552
michael@0 553 /**
michael@0 554 * Advances the iterator either forward or backward the specified number of steps.
michael@0 555 * Negative values move backward, and positive values move forward. This is
michael@0 556 * equivalent to repeatedly calling next() or previous().
michael@0 557 * @param n The number of steps to move. The sign indicates the direction
michael@0 558 * (negative is backwards, and positive is forwards).
michael@0 559 * @return The character offset of the boundary position n boundaries away from
michael@0 560 * the current one.
michael@0 561 */
michael@0 562 int32_t RuleBasedBreakIterator::next(int32_t n) {
michael@0 563 int32_t result = current();
michael@0 564 while (n > 0) {
michael@0 565 result = next();
michael@0 566 --n;
michael@0 567 }
michael@0 568 while (n < 0) {
michael@0 569 result = previous();
michael@0 570 ++n;
michael@0 571 }
michael@0 572 return result;
michael@0 573 }
michael@0 574
michael@0 575 /**
michael@0 576 * Advances the iterator to the next boundary position.
michael@0 577 * @return The position of the first boundary after this one.
michael@0 578 */
michael@0 579 int32_t RuleBasedBreakIterator::next(void) {
michael@0 580 // if we have cached break positions and we're still in the range
michael@0 581 // covered by them, just move one step forward in the cache
michael@0 582 if (fCachedBreakPositions != NULL) {
michael@0 583 if (fPositionInCache < fNumCachedBreakPositions - 1) {
michael@0 584 ++fPositionInCache;
michael@0 585 int32_t pos = fCachedBreakPositions[fPositionInCache];
michael@0 586 utext_setNativeIndex(fText, pos);
michael@0 587 return pos;
michael@0 588 }
michael@0 589 else {
michael@0 590 reset();
michael@0 591 }
michael@0 592 }
michael@0 593
michael@0 594 int32_t startPos = current();
michael@0 595 int32_t result = handleNext(fData->fForwardTable);
michael@0 596 if (fDictionaryCharCount > 0) {
michael@0 597 result = checkDictionary(startPos, result, FALSE);
michael@0 598 }
michael@0 599 return result;
michael@0 600 }
michael@0 601
michael@0 602 /**
michael@0 603 * Advances the iterator backwards, to the last boundary preceding this one.
michael@0 604 * @return The position of the last boundary position preceding this one.
michael@0 605 */
michael@0 606 int32_t RuleBasedBreakIterator::previous(void) {
michael@0 607 int32_t result;
michael@0 608 int32_t startPos;
michael@0 609
michael@0 610 // if we have cached break positions and we're still in the range
michael@0 611 // covered by them, just move one step backward in the cache
michael@0 612 if (fCachedBreakPositions != NULL) {
michael@0 613 if (fPositionInCache > 0) {
michael@0 614 --fPositionInCache;
michael@0 615 // If we're at the beginning of the cache, need to reevaluate the
michael@0 616 // rule status
michael@0 617 if (fPositionInCache <= 0) {
michael@0 618 fLastStatusIndexValid = FALSE;
michael@0 619 }
michael@0 620 int32_t pos = fCachedBreakPositions[fPositionInCache];
michael@0 621 utext_setNativeIndex(fText, pos);
michael@0 622 return pos;
michael@0 623 }
michael@0 624 else {
michael@0 625 reset();
michael@0 626 }
michael@0 627 }
michael@0 628
michael@0 629 // if we're already sitting at the beginning of the text, return DONE
michael@0 630 if (fText == NULL || (startPos = current()) == 0) {
michael@0 631 fLastRuleStatusIndex = 0;
michael@0 632 fLastStatusIndexValid = TRUE;
michael@0 633 return BreakIterator::DONE;
michael@0 634 }
michael@0 635
michael@0 636 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
michael@0 637 result = handlePrevious(fData->fReverseTable);
michael@0 638 if (fDictionaryCharCount > 0) {
michael@0 639 result = checkDictionary(result, startPos, TRUE);
michael@0 640 }
michael@0 641 return result;
michael@0 642 }
michael@0 643
michael@0 644 // old rule syntax
michael@0 645 // set things up. handlePrevious() will back us up to some valid
michael@0 646 // break position before the current position (we back our internal
michael@0 647 // iterator up one step to prevent handlePrevious() from returning
michael@0 648 // the current position), but not necessarily the last one before
michael@0 649
michael@0 650 // where we started
michael@0 651
michael@0 652 int32_t start = current();
michael@0 653
michael@0 654 (void)UTEXT_PREVIOUS32(fText);
michael@0 655 int32_t lastResult = handlePrevious(fData->fReverseTable);
michael@0 656 if (lastResult == UBRK_DONE) {
michael@0 657 lastResult = 0;
michael@0 658 utext_setNativeIndex(fText, 0);
michael@0 659 }
michael@0 660 result = lastResult;
michael@0 661 int32_t lastTag = 0;
michael@0 662 UBool breakTagValid = FALSE;
michael@0 663
michael@0 664 // iterate forward from the known break position until we pass our
michael@0 665 // starting point. The last break position before the starting
michael@0 666 // point is our return value
michael@0 667
michael@0 668 for (;;) {
michael@0 669 result = next();
michael@0 670 if (result == BreakIterator::DONE || result >= start) {
michael@0 671 break;
michael@0 672 }
michael@0 673 lastResult = result;
michael@0 674 lastTag = fLastRuleStatusIndex;
michael@0 675 breakTagValid = TRUE;
michael@0 676 }
michael@0 677
michael@0 678 // fLastBreakTag wants to have the value for section of text preceding
michael@0 679 // the result position that we are to return (in lastResult.) If
michael@0 680 // the backwards rules overshot and the above loop had to do two or more
michael@0 681 // next()s to move up to the desired return position, we will have a valid
michael@0 682 // tag value. But, if handlePrevious() took us to exactly the correct result positon,
michael@0 683 // we wont have a tag value for that position, which is only set by handleNext().
michael@0 684
michael@0 685 // set the current iteration position to be the last break position
michael@0 686 // before where we started, and then return that value
michael@0 687 utext_setNativeIndex(fText, lastResult);
michael@0 688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
michael@0 689 fLastStatusIndexValid = breakTagValid;
michael@0 690
michael@0 691 // No need to check the dictionary; it will have been handled by
michael@0 692 // next()
michael@0 693
michael@0 694 return lastResult;
michael@0 695 }
michael@0 696
michael@0 697 /**
michael@0 698 * Sets the iterator to refer to the first boundary position following
michael@0 699 * the specified position.
michael@0 700 * @offset The position from which to begin searching for a break position.
michael@0 701 * @return The position of the first break after the current position.
michael@0 702 */
michael@0 703 int32_t RuleBasedBreakIterator::following(int32_t offset) {
michael@0 704 // if we have cached break positions and offset is in the range
michael@0 705 // covered by them, use them
michael@0 706 // TODO: could use binary search
michael@0 707 // TODO: what if offset is outside range, but break is not?
michael@0 708 if (fCachedBreakPositions != NULL) {
michael@0 709 if (offset >= fCachedBreakPositions[0]
michael@0 710 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
michael@0 711 fPositionInCache = 0;
michael@0 712 // We are guaranteed not to leave the array due to range test above
michael@0 713 while (offset >= fCachedBreakPositions[fPositionInCache]) {
michael@0 714 ++fPositionInCache;
michael@0 715 }
michael@0 716 int32_t pos = fCachedBreakPositions[fPositionInCache];
michael@0 717 utext_setNativeIndex(fText, pos);
michael@0 718 return pos;
michael@0 719 }
michael@0 720 else {
michael@0 721 reset();
michael@0 722 }
michael@0 723 }
michael@0 724
michael@0 725 // if the offset passed in is already past the end of the text,
michael@0 726 // just return DONE; if it's before the beginning, return the
michael@0 727 // text's starting offset
michael@0 728 fLastRuleStatusIndex = 0;
michael@0 729 fLastStatusIndexValid = TRUE;
michael@0 730 if (fText == NULL || offset >= utext_nativeLength(fText)) {
michael@0 731 last();
michael@0 732 return next();
michael@0 733 }
michael@0 734 else if (offset < 0) {
michael@0 735 return first();
michael@0 736 }
michael@0 737
michael@0 738 // otherwise, set our internal iteration position (temporarily)
michael@0 739 // to the position passed in. If this is the _beginning_ position,
michael@0 740 // then we can just use next() to get our return value
michael@0 741
michael@0 742 int32_t result = 0;
michael@0 743
michael@0 744 if (fData->fSafeRevTable != NULL) {
michael@0 745 // new rule syntax
michael@0 746 utext_setNativeIndex(fText, offset);
michael@0 747 // move forward one codepoint to prepare for moving back to a
michael@0 748 // safe point.
michael@0 749 // this handles offset being between a supplementary character
michael@0 750 (void)UTEXT_NEXT32(fText);
michael@0 751 // handlePrevious will move most of the time to < 1 boundary away
michael@0 752 handlePrevious(fData->fSafeRevTable);
michael@0 753 int32_t result = next();
michael@0 754 while (result <= offset) {
michael@0 755 result = next();
michael@0 756 }
michael@0 757 return result;
michael@0 758 }
michael@0 759 if (fData->fSafeFwdTable != NULL) {
michael@0 760 // backup plan if forward safe table is not available
michael@0 761 utext_setNativeIndex(fText, offset);
michael@0 762 (void)UTEXT_PREVIOUS32(fText);
michael@0 763 // handle next will give result >= offset
michael@0 764 handleNext(fData->fSafeFwdTable);
michael@0 765 // previous will give result 0 or 1 boundary away from offset,
michael@0 766 // most of the time
michael@0 767 // we have to
michael@0 768 int32_t oldresult = previous();
michael@0 769 while (oldresult > offset) {
michael@0 770 int32_t result = previous();
michael@0 771 if (result <= offset) {
michael@0 772 return oldresult;
michael@0 773 }
michael@0 774 oldresult = result;
michael@0 775 }
michael@0 776 int32_t result = next();
michael@0 777 if (result <= offset) {
michael@0 778 return next();
michael@0 779 }
michael@0 780 return result;
michael@0 781 }
michael@0 782 // otherwise, we have to sync up first. Use handlePrevious() to back
michael@0 783 // up to a known break position before the specified position (if
michael@0 784 // we can determine that the specified position is a break position,
michael@0 785 // we don't back up at all). This may or may not be the last break
michael@0 786 // position at or before our starting position. Advance forward
michael@0 787 // from here until we've passed the starting position. The position
michael@0 788 // we stop on will be the first break position after the specified one.
michael@0 789 // old rule syntax
michael@0 790
michael@0 791 utext_setNativeIndex(fText, offset);
michael@0 792 if (offset==0 ||
michael@0 793 (offset==1 && utext_getNativeIndex(fText)==0)) {
michael@0 794 return next();
michael@0 795 }
michael@0 796 result = previous();
michael@0 797
michael@0 798 while (result != BreakIterator::DONE && result <= offset) {
michael@0 799 result = next();
michael@0 800 }
michael@0 801
michael@0 802 return result;
michael@0 803 }
michael@0 804
michael@0 805 /**
michael@0 806 * Sets the iterator to refer to the last boundary position before the
michael@0 807 * specified position.
michael@0 808 * @offset The position to begin searching for a break from.
michael@0 809 * @return The position of the last boundary before the starting position.
michael@0 810 */
michael@0 811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
michael@0 812 // if we have cached break positions and offset is in the range
michael@0 813 // covered by them, use them
michael@0 814 if (fCachedBreakPositions != NULL) {
michael@0 815 // TODO: binary search?
michael@0 816 // TODO: What if offset is outside range, but break is not?
michael@0 817 if (offset > fCachedBreakPositions[0]
michael@0 818 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
michael@0 819 fPositionInCache = 0;
michael@0 820 while (fPositionInCache < fNumCachedBreakPositions
michael@0 821 && offset > fCachedBreakPositions[fPositionInCache])
michael@0 822 ++fPositionInCache;
michael@0 823 --fPositionInCache;
michael@0 824 // If we're at the beginning of the cache, need to reevaluate the
michael@0 825 // rule status
michael@0 826 if (fPositionInCache <= 0) {
michael@0 827 fLastStatusIndexValid = FALSE;
michael@0 828 }
michael@0 829 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
michael@0 830 return fCachedBreakPositions[fPositionInCache];
michael@0 831 }
michael@0 832 else {
michael@0 833 reset();
michael@0 834 }
michael@0 835 }
michael@0 836
michael@0 837 // if the offset passed in is already past the end of the text,
michael@0 838 // just return DONE; if it's before the beginning, return the
michael@0 839 // text's starting offset
michael@0 840 if (fText == NULL || offset > utext_nativeLength(fText)) {
michael@0 841 // return BreakIterator::DONE;
michael@0 842 return last();
michael@0 843 }
michael@0 844 else if (offset < 0) {
michael@0 845 return first();
michael@0 846 }
michael@0 847
michael@0 848 // if we start by updating the current iteration position to the
michael@0 849 // position specified by the caller, we can just use previous()
michael@0 850 // to carry out this operation
michael@0 851
michael@0 852 if (fData->fSafeFwdTable != NULL) {
michael@0 853 // new rule syntax
michael@0 854 utext_setNativeIndex(fText, offset);
michael@0 855 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 856 if (newOffset != offset) {
michael@0 857 // Will come here if specified offset was not a code point boundary AND
michael@0 858 // the underlying implmentation is using UText, which snaps any non-code-point-boundary
michael@0 859 // indices to the containing code point.
michael@0 860 // For breakitereator::preceding only, these non-code-point indices need to be moved
michael@0 861 // up to refer to the following codepoint.
michael@0 862 (void)UTEXT_NEXT32(fText);
michael@0 863 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 864 }
michael@0 865
michael@0 866 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
michael@0 867 // rather than adjusting the position unconditionally?
michael@0 868 // (Change would interact with safe rules.)
michael@0 869 // TODO: change RBBI behavior for off-boundary indices to match that of UText?
michael@0 870 // affects only preceding(), seems cleaner, but is slightly different.
michael@0 871 (void)UTEXT_PREVIOUS32(fText);
michael@0 872 handleNext(fData->fSafeFwdTable);
michael@0 873 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 874 while (result >= offset) {
michael@0 875 result = previous();
michael@0 876 }
michael@0 877 return result;
michael@0 878 }
michael@0 879 if (fData->fSafeRevTable != NULL) {
michael@0 880 // backup plan if forward safe table is not available
michael@0 881 // TODO: check whether this path can be discarded
michael@0 882 // It's probably OK to say that rules must supply both safe tables
michael@0 883 // if they use safe tables at all. We have certainly never described
michael@0 884 // to anyone how to work with just one safe table.
michael@0 885 utext_setNativeIndex(fText, offset);
michael@0 886 (void)UTEXT_NEXT32(fText);
michael@0 887
michael@0 888 // handle previous will give result <= offset
michael@0 889 handlePrevious(fData->fSafeRevTable);
michael@0 890
michael@0 891 // next will give result 0 or 1 boundary away from offset,
michael@0 892 // most of the time
michael@0 893 // we have to
michael@0 894 int32_t oldresult = next();
michael@0 895 while (oldresult < offset) {
michael@0 896 int32_t result = next();
michael@0 897 if (result >= offset) {
michael@0 898 return oldresult;
michael@0 899 }
michael@0 900 oldresult = result;
michael@0 901 }
michael@0 902 int32_t result = previous();
michael@0 903 if (result >= offset) {
michael@0 904 return previous();
michael@0 905 }
michael@0 906 return result;
michael@0 907 }
michael@0 908
michael@0 909 // old rule syntax
michael@0 910 utext_setNativeIndex(fText, offset);
michael@0 911 return previous();
michael@0 912 }
michael@0 913
michael@0 914 /**
michael@0 915 * Returns true if the specfied position is a boundary position. As a side
michael@0 916 * effect, leaves the iterator pointing to the first boundary position at
michael@0 917 * or after "offset".
michael@0 918 * @param offset the offset to check.
michael@0 919 * @return True if "offset" is a boundary position.
michael@0 920 */
michael@0 921 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
michael@0 922 // the beginning index of the iterator is always a boundary position by definition
michael@0 923 if (offset == 0) {
michael@0 924 first(); // For side effects on current position, tag values.
michael@0 925 return TRUE;
michael@0 926 }
michael@0 927
michael@0 928 if (offset == (int32_t)utext_nativeLength(fText)) {
michael@0 929 last(); // For side effects on current position, tag values.
michael@0 930 return TRUE;
michael@0 931 }
michael@0 932
michael@0 933 // out-of-range indexes are never boundary positions
michael@0 934 if (offset < 0) {
michael@0 935 first(); // For side effects on current position, tag values.
michael@0 936 return FALSE;
michael@0 937 }
michael@0 938
michael@0 939 if (offset > utext_nativeLength(fText)) {
michael@0 940 last(); // For side effects on current position, tag values.
michael@0 941 return FALSE;
michael@0 942 }
michael@0 943
michael@0 944 // otherwise, we can use following() on the position before the specified
michael@0 945 // one and return true if the position we get back is the one the user
michael@0 946 // specified
michael@0 947 utext_previous32From(fText, offset);
michael@0 948 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 949 UBool result = following(backOne) == offset;
michael@0 950 return result;
michael@0 951 }
michael@0 952
michael@0 953 /**
michael@0 954 * Returns the current iteration position.
michael@0 955 * @return The current iteration position.
michael@0 956 */
michael@0 957 int32_t RuleBasedBreakIterator::current(void) const {
michael@0 958 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 959 return pos;
michael@0 960 }
michael@0 961
michael@0 962 //=======================================================================
michael@0 963 // implementation
michael@0 964 //=======================================================================
michael@0 965
//
//  RBBIRunMode - tracks which phase the state machine is in. The machine
//                runs one extra iteration before the first character of the
//                user text and one after the last; real input characters are
//                fetched only while in RBBI_RUN.
//
enum RBBIRunMode {
    // state machine processing is before first char of input
    RBBI_START,
    // state machine processing is in the user text
    RBBI_RUN,
    // state machine processing is after end of user text.
    RBBI_END
};
michael@0 976
michael@0 977
michael@0 978 //-----------------------------------------------------------------------------------
michael@0 979 //
michael@0 980 // handleNext(stateTable)
michael@0 981 // This method is the actual implementation of the rbbi next() method.
michael@0 982 // This method initializes the state machine to state 1
michael@0 983 // and advances through the text character by character until we reach the end
michael@0 984 // of the text or the state machine transitions to state 0. We update our return
michael@0 985 // value every time the state machine passes through an accepting state.
michael@0 986 //
michael@0 987 //-----------------------------------------------------------------------------------
michael@0 988 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
michael@0 989 int32_t state;
michael@0 990 uint16_t category = 0;
michael@0 991 RBBIRunMode mode;
michael@0 992
michael@0 993 RBBIStateTableRow *row;
michael@0 994 UChar32 c;
michael@0 995 int32_t lookaheadStatus = 0;
michael@0 996 int32_t lookaheadTagIdx = 0;
michael@0 997 int32_t result = 0;
michael@0 998 int32_t initialPosition = 0;
michael@0 999 int32_t lookaheadResult = 0;
michael@0 1000 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
michael@0 1001 const char *tableData = statetable->fTableData;
michael@0 1002 uint32_t tableRowLen = statetable->fRowLen;
michael@0 1003
michael@0 1004 #ifdef RBBI_DEBUG
michael@0 1005 if (fTrace) {
michael@0 1006 RBBIDebugPuts("Handle Next pos char state category");
michael@0 1007 }
michael@0 1008 #endif
michael@0 1009
michael@0 1010 // No matter what, handleNext alway correctly sets the break tag value.
michael@0 1011 fLastStatusIndexValid = TRUE;
michael@0 1012 fLastRuleStatusIndex = 0;
michael@0 1013
michael@0 1014 // if we're already at the end of the text, return DONE.
michael@0 1015 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1016 result = initialPosition;
michael@0 1017 c = UTEXT_NEXT32(fText);
michael@0 1018 if (fData == NULL || c==U_SENTINEL) {
michael@0 1019 return BreakIterator::DONE;
michael@0 1020 }
michael@0 1021
michael@0 1022 // Set the initial state for the state machine
michael@0 1023 state = START_STATE;
michael@0 1024 row = (RBBIStateTableRow *)
michael@0 1025 //(statetable->fTableData + (statetable->fRowLen * state));
michael@0 1026 (tableData + tableRowLen * state);
michael@0 1027
michael@0 1028
michael@0 1029 mode = RBBI_RUN;
michael@0 1030 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
michael@0 1031 category = 2;
michael@0 1032 mode = RBBI_START;
michael@0 1033 }
michael@0 1034
michael@0 1035
michael@0 1036 // loop until we reach the end of the text or transition to state 0
michael@0 1037 //
michael@0 1038 for (;;) {
michael@0 1039 if (c == U_SENTINEL) {
michael@0 1040 // Reached end of input string.
michael@0 1041 if (mode == RBBI_END) {
michael@0 1042 // We have already run the loop one last time with the
michael@0 1043 // character set to the psueudo {eof} value. Now it is time
michael@0 1044 // to unconditionally bail out.
michael@0 1045 if (lookaheadResult > result) {
michael@0 1046 // We ran off the end of the string with a pending look-ahead match.
michael@0 1047 // Treat this as if the look-ahead condition had been met, and return
michael@0 1048 // the match at the / position from the look-ahead rule.
michael@0 1049 result = lookaheadResult;
michael@0 1050 fLastRuleStatusIndex = lookaheadTagIdx;
michael@0 1051 lookaheadStatus = 0;
michael@0 1052 }
michael@0 1053 break;
michael@0 1054 }
michael@0 1055 // Run the loop one last time with the fake end-of-input character category.
michael@0 1056 mode = RBBI_END;
michael@0 1057 category = 1;
michael@0 1058 }
michael@0 1059
michael@0 1060 //
michael@0 1061 // Get the char category. An incoming category of 1 or 2 means that
michael@0 1062 // we are preset for doing the beginning or end of input, and
michael@0 1063 // that we shouldn't get a category from an actual text input character.
michael@0 1064 //
michael@0 1065 if (mode == RBBI_RUN) {
michael@0 1066 // look up the current character's character category, which tells us
michael@0 1067 // which column in the state table to look at.
michael@0 1068 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
michael@0 1069 // not the size of the character going in, which is a UChar32.
michael@0 1070 //
michael@0 1071 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1072
michael@0 1073 // Check the dictionary bit in the character's category.
michael@0 1074 // Counter is only used by dictionary based iterators (subclasses).
michael@0 1075 // Chars that need to be handled by a dictionary have a flag bit set
michael@0 1076 // in their category values.
michael@0 1077 //
michael@0 1078 if ((category & 0x4000) != 0) {
michael@0 1079 fDictionaryCharCount++;
michael@0 1080 // And off the dictionary flag bit.
michael@0 1081 category &= ~0x4000;
michael@0 1082 }
michael@0 1083 }
michael@0 1084
michael@0 1085 #ifdef RBBI_DEBUG
michael@0 1086 if (fTrace) {
michael@0 1087 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText));
michael@0 1088 if (0x20<=c && c<0x7f) {
michael@0 1089 RBBIDebugPrintf("\"%c\" ", c);
michael@0 1090 } else {
michael@0 1091 RBBIDebugPrintf("%5x ", c);
michael@0 1092 }
michael@0 1093 RBBIDebugPrintf("%3d %3d\n", state, category);
michael@0 1094 }
michael@0 1095 #endif
michael@0 1096
michael@0 1097 // State Transition - move machine to its next state
michael@0 1098 //
michael@0 1099
michael@0 1100 // Note: fNextState is defined as uint16_t[2], but we are casting
michael@0 1101 // a generated RBBI table to RBBIStateTableRow and some tables
michael@0 1102 // actually have more than 2 categories.
michael@0 1103 U_ASSERT(category<fData->fHeader->fCatCount);
michael@0 1104 state = row->fNextState[category]; /*Not accessing beyond memory*/
michael@0 1105 row = (RBBIStateTableRow *)
michael@0 1106 // (statetable->fTableData + (statetable->fRowLen * state));
michael@0 1107 (tableData + tableRowLen * state);
michael@0 1108
michael@0 1109
michael@0 1110 if (row->fAccepting == -1) {
michael@0 1111 // Match found, common case.
michael@0 1112 if (mode != RBBI_START) {
michael@0 1113 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1114 }
michael@0 1115 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
michael@0 1116 }
michael@0 1117
michael@0 1118 if (row->fLookAhead != 0) {
michael@0 1119 if (lookaheadStatus != 0
michael@0 1120 && row->fAccepting == lookaheadStatus) {
michael@0 1121 // Lookahead match is completed.
michael@0 1122 result = lookaheadResult;
michael@0 1123 fLastRuleStatusIndex = lookaheadTagIdx;
michael@0 1124 lookaheadStatus = 0;
michael@0 1125 // TODO: make a standalone hard break in a rule work.
michael@0 1126 if (lookAheadHardBreak) {
michael@0 1127 UTEXT_SETNATIVEINDEX(fText, result);
michael@0 1128 return result;
michael@0 1129 }
michael@0 1130 // Look-ahead completed, but other rules may match further. Continue on
michael@0 1131 // TODO: junk this feature? I don't think it's used anywhwere.
michael@0 1132 goto continueOn;
michael@0 1133 }
michael@0 1134
michael@0 1135 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1136 lookaheadResult = r;
michael@0 1137 lookaheadStatus = row->fLookAhead;
michael@0 1138 lookaheadTagIdx = row->fTagIdx;
michael@0 1139 goto continueOn;
michael@0 1140 }
michael@0 1141
michael@0 1142
michael@0 1143 if (row->fAccepting != 0) {
michael@0 1144 // Because this is an accepting state, any in-progress look-ahead match
michael@0 1145 // is no longer relavant. Clear out the pending lookahead status.
michael@0 1146 lookaheadStatus = 0; // clear out any pending look-ahead match.
michael@0 1147 }
michael@0 1148
michael@0 1149 continueOn:
michael@0 1150 if (state == STOP_STATE) {
michael@0 1151 // This is the normal exit from the lookup state machine.
michael@0 1152 // We have advanced through the string until it is certain that no
michael@0 1153 // longer match is possible, no matter what characters follow.
michael@0 1154 break;
michael@0 1155 }
michael@0 1156
michael@0 1157 // Advance to the next character.
michael@0 1158 // If this is a beginning-of-input loop iteration, don't advance
michael@0 1159 // the input position. The next iteration will be processing the
michael@0 1160 // first real input character.
michael@0 1161 if (mode == RBBI_RUN) {
michael@0 1162 c = UTEXT_NEXT32(fText);
michael@0 1163 } else {
michael@0 1164 if (mode == RBBI_START) {
michael@0 1165 mode = RBBI_RUN;
michael@0 1166 }
michael@0 1167 }
michael@0 1168
michael@0 1169
michael@0 1170 }
michael@0 1171
michael@0 1172 // The state machine is done. Check whether it found a match...
michael@0 1173
michael@0 1174 // If the iterator failed to advance in the match engine, force it ahead by one.
michael@0 1175 // (This really indicates a defect in the break rules. They should always match
michael@0 1176 // at least one character.)
michael@0 1177 if (result == initialPosition) {
michael@0 1178 UTEXT_SETNATIVEINDEX(fText, initialPosition);
michael@0 1179 UTEXT_NEXT32(fText);
michael@0 1180 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1181 }
michael@0 1182
michael@0 1183 // Leave the iterator at our result position.
michael@0 1184 UTEXT_SETNATIVEINDEX(fText, result);
michael@0 1185 #ifdef RBBI_DEBUG
michael@0 1186 if (fTrace) {
michael@0 1187 RBBIDebugPrintf("result = %d\n\n", result);
michael@0 1188 }
michael@0 1189 #endif
michael@0 1190 return result;
michael@0 1191 }
michael@0 1192
michael@0 1193
michael@0 1194
michael@0 1195 //-----------------------------------------------------------------------------------
michael@0 1196 //
michael@0 1197 // handlePrevious()
michael@0 1198 //
michael@0 1199 // Iterate backwards, according to the logic of the reverse rules.
michael@0 1200 // This version handles the exact style backwards rules.
michael@0 1201 //
michael@0 1202 // The logic of this function is very similar to handleNext(), above.
michael@0 1203 //
michael@0 1204 //-----------------------------------------------------------------------------------
michael@0 1205 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
michael@0 1206 int32_t state;
michael@0 1207 uint16_t category = 0;
michael@0 1208 RBBIRunMode mode;
michael@0 1209 RBBIStateTableRow *row;
michael@0 1210 UChar32 c;
michael@0 1211 int32_t lookaheadStatus = 0;
michael@0 1212 int32_t result = 0;
michael@0 1213 int32_t initialPosition = 0;
michael@0 1214 int32_t lookaheadResult = 0;
michael@0 1215 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
michael@0 1216
michael@0 1217 #ifdef RBBI_DEBUG
michael@0 1218 if (fTrace) {
michael@0 1219 RBBIDebugPuts("Handle Previous pos char state category");
michael@0 1220 }
michael@0 1221 #endif
michael@0 1222
michael@0 1223 // handlePrevious() never gets the rule status.
michael@0 1224 // Flag the status as invalid; if the user ever asks for status, we will need
michael@0 1225 // to back up, then re-find the break position using handleNext(), which does
michael@0 1226 // get the status value.
michael@0 1227 fLastStatusIndexValid = FALSE;
michael@0 1228 fLastRuleStatusIndex = 0;
michael@0 1229
michael@0 1230 // if we're already at the start of the text, return DONE.
michael@0 1231 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
michael@0 1232 return BreakIterator::DONE;
michael@0 1233 }
michael@0 1234
michael@0 1235 // Set up the starting char.
michael@0 1236 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1237 result = initialPosition;
michael@0 1238 c = UTEXT_PREVIOUS32(fText);
michael@0 1239
michael@0 1240 // Set the initial state for the state machine
michael@0 1241 state = START_STATE;
michael@0 1242 row = (RBBIStateTableRow *)
michael@0 1243 (statetable->fTableData + (statetable->fRowLen * state));
michael@0 1244 category = 3;
michael@0 1245 mode = RBBI_RUN;
michael@0 1246 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
michael@0 1247 category = 2;
michael@0 1248 mode = RBBI_START;
michael@0 1249 }
michael@0 1250
michael@0 1251
michael@0 1252 // loop until we reach the start of the text or transition to state 0
michael@0 1253 //
michael@0 1254 for (;;) {
michael@0 1255 if (c == U_SENTINEL) {
michael@0 1256 // Reached end of input string.
michael@0 1257 if (mode == RBBI_END) {
michael@0 1258 // We have already run the loop one last time with the
michael@0 1259 // character set to the psueudo {eof} value. Now it is time
michael@0 1260 // to unconditionally bail out.
michael@0 1261 if (lookaheadResult < result) {
michael@0 1262 // We ran off the end of the string with a pending look-ahead match.
michael@0 1263 // Treat this as if the look-ahead condition had been met, and return
michael@0 1264 // the match at the / position from the look-ahead rule.
michael@0 1265 result = lookaheadResult;
michael@0 1266 lookaheadStatus = 0;
michael@0 1267 } else if (result == initialPosition) {
michael@0 1268 // Ran off start, no match found.
michael@0 1269 // move one index one (towards the start, since we are doing a previous())
michael@0 1270 UTEXT_SETNATIVEINDEX(fText, initialPosition);
michael@0 1271 (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
michael@0 1272 }
michael@0 1273 break;
michael@0 1274 }
michael@0 1275 // Run the loop one last time with the fake end-of-input character category.
michael@0 1276 mode = RBBI_END;
michael@0 1277 category = 1;
michael@0 1278 }
michael@0 1279
michael@0 1280 //
michael@0 1281 // Get the char category. An incoming category of 1 or 2 means that
michael@0 1282 // we are preset for doing the beginning or end of input, and
michael@0 1283 // that we shouldn't get a category from an actual text input character.
michael@0 1284 //
michael@0 1285 if (mode == RBBI_RUN) {
michael@0 1286 // look up the current character's character category, which tells us
michael@0 1287 // which column in the state table to look at.
michael@0 1288 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
michael@0 1289 // not the size of the character going in, which is a UChar32.
michael@0 1290 //
michael@0 1291 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1292
michael@0 1293 // Check the dictionary bit in the character's category.
michael@0 1294 // Counter is only used by dictionary based iterators (subclasses).
michael@0 1295 // Chars that need to be handled by a dictionary have a flag bit set
michael@0 1296 // in their category values.
michael@0 1297 //
michael@0 1298 if ((category & 0x4000) != 0) {
michael@0 1299 fDictionaryCharCount++;
michael@0 1300 // And off the dictionary flag bit.
michael@0 1301 category &= ~0x4000;
michael@0 1302 }
michael@0 1303 }
michael@0 1304
michael@0 1305 #ifdef RBBI_DEBUG
michael@0 1306 if (fTrace) {
michael@0 1307 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
michael@0 1308 if (0x20<=c && c<0x7f) {
michael@0 1309 RBBIDebugPrintf("\"%c\" ", c);
michael@0 1310 } else {
michael@0 1311 RBBIDebugPrintf("%5x ", c);
michael@0 1312 }
michael@0 1313 RBBIDebugPrintf("%3d %3d\n", state, category);
michael@0 1314 }
michael@0 1315 #endif
michael@0 1316
michael@0 1317 // State Transition - move machine to its next state
michael@0 1318 //
michael@0 1319
michael@0 1320 // Note: fNextState is defined as uint16_t[2], but we are casting
michael@0 1321 // a generated RBBI table to RBBIStateTableRow and some tables
michael@0 1322 // actually have more than 2 categories.
michael@0 1323 U_ASSERT(category<fData->fHeader->fCatCount);
michael@0 1324 state = row->fNextState[category]; /*Not accessing beyond memory*/
michael@0 1325 row = (RBBIStateTableRow *)
michael@0 1326 (statetable->fTableData + (statetable->fRowLen * state));
michael@0 1327
michael@0 1328 if (row->fAccepting == -1) {
michael@0 1329 // Match found, common case.
michael@0 1330 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1331 }
michael@0 1332
michael@0 1333 if (row->fLookAhead != 0) {
michael@0 1334 if (lookaheadStatus != 0
michael@0 1335 && row->fAccepting == lookaheadStatus) {
michael@0 1336 // Lookahead match is completed.
michael@0 1337 result = lookaheadResult;
michael@0 1338 lookaheadStatus = 0;
michael@0 1339 // TODO: make a standalone hard break in a rule work.
michael@0 1340 if (lookAheadHardBreak) {
michael@0 1341 UTEXT_SETNATIVEINDEX(fText, result);
michael@0 1342 return result;
michael@0 1343 }
michael@0 1344 // Look-ahead completed, but other rules may match further. Continue on
michael@0 1345 // TODO: junk this feature? I don't think it's used anywhwere.
michael@0 1346 goto continueOn;
michael@0 1347 }
michael@0 1348
michael@0 1349 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1350 lookaheadResult = r;
michael@0 1351 lookaheadStatus = row->fLookAhead;
michael@0 1352 goto continueOn;
michael@0 1353 }
michael@0 1354
michael@0 1355
michael@0 1356 if (row->fAccepting != 0) {
michael@0 1357 // Because this is an accepting state, any in-progress look-ahead match
michael@0 1358 // is no longer relavant. Clear out the pending lookahead status.
michael@0 1359 lookaheadStatus = 0;
michael@0 1360 }
michael@0 1361
michael@0 1362 continueOn:
michael@0 1363 if (state == STOP_STATE) {
michael@0 1364 // This is the normal exit from the lookup state machine.
michael@0 1365 // We have advanced through the string until it is certain that no
michael@0 1366 // longer match is possible, no matter what characters follow.
michael@0 1367 break;
michael@0 1368 }
michael@0 1369
michael@0 1370 // Move (backwards) to the next character to process.
michael@0 1371 // If this is a beginning-of-input loop iteration, don't advance
michael@0 1372 // the input position. The next iteration will be processing the
michael@0 1373 // first real input character.
michael@0 1374 if (mode == RBBI_RUN) {
michael@0 1375 c = UTEXT_PREVIOUS32(fText);
michael@0 1376 } else {
michael@0 1377 if (mode == RBBI_START) {
michael@0 1378 mode = RBBI_RUN;
michael@0 1379 }
michael@0 1380 }
michael@0 1381 }
michael@0 1382
michael@0 1383 // The state machine is done. Check whether it found a match...
michael@0 1384
michael@0 1385 // If the iterator failed to advance in the match engine, force it ahead by one.
michael@0 1386 // (This really indicates a defect in the break rules. They should always match
michael@0 1387 // at least one character.)
michael@0 1388 if (result == initialPosition) {
michael@0 1389 UTEXT_SETNATIVEINDEX(fText, initialPosition);
michael@0 1390 UTEXT_PREVIOUS32(fText);
michael@0 1391 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1392 }
michael@0 1393
michael@0 1394 // Leave the iterator at our result position.
michael@0 1395 UTEXT_SETNATIVEINDEX(fText, result);
michael@0 1396 #ifdef RBBI_DEBUG
michael@0 1397 if (fTrace) {
michael@0 1398 RBBIDebugPrintf("result = %d\n\n", result);
michael@0 1399 }
michael@0 1400 #endif
michael@0 1401 return result;
michael@0 1402 }
michael@0 1403
michael@0 1404
michael@0 1405 void
michael@0 1406 RuleBasedBreakIterator::reset()
michael@0 1407 {
michael@0 1408 if (fCachedBreakPositions) {
michael@0 1409 uprv_free(fCachedBreakPositions);
michael@0 1410 }
michael@0 1411 fCachedBreakPositions = NULL;
michael@0 1412 fNumCachedBreakPositions = 0;
michael@0 1413 fDictionaryCharCount = 0;
michael@0 1414 fPositionInCache = 0;
michael@0 1415 }
michael@0 1416
michael@0 1417
michael@0 1418
michael@0 1419 //-------------------------------------------------------------------------------
michael@0 1420 //
michael@0 1421 // getRuleStatus() Return the break rule tag associated with the current
michael@0 1422 // iterator position. If the iterator arrived at its current
michael@0 1423 // position by iterating forwards, the value will have been
michael@0 1424 // cached by the handleNext() function.
michael@0 1425 //
michael@0 1426 // If no cached status value is available, the status is
michael@0 1427 // found by doing a previous() followed by a next(), which
michael@0 1428 // leaves the iterator where it started, and computes the
michael@0 1429 // status while doing the next().
michael@0 1430 //
michael@0 1431 //-------------------------------------------------------------------------------
michael@0 1432 void RuleBasedBreakIterator::makeRuleStatusValid() {
michael@0 1433 if (fLastStatusIndexValid == FALSE) {
michael@0 1434 // No cached status is available.
michael@0 1435 if (fText == NULL || current() == 0) {
michael@0 1436 // At start of text, or there is no text. Status is always zero.
michael@0 1437 fLastRuleStatusIndex = 0;
michael@0 1438 fLastStatusIndexValid = TRUE;
michael@0 1439 } else {
michael@0 1440 // Not at start of text. Find status the tedious way.
michael@0 1441 int32_t pa = current();
michael@0 1442 previous();
michael@0 1443 if (fNumCachedBreakPositions > 0) {
michael@0 1444 reset(); // Blow off the dictionary cache
michael@0 1445 }
michael@0 1446 int32_t pb = next();
michael@0 1447 if (pa != pb) {
michael@0 1448 // note: the if (pa != pb) test is here only to eliminate warnings for
michael@0 1449 // unused local variables on gcc. Logically, it isn't needed.
michael@0 1450 U_ASSERT(pa == pb);
michael@0 1451 }
michael@0 1452 }
michael@0 1453 }
michael@0 1454 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
michael@0 1455 }
michael@0 1456
michael@0 1457
michael@0 1458 int32_t RuleBasedBreakIterator::getRuleStatus() const {
michael@0 1459 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
michael@0 1460 nonConstThis->makeRuleStatusValid();
michael@0 1461
michael@0 1462 // fLastRuleStatusIndex indexes to the start of the appropriate status record
michael@0 1463 // (the number of status values.)
michael@0 1464 // This function returns the last (largest) of the array of status values.
michael@0 1465 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
michael@0 1466 int32_t tagVal = fData->fRuleStatusTable[idx];
michael@0 1467
michael@0 1468 return tagVal;
michael@0 1469 }
michael@0 1470
michael@0 1471
michael@0 1472
michael@0 1473
michael@0 1474 int32_t RuleBasedBreakIterator::getRuleStatusVec(
michael@0 1475 int32_t *fillInVec, int32_t capacity, UErrorCode &status)
michael@0 1476 {
michael@0 1477 if (U_FAILURE(status)) {
michael@0 1478 return 0;
michael@0 1479 }
michael@0 1480
michael@0 1481 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
michael@0 1482 nonConstThis->makeRuleStatusValid();
michael@0 1483 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
michael@0 1484 int32_t numValsToCopy = numVals;
michael@0 1485 if (numVals > capacity) {
michael@0 1486 status = U_BUFFER_OVERFLOW_ERROR;
michael@0 1487 numValsToCopy = capacity;
michael@0 1488 }
michael@0 1489 int i;
michael@0 1490 for (i=0; i<numValsToCopy; i++) {
michael@0 1491 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
michael@0 1492 }
michael@0 1493 return numVals;
michael@0 1494 }
michael@0 1495
michael@0 1496
michael@0 1497
michael@0 1498 //-------------------------------------------------------------------------------
michael@0 1499 //
michael@0 1500 // getBinaryRules Access to the compiled form of the rules,
michael@0 1501 // for use by build system tools that save the data
michael@0 1502 // for standard iterator types.
michael@0 1503 //
michael@0 1504 //-------------------------------------------------------------------------------
michael@0 1505 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
michael@0 1506 const uint8_t *retPtr = NULL;
michael@0 1507 length = 0;
michael@0 1508
michael@0 1509 if (fData != NULL) {
michael@0 1510 retPtr = (const uint8_t *)fData->fHeader;
michael@0 1511 length = fData->fHeader->fLength;
michael@0 1512 }
michael@0 1513 return retPtr;
michael@0 1514 }
michael@0 1515
michael@0 1516
michael@0 1517 BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
michael@0 1518 int32_t &bufferSize,
michael@0 1519 UErrorCode &status)
michael@0 1520 {
michael@0 1521 if (U_FAILURE(status)){
michael@0 1522 return NULL;
michael@0 1523 }
michael@0 1524
michael@0 1525 if (bufferSize == 0) {
michael@0 1526 bufferSize = 1; // preflighting for deprecated functionality
michael@0 1527 return NULL;
michael@0 1528 }
michael@0 1529
michael@0 1530 BreakIterator *clonedBI = clone();
michael@0 1531 if (clonedBI == NULL) {
michael@0 1532 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1533 } else {
michael@0 1534 status = U_SAFECLONE_ALLOCATED_WARNING;
michael@0 1535 }
michael@0 1536 return (RuleBasedBreakIterator *)clonedBI;
michael@0 1537 }
michael@0 1538
michael@0 1539
michael@0 1540 //-------------------------------------------------------------------------------
michael@0 1541 //
michael@0 1542 // isDictionaryChar Return true if the category lookup for this char
michael@0 1543 // indicates that it is in the set of dictionary lookup
michael@0 1544 // chars.
michael@0 1545 //
michael@0 1546 // This function is intended for use by dictionary based
michael@0 1547 // break iterators.
michael@0 1548 //
michael@0 1549 //-------------------------------------------------------------------------------
michael@0 1550 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
michael@0 1551 if (fData == NULL) {
michael@0 1552 return FALSE;
michael@0 1553 }
michael@0 1554 uint16_t category;
michael@0 1555 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1556 return (category & 0x4000) != 0;
michael@0 1557 }*/
michael@0 1558
michael@0 1559
michael@0 1560 //-------------------------------------------------------------------------------
michael@0 1561 //
michael@0 1562 // checkDictionary This function handles all processing of characters in
michael@0 1563 // the "dictionary" set. It will determine the appropriate
michael@0 1564 // course of action, and possibly set up a cache in the
michael@0 1565 // process.
michael@0 1566 //
michael@0 1567 //-------------------------------------------------------------------------------
michael@0 1568 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
michael@0 1569 int32_t endPos,
michael@0 1570 UBool reverse) {
michael@0 1571 // Reset the old break cache first.
michael@0 1572 reset();
michael@0 1573
michael@0 1574 // note: code segment below assumes that dictionary chars are in the
michael@0 1575 // startPos-endPos range
michael@0 1576 // value returned should be next character in sequence
michael@0 1577 if ((endPos - startPos) <= 1) {
michael@0 1578 return (reverse ? startPos : endPos);
michael@0 1579 }
michael@0 1580
michael@0 1581 // Bug 5532. The dictionary code will crash if the input text is UTF-8
michael@0 1582 // because native indexes are different from UTF-16 indexes.
michael@0 1583 // Temporary hack: skip dictionary lookup for UTF-8 encoded text.
michael@0 1584 // It wont give the right breaks, but it's better than a crash.
michael@0 1585 //
michael@0 1586 // Check the type of the UText by checking its pFuncs field, which
michael@0 1587 // is UText's function dispatch table. It will be the same for all
michael@0 1588 // UTF-8 UTexts and different for any other UText type.
michael@0 1589 //
michael@0 1590 // We have no other type of UText available with non-UTF-16 native indexing.
michael@0 1591 // This whole check will go away once the dictionary code is fixed.
michael@0 1592 static const void *utext_utf8Funcs;
michael@0 1593 if (utext_utf8Funcs == NULL) {
michael@0 1594 // Cache the UTF-8 UText function pointer value.
michael@0 1595 UErrorCode status = U_ZERO_ERROR;
michael@0 1596 UText tempUText = UTEXT_INITIALIZER;
michael@0 1597 utext_openUTF8(&tempUText, NULL, 0, &status);
michael@0 1598 utext_utf8Funcs = tempUText.pFuncs;
michael@0 1599 utext_close(&tempUText);
michael@0 1600 }
michael@0 1601 if (fText->pFuncs == utext_utf8Funcs) {
michael@0 1602 return (reverse ? startPos : endPos);
michael@0 1603 }
michael@0 1604
michael@0 1605 // Starting from the starting point, scan towards the proposed result,
michael@0 1606 // looking for the first dictionary character (which may be the one
michael@0 1607 // we're on, if we're starting in the middle of a range).
michael@0 1608 utext_setNativeIndex(fText, reverse ? endPos : startPos);
michael@0 1609 if (reverse) {
michael@0 1610 UTEXT_PREVIOUS32(fText);
michael@0 1611 }
michael@0 1612
michael@0 1613 int32_t rangeStart = startPos;
michael@0 1614 int32_t rangeEnd = endPos;
michael@0 1615
michael@0 1616 uint16_t category;
michael@0 1617 int32_t current;
michael@0 1618 UErrorCode status = U_ZERO_ERROR;
michael@0 1619 UStack breaks(status);
michael@0 1620 int32_t foundBreakCount = 0;
michael@0 1621 UChar32 c = utext_current32(fText);
michael@0 1622
michael@0 1623 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1624
michael@0 1625 // Is the character we're starting on a dictionary character? If so, we
michael@0 1626 // need to back up to include the entire run; otherwise the results of
michael@0 1627 // the break algorithm will differ depending on where we start. Since
michael@0 1628 // the result is cached and there is typically a non-dictionary break
michael@0 1629 // within a small number of words, there should be little performance impact.
michael@0 1630 if (category & 0x4000) {
michael@0 1631 if (reverse) {
michael@0 1632 do {
michael@0 1633 utext_next32(fText); // TODO: recast to work directly with postincrement.
michael@0 1634 c = utext_current32(fText);
michael@0 1635 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1636 } while (c != U_SENTINEL && (category & 0x4000));
michael@0 1637 // Back up to the last dictionary character
michael@0 1638 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
michael@0 1639 if (c == U_SENTINEL) {
michael@0 1640 // c = fText->last32();
michael@0 1641 // TODO: why was this if needed?
michael@0 1642 c = UTEXT_PREVIOUS32(fText);
michael@0 1643 }
michael@0 1644 else {
michael@0 1645 c = UTEXT_PREVIOUS32(fText);
michael@0 1646 }
michael@0 1647 }
michael@0 1648 else {
michael@0 1649 do {
michael@0 1650 c = UTEXT_PREVIOUS32(fText);
michael@0 1651 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1652 }
michael@0 1653 while (c != U_SENTINEL && (category & 0x4000));
michael@0 1654 // Back up to the last dictionary character
michael@0 1655 if (c == U_SENTINEL) {
michael@0 1656 // c = fText->first32();
michael@0 1657 c = utext_current32(fText);
michael@0 1658 }
michael@0 1659 else {
michael@0 1660 utext_next32(fText);
michael@0 1661 c = utext_current32(fText);
michael@0 1662 }
michael@0 1663 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
michael@0 1664 }
michael@0 1665 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1666 }
michael@0 1667
michael@0 1668 // Loop through the text, looking for ranges of dictionary characters.
michael@0 1669 // For each span, find the appropriate break engine, and ask it to find
michael@0 1670 // any breaks within the span.
michael@0 1671 // Note: we always do this in the forward direction, so that the break
michael@0 1672 // cache is built in the right order.
michael@0 1673 if (reverse) {
michael@0 1674 utext_setNativeIndex(fText, rangeStart);
michael@0 1675 c = utext_current32(fText);
michael@0 1676 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1677 }
michael@0 1678 while(U_SUCCESS(status)) {
michael@0 1679 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
michael@0 1680 utext_next32(fText); // TODO: tweak for post-increment operation
michael@0 1681 c = utext_current32(fText);
michael@0 1682 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1683 }
michael@0 1684 if (current >= rangeEnd) {
michael@0 1685 break;
michael@0 1686 }
michael@0 1687
michael@0 1688 // We now have a dictionary character. Get the appropriate language object
michael@0 1689 // to deal with it.
michael@0 1690 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
michael@0 1691
michael@0 1692 // Ask the language object if there are any breaks. It will leave the text
michael@0 1693 // pointer on the other side of its range, ready to search for the next one.
michael@0 1694 if (lbe != NULL) {
michael@0 1695 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
michael@0 1696 }
michael@0 1697
michael@0 1698 // Reload the loop variables for the next go-round
michael@0 1699 c = utext_current32(fText);
michael@0 1700 UTRIE_GET16(&fData->fTrie, c, category);
michael@0 1701 }
michael@0 1702
michael@0 1703 // If we found breaks, build a new break cache. The first and last entries must
michael@0 1704 // be the original starting and ending position.
michael@0 1705 if (foundBreakCount > 0) {
michael@0 1706 int32_t totalBreaks = foundBreakCount;
michael@0 1707 if (startPos < breaks.elementAti(0)) {
michael@0 1708 totalBreaks += 1;
michael@0 1709 }
michael@0 1710 if (endPos > breaks.peeki()) {
michael@0 1711 totalBreaks += 1;
michael@0 1712 }
michael@0 1713 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
michael@0 1714 if (fCachedBreakPositions != NULL) {
michael@0 1715 int32_t out = 0;
michael@0 1716 fNumCachedBreakPositions = totalBreaks;
michael@0 1717 if (startPos < breaks.elementAti(0)) {
michael@0 1718 fCachedBreakPositions[out++] = startPos;
michael@0 1719 }
michael@0 1720 for (int32_t i = 0; i < foundBreakCount; ++i) {
michael@0 1721 fCachedBreakPositions[out++] = breaks.elementAti(i);
michael@0 1722 }
michael@0 1723 if (endPos > fCachedBreakPositions[out-1]) {
michael@0 1724 fCachedBreakPositions[out] = endPos;
michael@0 1725 }
michael@0 1726 // If there are breaks, then by definition, we are replacing the original
michael@0 1727 // proposed break by one of the breaks we found. Use following() and
michael@0 1728 // preceding() to do the work. They should never recurse in this case.
michael@0 1729 if (reverse) {
michael@0 1730 return preceding(endPos);
michael@0 1731 }
michael@0 1732 else {
michael@0 1733 return following(startPos);
michael@0 1734 }
michael@0 1735 }
michael@0 1736 // If the allocation failed, just fall through to the "no breaks found" case.
michael@0 1737 }
michael@0 1738
michael@0 1739 // If we get here, there were no language-based breaks. Set the text pointer
michael@0 1740 // to the original proposed break.
michael@0 1741 utext_setNativeIndex(fText, reverse ? startPos : endPos);
michael@0 1742 return (reverse ? startPos : endPos);
michael@0 1743 }
michael@0 1744
michael@0 1745 // defined in ucln_cmn.h
michael@0 1746
michael@0 1747 U_NAMESPACE_END
michael@0 1748
michael@0 1749
michael@0 1750 static icu::UStack *gLanguageBreakFactories = NULL;
michael@0 1751 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
michael@0 1752
michael@0 1753 /**
michael@0 1754 * Release all static memory held by breakiterator.
michael@0 1755 */
michael@0 1756 U_CDECL_BEGIN
michael@0 1757 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
michael@0 1758 if (gLanguageBreakFactories) {
michael@0 1759 delete gLanguageBreakFactories;
michael@0 1760 gLanguageBreakFactories = NULL;
michael@0 1761 }
michael@0 1762 gLanguageBreakFactoriesInitOnce.reset();
michael@0 1763 return TRUE;
michael@0 1764 }
michael@0 1765 U_CDECL_END
michael@0 1766
michael@0 1767 U_CDECL_BEGIN
michael@0 1768 static void U_CALLCONV _deleteFactory(void *obj) {
michael@0 1769 delete (icu::LanguageBreakFactory *) obj;
michael@0 1770 }
michael@0 1771 U_CDECL_END
michael@0 1772 U_NAMESPACE_BEGIN
michael@0 1773
michael@0 1774 static void U_CALLCONV initLanguageFactories() {
michael@0 1775 UErrorCode status = U_ZERO_ERROR;
michael@0 1776 U_ASSERT(gLanguageBreakFactories == NULL);
michael@0 1777 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
michael@0 1778 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
michael@0 1779 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
michael@0 1780 gLanguageBreakFactories->push(builtIn, status);
michael@0 1781 #ifdef U_LOCAL_SERVICE_HOOK
michael@0 1782 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
michael@0 1783 if (extra != NULL) {
michael@0 1784 gLanguageBreakFactories->push(extra, status);
michael@0 1785 }
michael@0 1786 #endif
michael@0 1787 }
michael@0 1788 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
michael@0 1789 }
michael@0 1790
michael@0 1791
michael@0 1792 static const LanguageBreakEngine*
michael@0 1793 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
michael@0 1794 {
michael@0 1795 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
michael@0 1796 if (gLanguageBreakFactories == NULL) {
michael@0 1797 return NULL;
michael@0 1798 }
michael@0 1799
michael@0 1800 int32_t i = gLanguageBreakFactories->size();
michael@0 1801 const LanguageBreakEngine *lbe = NULL;
michael@0 1802 while (--i >= 0) {
michael@0 1803 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
michael@0 1804 lbe = factory->getEngineFor(c, breakType);
michael@0 1805 if (lbe != NULL) {
michael@0 1806 break;
michael@0 1807 }
michael@0 1808 }
michael@0 1809 return lbe;
michael@0 1810 }
michael@0 1811
michael@0 1812
michael@0 1813 //-------------------------------------------------------------------------------
michael@0 1814 //
michael@0 1815 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
michael@0 1816 // the character c.
michael@0 1817 //
michael@0 1818 //-------------------------------------------------------------------------------
michael@0 1819 const LanguageBreakEngine *
michael@0 1820 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
michael@0 1821 const LanguageBreakEngine *lbe = NULL;
michael@0 1822 UErrorCode status = U_ZERO_ERROR;
michael@0 1823
michael@0 1824 if (fLanguageBreakEngines == NULL) {
michael@0 1825 fLanguageBreakEngines = new UStack(status);
michael@0 1826 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
michael@0 1827 delete fLanguageBreakEngines;
michael@0 1828 fLanguageBreakEngines = 0;
michael@0 1829 return NULL;
michael@0 1830 }
michael@0 1831 }
michael@0 1832
michael@0 1833 int32_t i = fLanguageBreakEngines->size();
michael@0 1834 while (--i >= 0) {
michael@0 1835 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
michael@0 1836 if (lbe->handles(c, fBreakType)) {
michael@0 1837 return lbe;
michael@0 1838 }
michael@0 1839 }
michael@0 1840
michael@0 1841 // No existing dictionary took the character. See if a factory wants to
michael@0 1842 // give us a new LanguageBreakEngine for this character.
michael@0 1843 lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
michael@0 1844
michael@0 1845 // If we got one, use it and push it on our stack.
michael@0 1846 if (lbe != NULL) {
michael@0 1847 fLanguageBreakEngines->push((void *)lbe, status);
michael@0 1848 // Even if we can't remember it, we can keep looking it up, so
michael@0 1849 // return it even if the push fails.
michael@0 1850 return lbe;
michael@0 1851 }
michael@0 1852
michael@0 1853 // No engine is forthcoming for this character. Add it to the
michael@0 1854 // reject set. Create the reject break engine if needed.
michael@0 1855 if (fUnhandledBreakEngine == NULL) {
michael@0 1856 fUnhandledBreakEngine = new UnhandledEngine(status);
michael@0 1857 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
michael@0 1858 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1859 }
michael@0 1860 // Put it last so that scripts for which we have an engine get tried
michael@0 1861 // first.
michael@0 1862 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
michael@0 1863 // If we can't insert it, or creation failed, get rid of it
michael@0 1864 if (U_FAILURE(status)) {
michael@0 1865 delete fUnhandledBreakEngine;
michael@0 1866 fUnhandledBreakEngine = 0;
michael@0 1867 return NULL;
michael@0 1868 }
michael@0 1869 }
michael@0 1870
michael@0 1871 // Tell the reject engine about the character; at its discretion, it may
michael@0 1872 // add more than just the one character.
michael@0 1873 fUnhandledBreakEngine->handleCharacter(c, fBreakType);
michael@0 1874
michael@0 1875 return fUnhandledBreakEngine;
michael@0 1876 }
michael@0 1877
michael@0 1878
michael@0 1879
michael@0 1880 /*int32_t RuleBasedBreakIterator::getBreakType() const {
michael@0 1881 return fBreakType;
michael@0 1882 }*/
michael@0 1883
michael@0 1884 void RuleBasedBreakIterator::setBreakType(int32_t type) {
michael@0 1885 fBreakType = type;
michael@0 1886 reset();
michael@0 1887 }
michael@0 1888
michael@0 1889 U_NAMESPACE_END
michael@0 1890
michael@0 1891 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial