michael@0: /* michael@0: *************************************************************************** michael@0: * Copyright (C) 1999-2013 International Business Machines Corporation michael@0: * and others. All rights reserved. michael@0: *************************************************************************** michael@0: */ michael@0: // michael@0: // file: rbbi.c Contains the implementation of the rule based break iterator michael@0: // runtime engine and the API implementation for michael@0: // class RuleBasedBreakIterator michael@0: // michael@0: michael@0: #include "utypeinfo.h" // for 'typeid' to work michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/rbbi.h" michael@0: #include "unicode/schriter.h" michael@0: #include "unicode/uchriter.h" michael@0: #include "unicode/udata.h" michael@0: #include "unicode/uclean.h" michael@0: #include "rbbidata.h" michael@0: #include "rbbirb.h" michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: #include "umutex.h" michael@0: #include "ucln_cmn.h" michael@0: #include "brkeng.h" michael@0: michael@0: #include "uassert.h" michael@0: #include "uvector.h" michael@0: michael@0: // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. michael@0: #if U_LOCAL_SERVICE_HOOK michael@0: #include "localsvc.h" michael@0: #endif michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: static UBool fTrace = FALSE; michael@0: #endif michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: // The state number of the starting state michael@0: #define START_STATE 1 michael@0: michael@0: // The state-transition value indicating "stop" michael@0: #define STOP_STATE 0 michael@0: michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) michael@0: michael@0: michael@0: //======================================================================= michael@0: // constructors michael@0: //======================================================================= michael@0: michael@0: /** michael@0: * Constructs a RuleBasedBreakIterator that uses the already-created michael@0: * tables object that is passed in as a parameter. michael@0: */ michael@0: RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) michael@0: { michael@0: init(); michael@0: fData = new RBBIDataWrapper(data, status); // status checked in constructor michael@0: if (U_FAILURE(status)) {return;} michael@0: if(fData == 0) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Same as above but does not adopt memory michael@0: */ michael@0: RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) michael@0: { michael@0: init(); michael@0: fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor michael@0: if (U_FAILURE(status)) {return;} michael@0: if(fData == 0) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: michael@0: // michael@0: // Construct from precompiled binary rules (tables). This constructor is public API, michael@0: // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). michael@0: // michael@0: RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, michael@0: uint32_t ruleLength, michael@0: UErrorCode &status) { michael@0: init(); michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; michael@0: if (data->fLength > ruleLength) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return; michael@0: } michael@0: fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); michael@0: if (U_FAILURE(status)) {return;} michael@0: if(fData == 0) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // Constructor from a UDataMemory handle to precompiled break rules michael@0: // stored in an ICU data file. michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) michael@0: { michael@0: init(); michael@0: fData = new RBBIDataWrapper(udm, status); // status checked in constructor michael@0: if (U_FAILURE(status)) {return;} michael@0: if(fData == 0) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // Constructor from a set of rules supplied as a string. michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, michael@0: UParseError &parseError, michael@0: UErrorCode &status) michael@0: { michael@0: init(); michael@0: if (U_FAILURE(status)) {return;} michael@0: RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) michael@0: RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); michael@0: // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that michael@0: // creates and returns a complete RBBI. From here, in a constructor, we michael@0: // can't just return the object created by the builder factory, hence michael@0: // the assignment of the factory created object to "this". michael@0: if (U_SUCCESS(status)) { michael@0: *this = *bi; michael@0: delete bi; michael@0: } michael@0: } michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // Default Constructor. Create an empty shell that can be set up later. michael@0: // Used when creating a RuleBasedBreakIterator from a set michael@0: // of rules. michael@0: //------------------------------------------------------------------------------- michael@0: RuleBasedBreakIterator::RuleBasedBreakIterator() { michael@0: init(); michael@0: } michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // Copy constructor. Will produce a break iterator with the same behavior, michael@0: // and which iterates over the same text, as the one passed in. michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) michael@0: : BreakIterator(other) michael@0: { michael@0: this->init(); michael@0: *this = other; michael@0: } michael@0: michael@0: michael@0: /** michael@0: * Destructor michael@0: */ michael@0: RuleBasedBreakIterator::~RuleBasedBreakIterator() { michael@0: if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { michael@0: // fCharIter was adopted from the outside. michael@0: delete fCharIter; michael@0: } michael@0: fCharIter = NULL; michael@0: delete fSCharIter; michael@0: fCharIter = NULL; michael@0: delete fDCharIter; michael@0: fDCharIter = NULL; michael@0: michael@0: utext_close(fText); michael@0: michael@0: if (fData != NULL) { michael@0: fData->removeReference(); michael@0: fData = NULL; michael@0: } michael@0: if (fCachedBreakPositions) { michael@0: uprv_free(fCachedBreakPositions); michael@0: fCachedBreakPositions = NULL; michael@0: } michael@0: if (fLanguageBreakEngines) { michael@0: delete fLanguageBreakEngines; michael@0: fLanguageBreakEngines = NULL; michael@0: } michael@0: if (fUnhandledBreakEngine) { michael@0: delete fUnhandledBreakEngine; michael@0: fUnhandledBreakEngine = NULL; michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * Assignment operator. Sets this iterator to have the same behavior, michael@0: * and iterate over the same text, as the one passed in. michael@0: */ michael@0: RuleBasedBreakIterator& michael@0: RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { michael@0: if (this == &that) { michael@0: return *this; michael@0: } michael@0: reset(); // Delete break cache information michael@0: fBreakType = that.fBreakType; michael@0: if (fLanguageBreakEngines != NULL) { michael@0: delete fLanguageBreakEngines; michael@0: fLanguageBreakEngines = NULL; // Just rebuild for now michael@0: } michael@0: // TODO: clone fLanguageBreakEngines from "that" michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); michael@0: michael@0: if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { michael@0: delete fCharIter; michael@0: } michael@0: fCharIter = NULL; michael@0: michael@0: if (that.fCharIter != NULL ) { michael@0: // This is a little bit tricky - it will intially appear that michael@0: // this->fCharIter is adopted, even if that->fCharIter was michael@0: // not adopted. That's ok. michael@0: fCharIter = that.fCharIter->clone(); michael@0: } michael@0: michael@0: if (fData != NULL) { michael@0: fData->removeReference(); michael@0: fData = NULL; michael@0: } michael@0: if (that.fData != NULL) { michael@0: fData = that.fData->addReference(); michael@0: } michael@0: michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // init() Shared initialization routine. Used by all the constructors. michael@0: // Initializes all fields, leaving the object in a consistent state. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: void RuleBasedBreakIterator::init() { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: fText = utext_openUChars(NULL, NULL, 0, &status); michael@0: fCharIter = NULL; michael@0: fSCharIter = NULL; michael@0: fDCharIter = NULL; michael@0: fData = NULL; michael@0: fLastRuleStatusIndex = 0; michael@0: fLastStatusIndexValid = TRUE; michael@0: fDictionaryCharCount = 0; michael@0: fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable michael@0: // dictionary behavior for Break Iterators that are michael@0: // built from rules. Even better would be the ability to michael@0: // declare the type in the rules. michael@0: michael@0: fCachedBreakPositions = NULL; michael@0: fLanguageBreakEngines = NULL; michael@0: fUnhandledBreakEngine = NULL; michael@0: fNumCachedBreakPositions = 0; michael@0: fPositionInCache = 0; michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: static UBool debugInitDone = FALSE; michael@0: if (debugInitDone == FALSE) { michael@0: char *debugEnv = getenv("U_RBBIDEBUG"); michael@0: if (debugEnv && uprv_strstr(debugEnv, "trace")) { michael@0: fTrace = TRUE; michael@0: } michael@0: debugInitDone = TRUE; michael@0: } michael@0: #endif michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------- michael@0: // michael@0: // clone - Returns a newly-constructed RuleBasedBreakIterator with the same michael@0: // behavior, and iterating over the same text, as this one. michael@0: // Virtual function: does the right thing with subclasses. michael@0: // michael@0: //----------------------------------------------------------------------------- michael@0: BreakIterator* michael@0: RuleBasedBreakIterator::clone(void) const { michael@0: return new RuleBasedBreakIterator(*this); michael@0: } michael@0: michael@0: /** michael@0: * Equality operator. Returns TRUE if both BreakIterators are of the michael@0: * same class, have the same behavior, and iterate over the same text. michael@0: */ michael@0: UBool michael@0: RuleBasedBreakIterator::operator==(const BreakIterator& that) const { michael@0: if (typeid(*this) != typeid(that)) { michael@0: return FALSE; michael@0: } michael@0: michael@0: const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; michael@0: michael@0: if (!utext_equals(fText, that2.fText)) { michael@0: // The two break iterators are operating on different text, michael@0: // or have a different interation position. michael@0: return FALSE; michael@0: }; michael@0: michael@0: // TODO: need a check for when in a dictionary region at different offsets. michael@0: michael@0: if (that2.fData == fData || michael@0: (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { michael@0: // The two break iterators are using the same rules. michael@0: return TRUE; michael@0: } michael@0: return FALSE; michael@0: } michael@0: michael@0: /** michael@0: * Compute a hash code for this BreakIterator michael@0: * @return A hash code michael@0: */ michael@0: int32_t michael@0: RuleBasedBreakIterator::hashCode(void) const { michael@0: int32_t hash = 0; michael@0: if (fData != NULL) { michael@0: hash = fData->hashCode(); michael@0: } michael@0: return hash; michael@0: } michael@0: michael@0: michael@0: void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return; michael@0: } michael@0: reset(); michael@0: fText = utext_clone(fText, ut, FALSE, TRUE, &status); michael@0: michael@0: // Set up a dummy CharacterIterator to be returned if anyone michael@0: // calls getText(). With input from UText, there is no reasonable michael@0: // way to return a characterIterator over the actual input text. michael@0: // Return one over an empty string instead - this is the closest michael@0: // we can come to signaling a failure. michael@0: // (GetText() is obsolete, this failure is sort of OK) michael@0: if (fDCharIter == NULL) { michael@0: static const UChar c = 0; michael@0: fDCharIter = new UCharCharacterIterator(&c, 0); michael@0: if (fDCharIter == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: return; michael@0: } michael@0: } michael@0: michael@0: if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { michael@0: // existing fCharIter was adopted from the outside. Delete it now. michael@0: delete fCharIter; michael@0: } michael@0: fCharIter = fDCharIter; michael@0: michael@0: this->first(); michael@0: } michael@0: michael@0: michael@0: UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { michael@0: UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); michael@0: return result; michael@0: } michael@0: michael@0: michael@0: michael@0: /** michael@0: * Returns the description used to create this iterator michael@0: */ michael@0: const UnicodeString& michael@0: RuleBasedBreakIterator::getRules() const { michael@0: if (fData != NULL) { michael@0: return fData->getRuleSourceString(); michael@0: } else { michael@0: static const UnicodeString *s; michael@0: if (s == NULL) { michael@0: // TODO: something more elegant here. michael@0: // perhaps API should return the string by value. michael@0: // Note: thread unsafe init & leak are semi-ok, better than michael@0: // what was before. Sould be cleaned up, though. michael@0: s = new UnicodeString; michael@0: } michael@0: return *s; michael@0: } michael@0: } michael@0: michael@0: //======================================================================= michael@0: // BreakIterator overrides michael@0: //======================================================================= michael@0: michael@0: /** michael@0: * Return a CharacterIterator over the text being analyzed. michael@0: */ michael@0: CharacterIterator& michael@0: RuleBasedBreakIterator::getText() const { michael@0: return *fCharIter; michael@0: } michael@0: michael@0: /** michael@0: * Set the iterator to analyze a new piece of text. This function resets michael@0: * the current iteration position to the beginning of the text. michael@0: * @param newText An iterator over the text to analyze. michael@0: */ michael@0: void michael@0: RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { michael@0: // If we are holding a CharacterIterator adopted from a michael@0: // previous call to this function, delete it now. michael@0: if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { michael@0: delete fCharIter; michael@0: } michael@0: michael@0: fCharIter = newText; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: reset(); michael@0: if (newText==NULL || newText->startIndex() != 0) { michael@0: // startIndex !=0 wants to be an error, but there's no way to report it. michael@0: // Make the iterator text be an empty string. michael@0: fText = utext_openUChars(fText, NULL, 0, &status); michael@0: } else { michael@0: fText = utext_openCharacterIterator(fText, newText, &status); michael@0: } michael@0: this->first(); michael@0: } michael@0: michael@0: /** michael@0: * Set the iterator to analyze a new piece of text. This function resets michael@0: * the current iteration position to the beginning of the text. michael@0: * @param newText An iterator over the text to analyze. michael@0: */ michael@0: void michael@0: RuleBasedBreakIterator::setText(const UnicodeString& newText) { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: reset(); michael@0: fText = utext_openConstUnicodeString(fText, &newText, &status); michael@0: michael@0: // Set up a character iterator on the string. michael@0: // Needed in case someone calls getText(). michael@0: // Can not, unfortunately, do this lazily on the (probably never) michael@0: // call to getText(), because getText is const. michael@0: if (fSCharIter == NULL) { michael@0: fSCharIter = new StringCharacterIterator(newText); michael@0: } else { michael@0: fSCharIter->setText(newText); michael@0: } michael@0: michael@0: if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { michael@0: // old fCharIter was adopted from the outside. Delete it. michael@0: delete fCharIter; michael@0: } michael@0: fCharIter = fSCharIter; michael@0: michael@0: this->first(); michael@0: } michael@0: michael@0: michael@0: /** michael@0: * Provide a new UText for the input text. Must reference text with contents identical michael@0: * to the original. michael@0: * Intended for use with text data originating in Java (garbage collected) environments michael@0: * where the data may be moved in memory at arbitrary times. michael@0: */ michael@0: RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: if (input == NULL) { michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return *this; michael@0: } michael@0: int64_t pos = utext_getNativeIndex(fText); michael@0: // Shallow read-only clone of the new UText into the existing input UText michael@0: fText = utext_clone(fText, input, FALSE, TRUE, &status); michael@0: if (U_FAILURE(status)) { michael@0: return *this; michael@0: } michael@0: utext_setNativeIndex(fText, pos); michael@0: if (utext_getNativeIndex(fText) != pos) { michael@0: // Sanity check. The new input utext is supposed to have the exact same michael@0: // contents as the old. If we can't set to the same position, it doesn't. michael@0: // The contents underlying the old utext might be invalid at this point, michael@0: // so it's not safe to check directly. michael@0: status = U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: return *this; michael@0: } michael@0: michael@0: michael@0: /** michael@0: * Sets the current iteration position to the beginning of the text. michael@0: * @return The offset of the beginning of the text. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::first(void) { michael@0: reset(); michael@0: fLastRuleStatusIndex = 0; michael@0: fLastStatusIndexValid = TRUE; michael@0: //if (fText == NULL) michael@0: // return BreakIterator::DONE; michael@0: michael@0: utext_setNativeIndex(fText, 0); michael@0: return 0; michael@0: } michael@0: michael@0: /** michael@0: * Sets the current iteration position to the end of the text. michael@0: * @return The text's past-the-end offset. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::last(void) { michael@0: reset(); michael@0: if (fText == NULL) { michael@0: fLastRuleStatusIndex = 0; michael@0: fLastStatusIndexValid = TRUE; michael@0: return BreakIterator::DONE; michael@0: } michael@0: michael@0: fLastStatusIndexValid = FALSE; michael@0: int32_t pos = (int32_t)utext_nativeLength(fText); michael@0: utext_setNativeIndex(fText, pos); michael@0: return pos; michael@0: } michael@0: michael@0: /** michael@0: * Advances the iterator either forward or backward the specified number of steps. michael@0: * Negative values move backward, and positive values move forward. This is michael@0: * equivalent to repeatedly calling next() or previous(). michael@0: * @param n The number of steps to move. The sign indicates the direction michael@0: * (negative is backwards, and positive is forwards). michael@0: * @return The character offset of the boundary position n boundaries away from michael@0: * the current one. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::next(int32_t n) { michael@0: int32_t result = current(); michael@0: while (n > 0) { michael@0: result = next(); michael@0: --n; michael@0: } michael@0: while (n < 0) { michael@0: result = previous(); michael@0: ++n; michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Advances the iterator to the next boundary position. michael@0: * @return The position of the first boundary after this one. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::next(void) { michael@0: // if we have cached break positions and we're still in the range michael@0: // covered by them, just move one step forward in the cache michael@0: if (fCachedBreakPositions != NULL) { michael@0: if (fPositionInCache < fNumCachedBreakPositions - 1) { michael@0: ++fPositionInCache; michael@0: int32_t pos = fCachedBreakPositions[fPositionInCache]; michael@0: utext_setNativeIndex(fText, pos); michael@0: return pos; michael@0: } michael@0: else { michael@0: reset(); michael@0: } michael@0: } michael@0: michael@0: int32_t startPos = current(); michael@0: int32_t result = handleNext(fData->fForwardTable); michael@0: if (fDictionaryCharCount > 0) { michael@0: result = checkDictionary(startPos, result, FALSE); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Advances the iterator backwards, to the last boundary preceding this one. michael@0: * @return The position of the last boundary position preceding this one. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::previous(void) { michael@0: int32_t result; michael@0: int32_t startPos; michael@0: michael@0: // if we have cached break positions and we're still in the range michael@0: // covered by them, just move one step backward in the cache michael@0: if (fCachedBreakPositions != NULL) { michael@0: if (fPositionInCache > 0) { michael@0: --fPositionInCache; michael@0: // If we're at the beginning of the cache, need to reevaluate the michael@0: // rule status michael@0: if (fPositionInCache <= 0) { michael@0: fLastStatusIndexValid = FALSE; michael@0: } michael@0: int32_t pos = fCachedBreakPositions[fPositionInCache]; michael@0: utext_setNativeIndex(fText, pos); michael@0: return pos; michael@0: } michael@0: else { michael@0: reset(); michael@0: } michael@0: } michael@0: michael@0: // if we're already sitting at the beginning of the text, return DONE michael@0: if (fText == NULL || (startPos = current()) == 0) { michael@0: fLastRuleStatusIndex = 0; michael@0: fLastStatusIndexValid = TRUE; michael@0: return BreakIterator::DONE; michael@0: } michael@0: michael@0: if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { michael@0: result = handlePrevious(fData->fReverseTable); michael@0: if (fDictionaryCharCount > 0) { michael@0: result = checkDictionary(result, startPos, TRUE); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: // old rule syntax michael@0: // set things up. handlePrevious() will back us up to some valid michael@0: // break position before the current position (we back our internal michael@0: // iterator up one step to prevent handlePrevious() from returning michael@0: // the current position), but not necessarily the last one before michael@0: michael@0: // where we started michael@0: michael@0: int32_t start = current(); michael@0: michael@0: (void)UTEXT_PREVIOUS32(fText); michael@0: int32_t lastResult = handlePrevious(fData->fReverseTable); michael@0: if (lastResult == UBRK_DONE) { michael@0: lastResult = 0; michael@0: utext_setNativeIndex(fText, 0); michael@0: } michael@0: result = lastResult; michael@0: int32_t lastTag = 0; michael@0: UBool breakTagValid = FALSE; michael@0: michael@0: // iterate forward from the known break position until we pass our michael@0: // starting point. The last break position before the starting michael@0: // point is our return value michael@0: michael@0: for (;;) { michael@0: result = next(); michael@0: if (result == BreakIterator::DONE || result >= start) { michael@0: break; michael@0: } michael@0: lastResult = result; michael@0: lastTag = fLastRuleStatusIndex; michael@0: breakTagValid = TRUE; michael@0: } michael@0: michael@0: // fLastBreakTag wants to have the value for section of text preceding michael@0: // the result position that we are to return (in lastResult.) If michael@0: // the backwards rules overshot and the above loop had to do two or more michael@0: // next()s to move up to the desired return position, we will have a valid michael@0: // tag value. But, if handlePrevious() took us to exactly the correct result positon, michael@0: // we wont have a tag value for that position, which is only set by handleNext(). michael@0: michael@0: // set the current iteration position to be the last break position michael@0: // before where we started, and then return that value michael@0: utext_setNativeIndex(fText, lastResult); michael@0: fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() michael@0: fLastStatusIndexValid = breakTagValid; michael@0: michael@0: // No need to check the dictionary; it will have been handled by michael@0: // next() michael@0: michael@0: return lastResult; michael@0: } michael@0: michael@0: /** michael@0: * Sets the iterator to refer to the first boundary position following michael@0: * the specified position. michael@0: * @offset The position from which to begin searching for a break position. michael@0: * @return The position of the first break after the current position. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::following(int32_t offset) { michael@0: // if we have cached break positions and offset is in the range michael@0: // covered by them, use them michael@0: // TODO: could use binary search michael@0: // TODO: what if offset is outside range, but break is not? michael@0: if (fCachedBreakPositions != NULL) { michael@0: if (offset >= fCachedBreakPositions[0] michael@0: && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { michael@0: fPositionInCache = 0; michael@0: // We are guaranteed not to leave the array due to range test above michael@0: while (offset >= fCachedBreakPositions[fPositionInCache]) { michael@0: ++fPositionInCache; michael@0: } michael@0: int32_t pos = fCachedBreakPositions[fPositionInCache]; michael@0: utext_setNativeIndex(fText, pos); michael@0: return pos; michael@0: } michael@0: else { michael@0: reset(); michael@0: } michael@0: } michael@0: michael@0: // if the offset passed in is already past the end of the text, michael@0: // just return DONE; if it's before the beginning, return the michael@0: // text's starting offset michael@0: fLastRuleStatusIndex = 0; michael@0: fLastStatusIndexValid = TRUE; michael@0: if (fText == NULL || offset >= utext_nativeLength(fText)) { michael@0: last(); michael@0: return next(); michael@0: } michael@0: else if (offset < 0) { michael@0: return first(); michael@0: } michael@0: michael@0: // otherwise, set our internal iteration position (temporarily) michael@0: // to the position passed in. If this is the _beginning_ position, michael@0: // then we can just use next() to get our return value michael@0: michael@0: int32_t result = 0; michael@0: michael@0: if (fData->fSafeRevTable != NULL) { michael@0: // new rule syntax michael@0: utext_setNativeIndex(fText, offset); michael@0: // move forward one codepoint to prepare for moving back to a michael@0: // safe point. michael@0: // this handles offset being between a supplementary character michael@0: (void)UTEXT_NEXT32(fText); michael@0: // handlePrevious will move most of the time to < 1 boundary away michael@0: handlePrevious(fData->fSafeRevTable); michael@0: int32_t result = next(); michael@0: while (result <= offset) { michael@0: result = next(); michael@0: } michael@0: return result; michael@0: } michael@0: if (fData->fSafeFwdTable != NULL) { michael@0: // backup plan if forward safe table is not available michael@0: utext_setNativeIndex(fText, offset); michael@0: (void)UTEXT_PREVIOUS32(fText); michael@0: // handle next will give result >= offset michael@0: handleNext(fData->fSafeFwdTable); michael@0: // previous will give result 0 or 1 boundary away from offset, michael@0: // most of the time michael@0: // we have to michael@0: int32_t oldresult = previous(); michael@0: while (oldresult > offset) { michael@0: int32_t result = previous(); michael@0: if (result <= offset) { michael@0: return oldresult; michael@0: } michael@0: oldresult = result; michael@0: } michael@0: int32_t result = next(); michael@0: if (result <= offset) { michael@0: return next(); michael@0: } michael@0: return result; michael@0: } michael@0: // otherwise, we have to sync up first. Use handlePrevious() to back michael@0: // up to a known break position before the specified position (if michael@0: // we can determine that the specified position is a break position, michael@0: // we don't back up at all). This may or may not be the last break michael@0: // position at or before our starting position. Advance forward michael@0: // from here until we've passed the starting position. The position michael@0: // we stop on will be the first break position after the specified one. michael@0: // old rule syntax michael@0: michael@0: utext_setNativeIndex(fText, offset); michael@0: if (offset==0 || michael@0: (offset==1 && utext_getNativeIndex(fText)==0)) { michael@0: return next(); michael@0: } michael@0: result = previous(); michael@0: michael@0: while (result != BreakIterator::DONE && result <= offset) { michael@0: result = next(); michael@0: } michael@0: michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Sets the iterator to refer to the last boundary position before the michael@0: * specified position. michael@0: * @offset The position to begin searching for a break from. michael@0: * @return The position of the last boundary before the starting position. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::preceding(int32_t offset) { michael@0: // if we have cached break positions and offset is in the range michael@0: // covered by them, use them michael@0: if (fCachedBreakPositions != NULL) { michael@0: // TODO: binary search? michael@0: // TODO: What if offset is outside range, but break is not? michael@0: if (offset > fCachedBreakPositions[0] michael@0: && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { michael@0: fPositionInCache = 0; michael@0: while (fPositionInCache < fNumCachedBreakPositions michael@0: && offset > fCachedBreakPositions[fPositionInCache]) michael@0: ++fPositionInCache; michael@0: --fPositionInCache; michael@0: // If we're at the beginning of the cache, need to reevaluate the michael@0: // rule status michael@0: if (fPositionInCache <= 0) { michael@0: fLastStatusIndexValid = FALSE; michael@0: } michael@0: utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); michael@0: return fCachedBreakPositions[fPositionInCache]; michael@0: } michael@0: else { michael@0: reset(); michael@0: } michael@0: } michael@0: michael@0: // if the offset passed in is already past the end of the text, michael@0: // just return DONE; if it's before the beginning, return the michael@0: // text's starting offset michael@0: if (fText == NULL || offset > utext_nativeLength(fText)) { michael@0: // return BreakIterator::DONE; michael@0: return last(); michael@0: } michael@0: else if (offset < 0) { michael@0: return first(); michael@0: } michael@0: michael@0: // if we start by updating the current iteration position to the michael@0: // position specified by the caller, we can just use previous() michael@0: // to carry out this operation michael@0: michael@0: if (fData->fSafeFwdTable != NULL) { michael@0: // new rule syntax michael@0: utext_setNativeIndex(fText, offset); michael@0: int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: if (newOffset != offset) { michael@0: // Will come here if specified offset was not a code point boundary AND michael@0: // the underlying implmentation is using UText, which snaps any non-code-point-boundary michael@0: // indices to the containing code point. michael@0: // For breakitereator::preceding only, these non-code-point indices need to be moved michael@0: // up to refer to the following codepoint. michael@0: (void)UTEXT_NEXT32(fText); michael@0: offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: } michael@0: michael@0: // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, michael@0: // rather than adjusting the position unconditionally? michael@0: // (Change would interact with safe rules.) michael@0: // TODO: change RBBI behavior for off-boundary indices to match that of UText? michael@0: // affects only preceding(), seems cleaner, but is slightly different. michael@0: (void)UTEXT_PREVIOUS32(fText); michael@0: handleNext(fData->fSafeFwdTable); michael@0: int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: while (result >= offset) { michael@0: result = previous(); michael@0: } michael@0: return result; michael@0: } michael@0: if (fData->fSafeRevTable != NULL) { michael@0: // backup plan if forward safe table is not available michael@0: // TODO: check whether this path can be discarded michael@0: // It's probably OK to say that rules must supply both safe tables michael@0: // if they use safe tables at all. We have certainly never described michael@0: // to anyone how to work with just one safe table. michael@0: utext_setNativeIndex(fText, offset); michael@0: (void)UTEXT_NEXT32(fText); michael@0: michael@0: // handle previous will give result <= offset michael@0: handlePrevious(fData->fSafeRevTable); michael@0: michael@0: // next will give result 0 or 1 boundary away from offset, michael@0: // most of the time michael@0: // we have to michael@0: int32_t oldresult = next(); michael@0: while (oldresult < offset) { michael@0: int32_t result = next(); michael@0: if (result >= offset) { michael@0: return oldresult; michael@0: } michael@0: oldresult = result; michael@0: } michael@0: int32_t result = previous(); michael@0: if (result >= offset) { michael@0: return previous(); michael@0: } michael@0: return result; michael@0: } michael@0: michael@0: // old rule syntax michael@0: utext_setNativeIndex(fText, offset); michael@0: return previous(); michael@0: } michael@0: michael@0: /** michael@0: * Returns true if the specfied position is a boundary position. As a side michael@0: * effect, leaves the iterator pointing to the first boundary position at michael@0: * or after "offset". michael@0: * @param offset the offset to check. michael@0: * @return True if "offset" is a boundary position. michael@0: */ michael@0: UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { michael@0: // the beginning index of the iterator is always a boundary position by definition michael@0: if (offset == 0) { michael@0: first(); // For side effects on current position, tag values. michael@0: return TRUE; michael@0: } michael@0: michael@0: if (offset == (int32_t)utext_nativeLength(fText)) { michael@0: last(); // For side effects on current position, tag values. michael@0: return TRUE; michael@0: } michael@0: michael@0: // out-of-range indexes are never boundary positions michael@0: if (offset < 0) { michael@0: first(); // For side effects on current position, tag values. michael@0: return FALSE; michael@0: } michael@0: michael@0: if (offset > utext_nativeLength(fText)) { michael@0: last(); // For side effects on current position, tag values. michael@0: return FALSE; michael@0: } michael@0: michael@0: // otherwise, we can use following() on the position before the specified michael@0: // one and return true if the position we get back is the one the user michael@0: // specified michael@0: utext_previous32From(fText, offset); michael@0: int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: UBool result = following(backOne) == offset; michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Returns the current iteration position. michael@0: * @return The current iteration position. michael@0: */ michael@0: int32_t RuleBasedBreakIterator::current(void) const { michael@0: int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: return pos; michael@0: } michael@0: michael@0: //======================================================================= michael@0: // implementation michael@0: //======================================================================= michael@0: michael@0: // michael@0: // RBBIRunMode - the state machine runs an extra iteration at the beginning and end michael@0: // of user text. A variable with this enum type keeps track of where we michael@0: // are. The state machine only fetches user input while in the RUN mode. michael@0: // michael@0: enum RBBIRunMode { michael@0: RBBI_START, // state machine processing is before first char of input michael@0: RBBI_RUN, // state machine processing is in the user text michael@0: RBBI_END // state machine processing is after end of user text. michael@0: }; michael@0: michael@0: michael@0: //----------------------------------------------------------------------------------- michael@0: // michael@0: // handleNext(stateTable) michael@0: // This method is the actual implementation of the rbbi next() method. michael@0: // This method initializes the state machine to state 1 michael@0: // and advances through the text character by character until we reach the end michael@0: // of the text or the state machine transitions to state 0. We update our return michael@0: // value every time the state machine passes through an accepting state. michael@0: // michael@0: //----------------------------------------------------------------------------------- michael@0: int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { michael@0: int32_t state; michael@0: uint16_t category = 0; michael@0: RBBIRunMode mode; michael@0: michael@0: RBBIStateTableRow *row; michael@0: UChar32 c; michael@0: int32_t lookaheadStatus = 0; michael@0: int32_t lookaheadTagIdx = 0; michael@0: int32_t result = 0; michael@0: int32_t initialPosition = 0; michael@0: int32_t lookaheadResult = 0; michael@0: UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; michael@0: const char *tableData = statetable->fTableData; michael@0: uint32_t tableRowLen = statetable->fRowLen; michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: if (fTrace) { michael@0: RBBIDebugPuts("Handle Next pos char state category"); michael@0: } michael@0: #endif michael@0: michael@0: // No matter what, handleNext alway correctly sets the break tag value. michael@0: fLastStatusIndexValid = TRUE; michael@0: fLastRuleStatusIndex = 0; michael@0: michael@0: // if we're already at the end of the text, return DONE. michael@0: initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: result = initialPosition; michael@0: c = UTEXT_NEXT32(fText); michael@0: if (fData == NULL || c==U_SENTINEL) { michael@0: return BreakIterator::DONE; michael@0: } michael@0: michael@0: // Set the initial state for the state machine michael@0: state = START_STATE; michael@0: row = (RBBIStateTableRow *) michael@0: //(statetable->fTableData + (statetable->fRowLen * state)); michael@0: (tableData + tableRowLen * state); michael@0: michael@0: michael@0: mode = RBBI_RUN; michael@0: if (statetable->fFlags & RBBI_BOF_REQUIRED) { michael@0: category = 2; michael@0: mode = RBBI_START; michael@0: } michael@0: michael@0: michael@0: // loop until we reach the end of the text or transition to state 0 michael@0: // michael@0: for (;;) { michael@0: if (c == U_SENTINEL) { michael@0: // Reached end of input string. michael@0: if (mode == RBBI_END) { michael@0: // We have already run the loop one last time with the michael@0: // character set to the psueudo {eof} value. Now it is time michael@0: // to unconditionally bail out. michael@0: if (lookaheadResult > result) { michael@0: // We ran off the end of the string with a pending look-ahead match. michael@0: // Treat this as if the look-ahead condition had been met, and return michael@0: // the match at the / position from the look-ahead rule. michael@0: result = lookaheadResult; michael@0: fLastRuleStatusIndex = lookaheadTagIdx; michael@0: lookaheadStatus = 0; michael@0: } michael@0: break; michael@0: } michael@0: // Run the loop one last time with the fake end-of-input character category. michael@0: mode = RBBI_END; michael@0: category = 1; michael@0: } michael@0: michael@0: // michael@0: // Get the char category. An incoming category of 1 or 2 means that michael@0: // we are preset for doing the beginning or end of input, and michael@0: // that we shouldn't get a category from an actual text input character. michael@0: // michael@0: if (mode == RBBI_RUN) { michael@0: // look up the current character's character category, which tells us michael@0: // which column in the state table to look at. michael@0: // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, michael@0: // not the size of the character going in, which is a UChar32. michael@0: // michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: michael@0: // Check the dictionary bit in the character's category. michael@0: // Counter is only used by dictionary based iterators (subclasses). michael@0: // Chars that need to be handled by a dictionary have a flag bit set michael@0: // in their category values. michael@0: // michael@0: if ((category & 0x4000) != 0) { michael@0: fDictionaryCharCount++; michael@0: // And off the dictionary flag bit. michael@0: category &= ~0x4000; michael@0: } michael@0: } michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: if (fTrace) { michael@0: RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); michael@0: if (0x20<=c && c<0x7f) { michael@0: RBBIDebugPrintf("\"%c\" ", c); michael@0: } else { michael@0: RBBIDebugPrintf("%5x ", c); michael@0: } michael@0: RBBIDebugPrintf("%3d %3d\n", state, category); michael@0: } michael@0: #endif michael@0: michael@0: // State Transition - move machine to its next state michael@0: // michael@0: michael@0: // Note: fNextState is defined as uint16_t[2], but we are casting michael@0: // a generated RBBI table to RBBIStateTableRow and some tables michael@0: // actually have more than 2 categories. michael@0: U_ASSERT(categoryfHeader->fCatCount); michael@0: state = row->fNextState[category]; /*Not accessing beyond memory*/ michael@0: row = (RBBIStateTableRow *) michael@0: // (statetable->fTableData + (statetable->fRowLen * state)); michael@0: (tableData + tableRowLen * state); michael@0: michael@0: michael@0: if (row->fAccepting == -1) { michael@0: // Match found, common case. michael@0: if (mode != RBBI_START) { michael@0: result = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: } michael@0: fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. michael@0: } michael@0: michael@0: if (row->fLookAhead != 0) { michael@0: if (lookaheadStatus != 0 michael@0: && row->fAccepting == lookaheadStatus) { michael@0: // Lookahead match is completed. michael@0: result = lookaheadResult; michael@0: fLastRuleStatusIndex = lookaheadTagIdx; michael@0: lookaheadStatus = 0; michael@0: // TODO: make a standalone hard break in a rule work. michael@0: if (lookAheadHardBreak) { michael@0: UTEXT_SETNATIVEINDEX(fText, result); michael@0: return result; michael@0: } michael@0: // Look-ahead completed, but other rules may match further. Continue on michael@0: // TODO: junk this feature? I don't think it's used anywhwere. michael@0: goto continueOn; michael@0: } michael@0: michael@0: int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: lookaheadResult = r; michael@0: lookaheadStatus = row->fLookAhead; michael@0: lookaheadTagIdx = row->fTagIdx; michael@0: goto continueOn; michael@0: } michael@0: michael@0: michael@0: if (row->fAccepting != 0) { michael@0: // Because this is an accepting state, any in-progress look-ahead match michael@0: // is no longer relavant. Clear out the pending lookahead status. michael@0: lookaheadStatus = 0; // clear out any pending look-ahead match. michael@0: } michael@0: michael@0: continueOn: michael@0: if (state == STOP_STATE) { michael@0: // This is the normal exit from the lookup state machine. michael@0: // We have advanced through the string until it is certain that no michael@0: // longer match is possible, no matter what characters follow. michael@0: break; michael@0: } michael@0: michael@0: // Advance to the next character. michael@0: // If this is a beginning-of-input loop iteration, don't advance michael@0: // the input position. The next iteration will be processing the michael@0: // first real input character. michael@0: if (mode == RBBI_RUN) { michael@0: c = UTEXT_NEXT32(fText); michael@0: } else { michael@0: if (mode == RBBI_START) { michael@0: mode = RBBI_RUN; michael@0: } michael@0: } michael@0: michael@0: michael@0: } michael@0: michael@0: // The state machine is done. Check whether it found a match... michael@0: michael@0: // If the iterator failed to advance in the match engine, force it ahead by one. michael@0: // (This really indicates a defect in the break rules. They should always match michael@0: // at least one character.) michael@0: if (result == initialPosition) { michael@0: UTEXT_SETNATIVEINDEX(fText, initialPosition); michael@0: UTEXT_NEXT32(fText); michael@0: result = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: } michael@0: michael@0: // Leave the iterator at our result position. michael@0: UTEXT_SETNATIVEINDEX(fText, result); michael@0: #ifdef RBBI_DEBUG michael@0: if (fTrace) { michael@0: RBBIDebugPrintf("result = %d\n\n", result); michael@0: } michael@0: #endif michael@0: return result; michael@0: } michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------------- michael@0: // michael@0: // handlePrevious() michael@0: // michael@0: // Iterate backwards, according to the logic of the reverse rules. michael@0: // This version handles the exact style backwards rules. michael@0: // michael@0: // The logic of this function is very similar to handleNext(), above. michael@0: // michael@0: //----------------------------------------------------------------------------------- michael@0: int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { michael@0: int32_t state; michael@0: uint16_t category = 0; michael@0: RBBIRunMode mode; michael@0: RBBIStateTableRow *row; michael@0: UChar32 c; michael@0: int32_t lookaheadStatus = 0; michael@0: int32_t result = 0; michael@0: int32_t initialPosition = 0; michael@0: int32_t lookaheadResult = 0; michael@0: UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: if (fTrace) { michael@0: RBBIDebugPuts("Handle Previous pos char state category"); michael@0: } michael@0: #endif michael@0: michael@0: // handlePrevious() never gets the rule status. michael@0: // Flag the status as invalid; if the user ever asks for status, we will need michael@0: // to back up, then re-find the break position using handleNext(), which does michael@0: // get the status value. michael@0: fLastStatusIndexValid = FALSE; michael@0: fLastRuleStatusIndex = 0; michael@0: michael@0: // if we're already at the start of the text, return DONE. michael@0: if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { michael@0: return BreakIterator::DONE; michael@0: } michael@0: michael@0: // Set up the starting char. michael@0: initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: result = initialPosition; michael@0: c = UTEXT_PREVIOUS32(fText); michael@0: michael@0: // Set the initial state for the state machine michael@0: state = START_STATE; michael@0: row = (RBBIStateTableRow *) michael@0: (statetable->fTableData + (statetable->fRowLen * state)); michael@0: category = 3; michael@0: mode = RBBI_RUN; michael@0: if (statetable->fFlags & RBBI_BOF_REQUIRED) { michael@0: category = 2; michael@0: mode = RBBI_START; michael@0: } michael@0: michael@0: michael@0: // loop until we reach the start of the text or transition to state 0 michael@0: // michael@0: for (;;) { michael@0: if (c == U_SENTINEL) { michael@0: // Reached end of input string. michael@0: if (mode == RBBI_END) { michael@0: // We have already run the loop one last time with the michael@0: // character set to the psueudo {eof} value. Now it is time michael@0: // to unconditionally bail out. michael@0: if (lookaheadResult < result) { michael@0: // We ran off the end of the string with a pending look-ahead match. michael@0: // Treat this as if the look-ahead condition had been met, and return michael@0: // the match at the / position from the look-ahead rule. michael@0: result = lookaheadResult; michael@0: lookaheadStatus = 0; michael@0: } else if (result == initialPosition) { michael@0: // Ran off start, no match found. michael@0: // move one index one (towards the start, since we are doing a previous()) michael@0: UTEXT_SETNATIVEINDEX(fText, initialPosition); michael@0: (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. michael@0: } michael@0: break; michael@0: } michael@0: // Run the loop one last time with the fake end-of-input character category. michael@0: mode = RBBI_END; michael@0: category = 1; michael@0: } michael@0: michael@0: // michael@0: // Get the char category. An incoming category of 1 or 2 means that michael@0: // we are preset for doing the beginning or end of input, and michael@0: // that we shouldn't get a category from an actual text input character. michael@0: // michael@0: if (mode == RBBI_RUN) { michael@0: // look up the current character's character category, which tells us michael@0: // which column in the state table to look at. michael@0: // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, michael@0: // not the size of the character going in, which is a UChar32. michael@0: // michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: michael@0: // Check the dictionary bit in the character's category. michael@0: // Counter is only used by dictionary based iterators (subclasses). michael@0: // Chars that need to be handled by a dictionary have a flag bit set michael@0: // in their category values. michael@0: // michael@0: if ((category & 0x4000) != 0) { michael@0: fDictionaryCharCount++; michael@0: // And off the dictionary flag bit. michael@0: category &= ~0x4000; michael@0: } michael@0: } michael@0: michael@0: #ifdef RBBI_DEBUG michael@0: if (fTrace) { michael@0: RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); michael@0: if (0x20<=c && c<0x7f) { michael@0: RBBIDebugPrintf("\"%c\" ", c); michael@0: } else { michael@0: RBBIDebugPrintf("%5x ", c); michael@0: } michael@0: RBBIDebugPrintf("%3d %3d\n", state, category); michael@0: } michael@0: #endif michael@0: michael@0: // State Transition - move machine to its next state michael@0: // michael@0: michael@0: // Note: fNextState is defined as uint16_t[2], but we are casting michael@0: // a generated RBBI table to RBBIStateTableRow and some tables michael@0: // actually have more than 2 categories. michael@0: U_ASSERT(categoryfHeader->fCatCount); michael@0: state = row->fNextState[category]; /*Not accessing beyond memory*/ michael@0: row = (RBBIStateTableRow *) michael@0: (statetable->fTableData + (statetable->fRowLen * state)); michael@0: michael@0: if (row->fAccepting == -1) { michael@0: // Match found, common case. michael@0: result = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: } michael@0: michael@0: if (row->fLookAhead != 0) { michael@0: if (lookaheadStatus != 0 michael@0: && row->fAccepting == lookaheadStatus) { michael@0: // Lookahead match is completed. michael@0: result = lookaheadResult; michael@0: lookaheadStatus = 0; michael@0: // TODO: make a standalone hard break in a rule work. michael@0: if (lookAheadHardBreak) { michael@0: UTEXT_SETNATIVEINDEX(fText, result); michael@0: return result; michael@0: } michael@0: // Look-ahead completed, but other rules may match further. Continue on michael@0: // TODO: junk this feature? I don't think it's used anywhwere. michael@0: goto continueOn; michael@0: } michael@0: michael@0: int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: lookaheadResult = r; michael@0: lookaheadStatus = row->fLookAhead; michael@0: goto continueOn; michael@0: } michael@0: michael@0: michael@0: if (row->fAccepting != 0) { michael@0: // Because this is an accepting state, any in-progress look-ahead match michael@0: // is no longer relavant. Clear out the pending lookahead status. michael@0: lookaheadStatus = 0; michael@0: } michael@0: michael@0: continueOn: michael@0: if (state == STOP_STATE) { michael@0: // This is the normal exit from the lookup state machine. michael@0: // We have advanced through the string until it is certain that no michael@0: // longer match is possible, no matter what characters follow. michael@0: break; michael@0: } michael@0: michael@0: // Move (backwards) to the next character to process. michael@0: // If this is a beginning-of-input loop iteration, don't advance michael@0: // the input position. The next iteration will be processing the michael@0: // first real input character. michael@0: if (mode == RBBI_RUN) { michael@0: c = UTEXT_PREVIOUS32(fText); michael@0: } else { michael@0: if (mode == RBBI_START) { michael@0: mode = RBBI_RUN; michael@0: } michael@0: } michael@0: } michael@0: michael@0: // The state machine is done. Check whether it found a match... michael@0: michael@0: // If the iterator failed to advance in the match engine, force it ahead by one. michael@0: // (This really indicates a defect in the break rules. They should always match michael@0: // at least one character.) michael@0: if (result == initialPosition) { michael@0: UTEXT_SETNATIVEINDEX(fText, initialPosition); michael@0: UTEXT_PREVIOUS32(fText); michael@0: result = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: } michael@0: michael@0: // Leave the iterator at our result position. michael@0: UTEXT_SETNATIVEINDEX(fText, result); michael@0: #ifdef RBBI_DEBUG michael@0: if (fTrace) { michael@0: RBBIDebugPrintf("result = %d\n\n", result); michael@0: } michael@0: #endif michael@0: return result; michael@0: } michael@0: michael@0: michael@0: void michael@0: RuleBasedBreakIterator::reset() michael@0: { michael@0: if (fCachedBreakPositions) { michael@0: uprv_free(fCachedBreakPositions); michael@0: } michael@0: fCachedBreakPositions = NULL; michael@0: fNumCachedBreakPositions = 0; michael@0: fDictionaryCharCount = 0; michael@0: fPositionInCache = 0; michael@0: } michael@0: michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // getRuleStatus() Return the break rule tag associated with the current michael@0: // iterator position. If the iterator arrived at its current michael@0: // position by iterating forwards, the value will have been michael@0: // cached by the handleNext() function. michael@0: // michael@0: // If no cached status value is available, the status is michael@0: // found by doing a previous() followed by a next(), which michael@0: // leaves the iterator where it started, and computes the michael@0: // status while doing the next(). michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: void RuleBasedBreakIterator::makeRuleStatusValid() { michael@0: if (fLastStatusIndexValid == FALSE) { michael@0: // No cached status is available. michael@0: if (fText == NULL || current() == 0) { michael@0: // At start of text, or there is no text. Status is always zero. michael@0: fLastRuleStatusIndex = 0; michael@0: fLastStatusIndexValid = TRUE; michael@0: } else { michael@0: // Not at start of text. Find status the tedious way. michael@0: int32_t pa = current(); michael@0: previous(); michael@0: if (fNumCachedBreakPositions > 0) { michael@0: reset(); // Blow off the dictionary cache michael@0: } michael@0: int32_t pb = next(); michael@0: if (pa != pb) { michael@0: // note: the if (pa != pb) test is here only to eliminate warnings for michael@0: // unused local variables on gcc. Logically, it isn't needed. michael@0: U_ASSERT(pa == pb); michael@0: } michael@0: } michael@0: } michael@0: U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); michael@0: } michael@0: michael@0: michael@0: int32_t RuleBasedBreakIterator::getRuleStatus() const { michael@0: RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; michael@0: nonConstThis->makeRuleStatusValid(); michael@0: michael@0: // fLastRuleStatusIndex indexes to the start of the appropriate status record michael@0: // (the number of status values.) michael@0: // This function returns the last (largest) of the array of status values. michael@0: int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; michael@0: int32_t tagVal = fData->fRuleStatusTable[idx]; michael@0: michael@0: return tagVal; michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: int32_t RuleBasedBreakIterator::getRuleStatusVec( michael@0: int32_t *fillInVec, int32_t capacity, UErrorCode &status) michael@0: { michael@0: if (U_FAILURE(status)) { michael@0: return 0; michael@0: } michael@0: michael@0: RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; michael@0: nonConstThis->makeRuleStatusValid(); michael@0: int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; michael@0: int32_t numValsToCopy = numVals; michael@0: if (numVals > capacity) { michael@0: status = U_BUFFER_OVERFLOW_ERROR; michael@0: numValsToCopy = capacity; michael@0: } michael@0: int i; michael@0: for (i=0; ifRuleStatusTable[fLastRuleStatusIndex + i + 1]; michael@0: } michael@0: return numVals; michael@0: } michael@0: michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // getBinaryRules Access to the compiled form of the rules, michael@0: // for use by build system tools that save the data michael@0: // for standard iterator types. michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { michael@0: const uint8_t *retPtr = NULL; michael@0: length = 0; michael@0: michael@0: if (fData != NULL) { michael@0: retPtr = (const uint8_t *)fData->fHeader; michael@0: length = fData->fHeader->fLength; michael@0: } michael@0: return retPtr; michael@0: } michael@0: michael@0: michael@0: BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, michael@0: int32_t &bufferSize, michael@0: UErrorCode &status) michael@0: { michael@0: if (U_FAILURE(status)){ michael@0: return NULL; michael@0: } michael@0: michael@0: if (bufferSize == 0) { michael@0: bufferSize = 1; // preflighting for deprecated functionality michael@0: return NULL; michael@0: } michael@0: michael@0: BreakIterator *clonedBI = clone(); michael@0: if (clonedBI == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } else { michael@0: status = U_SAFECLONE_ALLOCATED_WARNING; michael@0: } michael@0: return (RuleBasedBreakIterator *)clonedBI; michael@0: } michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // isDictionaryChar Return true if the category lookup for this char michael@0: // indicates that it is in the set of dictionary lookup michael@0: // chars. michael@0: // michael@0: // This function is intended for use by dictionary based michael@0: // break iterators. michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { michael@0: if (fData == NULL) { michael@0: return FALSE; michael@0: } michael@0: uint16_t category; michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: return (category & 0x4000) != 0; michael@0: }*/ michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // checkDictionary This function handles all processing of characters in michael@0: // the "dictionary" set. It will determine the appropriate michael@0: // course of action, and possibly set up a cache in the michael@0: // process. michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, michael@0: int32_t endPos, michael@0: UBool reverse) { michael@0: // Reset the old break cache first. michael@0: reset(); michael@0: michael@0: // note: code segment below assumes that dictionary chars are in the michael@0: // startPos-endPos range michael@0: // value returned should be next character in sequence michael@0: if ((endPos - startPos) <= 1) { michael@0: return (reverse ? startPos : endPos); michael@0: } michael@0: michael@0: // Bug 5532. The dictionary code will crash if the input text is UTF-8 michael@0: // because native indexes are different from UTF-16 indexes. michael@0: // Temporary hack: skip dictionary lookup for UTF-8 encoded text. michael@0: // It wont give the right breaks, but it's better than a crash. michael@0: // michael@0: // Check the type of the UText by checking its pFuncs field, which michael@0: // is UText's function dispatch table. It will be the same for all michael@0: // UTF-8 UTexts and different for any other UText type. michael@0: // michael@0: // We have no other type of UText available with non-UTF-16 native indexing. michael@0: // This whole check will go away once the dictionary code is fixed. michael@0: static const void *utext_utf8Funcs; michael@0: if (utext_utf8Funcs == NULL) { michael@0: // Cache the UTF-8 UText function pointer value. michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: UText tempUText = UTEXT_INITIALIZER; michael@0: utext_openUTF8(&tempUText, NULL, 0, &status); michael@0: utext_utf8Funcs = tempUText.pFuncs; michael@0: utext_close(&tempUText); michael@0: } michael@0: if (fText->pFuncs == utext_utf8Funcs) { michael@0: return (reverse ? startPos : endPos); michael@0: } michael@0: michael@0: // Starting from the starting point, scan towards the proposed result, michael@0: // looking for the first dictionary character (which may be the one michael@0: // we're on, if we're starting in the middle of a range). michael@0: utext_setNativeIndex(fText, reverse ? endPos : startPos); michael@0: if (reverse) { michael@0: UTEXT_PREVIOUS32(fText); michael@0: } michael@0: michael@0: int32_t rangeStart = startPos; michael@0: int32_t rangeEnd = endPos; michael@0: michael@0: uint16_t category; michael@0: int32_t current; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: UStack breaks(status); michael@0: int32_t foundBreakCount = 0; michael@0: UChar32 c = utext_current32(fText); michael@0: michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: michael@0: // Is the character we're starting on a dictionary character? If so, we michael@0: // need to back up to include the entire run; otherwise the results of michael@0: // the break algorithm will differ depending on where we start. Since michael@0: // the result is cached and there is typically a non-dictionary break michael@0: // within a small number of words, there should be little performance impact. michael@0: if (category & 0x4000) { michael@0: if (reverse) { michael@0: do { michael@0: utext_next32(fText); // TODO: recast to work directly with postincrement. michael@0: c = utext_current32(fText); michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: } while (c != U_SENTINEL && (category & 0x4000)); michael@0: // Back up to the last dictionary character michael@0: rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); michael@0: if (c == U_SENTINEL) { michael@0: // c = fText->last32(); michael@0: // TODO: why was this if needed? michael@0: c = UTEXT_PREVIOUS32(fText); michael@0: } michael@0: else { michael@0: c = UTEXT_PREVIOUS32(fText); michael@0: } michael@0: } michael@0: else { michael@0: do { michael@0: c = UTEXT_PREVIOUS32(fText); michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: } michael@0: while (c != U_SENTINEL && (category & 0x4000)); michael@0: // Back up to the last dictionary character michael@0: if (c == U_SENTINEL) { michael@0: // c = fText->first32(); michael@0: c = utext_current32(fText); michael@0: } michael@0: else { michael@0: utext_next32(fText); michael@0: c = utext_current32(fText); michael@0: } michael@0: rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; michael@0: } michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: } michael@0: michael@0: // Loop through the text, looking for ranges of dictionary characters. michael@0: // For each span, find the appropriate break engine, and ask it to find michael@0: // any breaks within the span. michael@0: // Note: we always do this in the forward direction, so that the break michael@0: // cache is built in the right order. michael@0: if (reverse) { michael@0: utext_setNativeIndex(fText, rangeStart); michael@0: c = utext_current32(fText); michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: } michael@0: while(U_SUCCESS(status)) { michael@0: while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { michael@0: utext_next32(fText); // TODO: tweak for post-increment operation michael@0: c = utext_current32(fText); michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: } michael@0: if (current >= rangeEnd) { michael@0: break; michael@0: } michael@0: michael@0: // We now have a dictionary character. Get the appropriate language object michael@0: // to deal with it. michael@0: const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); michael@0: michael@0: // Ask the language object if there are any breaks. It will leave the text michael@0: // pointer on the other side of its range, ready to search for the next one. michael@0: if (lbe != NULL) { michael@0: foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); michael@0: } michael@0: michael@0: // Reload the loop variables for the next go-round michael@0: c = utext_current32(fText); michael@0: UTRIE_GET16(&fData->fTrie, c, category); michael@0: } michael@0: michael@0: // If we found breaks, build a new break cache. The first and last entries must michael@0: // be the original starting and ending position. michael@0: if (foundBreakCount > 0) { michael@0: int32_t totalBreaks = foundBreakCount; michael@0: if (startPos < breaks.elementAti(0)) { michael@0: totalBreaks += 1; michael@0: } michael@0: if (endPos > breaks.peeki()) { michael@0: totalBreaks += 1; michael@0: } michael@0: fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); michael@0: if (fCachedBreakPositions != NULL) { michael@0: int32_t out = 0; michael@0: fNumCachedBreakPositions = totalBreaks; michael@0: if (startPos < breaks.elementAti(0)) { michael@0: fCachedBreakPositions[out++] = startPos; michael@0: } michael@0: for (int32_t i = 0; i < foundBreakCount; ++i) { michael@0: fCachedBreakPositions[out++] = breaks.elementAti(i); michael@0: } michael@0: if (endPos > fCachedBreakPositions[out-1]) { michael@0: fCachedBreakPositions[out] = endPos; michael@0: } michael@0: // If there are breaks, then by definition, we are replacing the original michael@0: // proposed break by one of the breaks we found. Use following() and michael@0: // preceding() to do the work. They should never recurse in this case. michael@0: if (reverse) { michael@0: return preceding(endPos); michael@0: } michael@0: else { michael@0: return following(startPos); michael@0: } michael@0: } michael@0: // If the allocation failed, just fall through to the "no breaks found" case. michael@0: } michael@0: michael@0: // If we get here, there were no language-based breaks. Set the text pointer michael@0: // to the original proposed break. michael@0: utext_setNativeIndex(fText, reverse ? startPos : endPos); michael@0: return (reverse ? startPos : endPos); michael@0: } michael@0: michael@0: // defined in ucln_cmn.h michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: michael@0: static icu::UStack *gLanguageBreakFactories = NULL; michael@0: static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; michael@0: michael@0: /** michael@0: * Release all static memory held by breakiterator. michael@0: */ michael@0: U_CDECL_BEGIN michael@0: static UBool U_CALLCONV breakiterator_cleanup_dict(void) { michael@0: if (gLanguageBreakFactories) { michael@0: delete gLanguageBreakFactories; michael@0: gLanguageBreakFactories = NULL; michael@0: } michael@0: gLanguageBreakFactoriesInitOnce.reset(); michael@0: return TRUE; michael@0: } michael@0: U_CDECL_END michael@0: michael@0: U_CDECL_BEGIN michael@0: static void U_CALLCONV _deleteFactory(void *obj) { michael@0: delete (icu::LanguageBreakFactory *) obj; michael@0: } michael@0: U_CDECL_END michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: static void U_CALLCONV initLanguageFactories() { michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: U_ASSERT(gLanguageBreakFactories == NULL); michael@0: gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); michael@0: if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { michael@0: ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); michael@0: gLanguageBreakFactories->push(builtIn, status); michael@0: #ifdef U_LOCAL_SERVICE_HOOK michael@0: LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); michael@0: if (extra != NULL) { michael@0: gLanguageBreakFactories->push(extra, status); michael@0: } michael@0: #endif michael@0: } michael@0: ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); michael@0: } michael@0: michael@0: michael@0: static const LanguageBreakEngine* michael@0: getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) michael@0: { michael@0: umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); michael@0: if (gLanguageBreakFactories == NULL) { michael@0: return NULL; michael@0: } michael@0: michael@0: int32_t i = gLanguageBreakFactories->size(); michael@0: const LanguageBreakEngine *lbe = NULL; michael@0: while (--i >= 0) { michael@0: LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); michael@0: lbe = factory->getEngineFor(c, breakType); michael@0: if (lbe != NULL) { michael@0: break; michael@0: } michael@0: } michael@0: return lbe; michael@0: } michael@0: michael@0: michael@0: //------------------------------------------------------------------------------- michael@0: // michael@0: // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the michael@0: // the character c. michael@0: // michael@0: //------------------------------------------------------------------------------- michael@0: const LanguageBreakEngine * michael@0: RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { michael@0: const LanguageBreakEngine *lbe = NULL; michael@0: UErrorCode status = U_ZERO_ERROR; michael@0: michael@0: if (fLanguageBreakEngines == NULL) { michael@0: fLanguageBreakEngines = new UStack(status); michael@0: if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { michael@0: delete fLanguageBreakEngines; michael@0: fLanguageBreakEngines = 0; michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: int32_t i = fLanguageBreakEngines->size(); michael@0: while (--i >= 0) { michael@0: lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); michael@0: if (lbe->handles(c, fBreakType)) { michael@0: return lbe; michael@0: } michael@0: } michael@0: michael@0: // No existing dictionary took the character. See if a factory wants to michael@0: // give us a new LanguageBreakEngine for this character. michael@0: lbe = getLanguageBreakEngineFromFactory(c, fBreakType); michael@0: michael@0: // If we got one, use it and push it on our stack. michael@0: if (lbe != NULL) { michael@0: fLanguageBreakEngines->push((void *)lbe, status); michael@0: // Even if we can't remember it, we can keep looking it up, so michael@0: // return it even if the push fails. michael@0: return lbe; michael@0: } michael@0: michael@0: // No engine is forthcoming for this character. Add it to the michael@0: // reject set. Create the reject break engine if needed. michael@0: if (fUnhandledBreakEngine == NULL) { michael@0: fUnhandledBreakEngine = new UnhandledEngine(status); michael@0: if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: // Put it last so that scripts for which we have an engine get tried michael@0: // first. michael@0: fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); michael@0: // If we can't insert it, or creation failed, get rid of it michael@0: if (U_FAILURE(status)) { michael@0: delete fUnhandledBreakEngine; michael@0: fUnhandledBreakEngine = 0; michael@0: return NULL; michael@0: } michael@0: } michael@0: michael@0: // Tell the reject engine about the character; at its discretion, it may michael@0: // add more than just the one character. michael@0: fUnhandledBreakEngine->handleCharacter(c, fBreakType); michael@0: michael@0: return fUnhandledBreakEngine; michael@0: } michael@0: michael@0: michael@0: michael@0: /*int32_t RuleBasedBreakIterator::getBreakType() const { michael@0: return fBreakType; michael@0: }*/ michael@0: michael@0: void RuleBasedBreakIterator::setBreakType(int32_t type) { michael@0: fBreakType = type; michael@0: reset(); michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_BREAK_ITERATION */