The Tor Browser: intl/icu/source/common/rbbi.cpp@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*

     2 ***************************************************************************

     3 *   Copyright (C) 1999-2013 International Business Machines Corporation

     4 *   and others. All rights reserved.

     5 ***************************************************************************

     6 */

     7 //

     8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator

     9 //                   runtime engine and the API implementation for

    10 //                   class RuleBasedBreakIterator

    11 //

    13 #include "utypeinfo.h"  // for 'typeid' to work

    15 #include "unicode/utypes.h"

    17 #if !UCONFIG_NO_BREAK_ITERATION

    19 #include "unicode/rbbi.h"

    20 #include "unicode/schriter.h"

    21 #include "unicode/uchriter.h"

    22 #include "unicode/udata.h"

    23 #include "unicode/uclean.h"

    24 #include "rbbidata.h"

    25 #include "rbbirb.h"

    26 #include "cmemory.h"

    27 #include "cstring.h"

    28 #include "umutex.h"

    29 #include "ucln_cmn.h"

    30 #include "brkeng.h"

    32 #include "uassert.h"

    33 #include "uvector.h"

    35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.

    36 #if U_LOCAL_SERVICE_HOOK

    37 #include "localsvc.h"

    38 #endif

    40 #ifdef RBBI_DEBUG

    41 static UBool fTrace = FALSE;

    42 #endif

    44 U_NAMESPACE_BEGIN

    46 // The state number of the starting state

    47 #define START_STATE 1

    49 // The state-transition value indicating "stop"

    50 #define STOP_STATE  0

    53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)

    56 //=======================================================================

    57 // constructors

    58 //=======================================================================

    60 /**

    61  * Constructs a RuleBasedBreakIterator that uses the already-created

    62  * tables object that is passed in as a parameter.

    63  */

    64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)

    65 {

    66     init();

    67     fData = new RBBIDataWrapper(data, status); // status checked in constructor

    68     if (U_FAILURE(status)) {return;}

    69     if(fData == 0) {

    70         status = U_MEMORY_ALLOCATION_ERROR;

    71         return;

    72     }

    73 }

    75 /**

    76  * Same as above but does not adopt memory

    77  */

    78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)

    79 {

    80     init();

    81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor

    82     if (U_FAILURE(status)) {return;}

    83     if(fData == 0) {

    84         status = U_MEMORY_ALLOCATION_ERROR;

    85         return;

    86     }

    87 }

    90 //

    91 //  Construct from precompiled binary rules (tables).  This constructor is public API,

    92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().

    93 //

    94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,

    95                        uint32_t       ruleLength,

    96                        UErrorCode     &status) {

    97     init();

    98     if (U_FAILURE(status)) {

    99         return;

   100     }

   101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {

   102         status = U_ILLEGAL_ARGUMENT_ERROR;

   103         return;

   104     }

   105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;

   106     if (data->fLength > ruleLength) {

   107         status = U_ILLEGAL_ARGUMENT_ERROR;

   108         return;

   109     }

   110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);

   111     if (U_FAILURE(status)) {return;}

   112     if(fData == 0) {

   113         status = U_MEMORY_ALLOCATION_ERROR;

   114         return;

   115     }

   116 }

   119 //-------------------------------------------------------------------------------

   120 //

   121 //   Constructor   from a UDataMemory handle to precompiled break rules

   122 //                 stored in an ICU data file.

   123 //

   124 //-------------------------------------------------------------------------------

   125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)

   126 {

   127     init();

   128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor

   129     if (U_FAILURE(status)) {return;}

   130     if(fData == 0) {

   131         status = U_MEMORY_ALLOCATION_ERROR;

   132         return;

   133     }

   134 }

   138 //-------------------------------------------------------------------------------

   139 //

   140 //   Constructor       from a set of rules supplied as a string.

   141 //

   142 //-------------------------------------------------------------------------------

   143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,

   144                                                 UParseError          &parseError,

   145                                                 UErrorCode           &status)

   146 {

   147     init();

   148     if (U_FAILURE(status)) {return;}

   149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)

   150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);

   151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that

   152     //        creates and returns a complete RBBI.  From here, in a constructor, we

   153     //        can't just return the object created by the builder factory, hence

   154     //        the assignment of the factory created object to "this".

   155     if (U_SUCCESS(status)) {

   156         *this = *bi;

   157         delete bi;

   158     }

   159 }

   162 //-------------------------------------------------------------------------------

   163 //

   164 // Default Constructor.      Create an empty shell that can be set up later.

   165 //                           Used when creating a RuleBasedBreakIterator from a set

   166 //                           of rules.

   167 //-------------------------------------------------------------------------------

   168 RuleBasedBreakIterator::RuleBasedBreakIterator() {

   169     init();

   170 }

   173 //-------------------------------------------------------------------------------

   174 //

   175 //   Copy constructor.  Will produce a break iterator with the same behavior,

   176 //                      and which iterates over the same text, as the one passed in.

   177 //

   178 //-------------------------------------------------------------------------------

   179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)

   180 : BreakIterator(other)

   181 {

   182     this->init();

   183     *this = other;

   184 }

   187 /**

   188  * Destructor

   189  */

   190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {

   191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

   192         // fCharIter was adopted from the outside.

   193         delete fCharIter;

   194     }

   195     fCharIter = NULL;

   196     delete fSCharIter;

   197     fCharIter = NULL;

   198     delete fDCharIter;

   199     fDCharIter = NULL;

   201     utext_close(fText);

   203     if (fData != NULL) {

   204         fData->removeReference();

   205         fData = NULL;

   206     }

   207     if (fCachedBreakPositions) {

   208         uprv_free(fCachedBreakPositions);

   209         fCachedBreakPositions = NULL;

   210     }

   211     if (fLanguageBreakEngines) {

   212         delete fLanguageBreakEngines;

   213         fLanguageBreakEngines = NULL;

   214     }

   215     if (fUnhandledBreakEngine) {

   216         delete fUnhandledBreakEngine;

   217         fUnhandledBreakEngine = NULL;

   218     }

   219 }

   221 /**

   222  * Assignment operator.  Sets this iterator to have the same behavior,

   223  * and iterate over the same text, as the one passed in.

   224  */

   225 RuleBasedBreakIterator&

   226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {

   227     if (this == &that) {

   228         return *this;

   229     }

   230     reset();    // Delete break cache information

   231     fBreakType = that.fBreakType;

   232     if (fLanguageBreakEngines != NULL) {

   233         delete fLanguageBreakEngines;

   234         fLanguageBreakEngines = NULL;   // Just rebuild for now

   235     }

   236     // TODO: clone fLanguageBreakEngines from "that"

   237     UErrorCode status = U_ZERO_ERROR;

   238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);

   240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

   241         delete fCharIter;

   242     }

   243     fCharIter = NULL;

   245     if (that.fCharIter != NULL ) {

   246         // This is a little bit tricky - it will intially appear that

   247         //  this->fCharIter is adopted, even if that->fCharIter was

   248         //  not adopted.  That's ok.

   249         fCharIter = that.fCharIter->clone();

   250     }

   252     if (fData != NULL) {

   253         fData->removeReference();

   254         fData = NULL;

   255     }

   256     if (that.fData != NULL) {

   257         fData = that.fData->addReference();

   258     }

   260     return *this;

   261 }

   265 //-----------------------------------------------------------------------------

   266 //

   267 //    init()      Shared initialization routine.   Used by all the constructors.

   268 //                Initializes all fields, leaving the object in a consistent state.

   269 //

   270 //-----------------------------------------------------------------------------

   271 void RuleBasedBreakIterator::init() {

   272     UErrorCode  status    = U_ZERO_ERROR;

   273     fText                 = utext_openUChars(NULL, NULL, 0, &status);

   274     fCharIter             = NULL;

   275     fSCharIter            = NULL;

   276     fDCharIter            = NULL;

   277     fData                 = NULL;

   278     fLastRuleStatusIndex  = 0;

   279     fLastStatusIndexValid = TRUE;

   280     fDictionaryCharCount  = 0;

   281     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable

   282                                         //   dictionary behavior for Break Iterators that are

   283                                         //   built from rules.  Even better would be the ability to

   284                                         //   declare the type in the rules.

   286     fCachedBreakPositions    = NULL;

   287     fLanguageBreakEngines    = NULL;

   288     fUnhandledBreakEngine    = NULL;

   289     fNumCachedBreakPositions = 0;

   290     fPositionInCache         = 0;

   292 #ifdef RBBI_DEBUG

   293     static UBool debugInitDone = FALSE;

   294     if (debugInitDone == FALSE) {

   295         char *debugEnv = getenv("U_RBBIDEBUG");

   296         if (debugEnv && uprv_strstr(debugEnv, "trace")) {

   297             fTrace = TRUE;

   298         }

   299         debugInitDone = TRUE;

   300     }

   301 #endif

   302 }

   306 //-----------------------------------------------------------------------------

   307 //

   308 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same

   309 //            behavior, and iterating over the same text, as this one.

   310 //            Virtual function: does the right thing with subclasses.

   311 //

   312 //-----------------------------------------------------------------------------

   313 BreakIterator*

   314 RuleBasedBreakIterator::clone(void) const {

   315     return new RuleBasedBreakIterator(*this);

   316 }

   318 /**

   319  * Equality operator.  Returns TRUE if both BreakIterators are of the

   320  * same class, have the same behavior, and iterate over the same text.

   321  */

   322 UBool

   323 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {

   324     if (typeid(*this) != typeid(that)) {

   325         return FALSE;

   326     }

   328     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;

   330     if (!utext_equals(fText, that2.fText)) {

   331         // The two break iterators are operating on different text,

   332         //   or have a different interation position.

   333         return FALSE;

   334     };

   336     // TODO:  need a check for when in a dictionary region at different offsets.

   338     if (that2.fData == fData ||

   339         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {

   340             // The two break iterators are using the same rules.

   341             return TRUE;

   342         }

   343     return FALSE;

   344 }

   346 /**

   347  * Compute a hash code for this BreakIterator

   348  * @return A hash code

   349  */

   350 int32_t

   351 RuleBasedBreakIterator::hashCode(void) const {

   352     int32_t   hash = 0;

   353     if (fData != NULL) {

   354         hash = fData->hashCode();

   355     }

   356     return hash;

   357 }

   360 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {

   361     if (U_FAILURE(status)) {

   362         return;

   363     }

   364     reset();

   365     fText = utext_clone(fText, ut, FALSE, TRUE, &status);

   367     // Set up a dummy CharacterIterator to be returned if anyone

   368     //   calls getText().  With input from UText, there is no reasonable

   369     //   way to return a characterIterator over the actual input text.

   370     //   Return one over an empty string instead - this is the closest

   371     //   we can come to signaling a failure.

   372     //   (GetText() is obsolete, this failure is sort of OK)

   373     if (fDCharIter == NULL) {

   374         static const UChar c = 0;

   375         fDCharIter = new UCharCharacterIterator(&c, 0);

   376         if (fDCharIter == NULL) {

   377             status = U_MEMORY_ALLOCATION_ERROR;

   378             return;

   379         }

   380     }

   382     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

   383         // existing fCharIter was adopted from the outside.  Delete it now.

   384         delete fCharIter;

   385     }

   386     fCharIter = fDCharIter;

   388     this->first();

   389 }

   392 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {

   393     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);

   394     return result;

   395 }

   399 /**

   400  * Returns the description used to create this iterator

   401  */

   402 const UnicodeString&

   403 RuleBasedBreakIterator::getRules() const {

   404     if (fData != NULL) {

   405         return fData->getRuleSourceString();

   406     } else {

   407         static const UnicodeString *s;

   408         if (s == NULL) {

   409             // TODO:  something more elegant here.

   410             //        perhaps API should return the string by value.

   411             //        Note:  thread unsafe init & leak are semi-ok, better than

   412             //               what was before.  Sould be cleaned up, though.

   413             s = new UnicodeString;

   414         }

   415         return *s;

   416     }

   417 }

   419 //=======================================================================

   420 // BreakIterator overrides

   421 //=======================================================================

   423 /**

   424  * Return a CharacterIterator over the text being analyzed.

   425  */

   426 CharacterIterator&

   427 RuleBasedBreakIterator::getText() const {

   428     return *fCharIter;

   429 }

   431 /**

   432  * Set the iterator to analyze a new piece of text.  This function resets

   433  * the current iteration position to the beginning of the text.

   434  * @param newText An iterator over the text to analyze.

   435  */

   436 void

   437 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {

   438     // If we are holding a CharacterIterator adopted from a

   439     //   previous call to this function, delete it now.

   440     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

   441         delete fCharIter;

   442     }

   444     fCharIter = newText;

   445     UErrorCode status = U_ZERO_ERROR;

   446     reset();

   447     if (newText==NULL || newText->startIndex() != 0) {

   448         // startIndex !=0 wants to be an error, but there's no way to report it.

   449         // Make the iterator text be an empty string.

   450         fText = utext_openUChars(fText, NULL, 0, &status);

   451     } else {

   452         fText = utext_openCharacterIterator(fText, newText, &status);

   453     }

   454     this->first();

   455 }

   457 /**

   458  * Set the iterator to analyze a new piece of text.  This function resets

   459  * the current iteration position to the beginning of the text.

   460  * @param newText An iterator over the text to analyze.

   461  */

   462 void

   463 RuleBasedBreakIterator::setText(const UnicodeString& newText) {

   464     UErrorCode status = U_ZERO_ERROR;

   465     reset();

   466     fText = utext_openConstUnicodeString(fText, &newText, &status);

   468     // Set up a character iterator on the string.

   469     //   Needed in case someone calls getText().

   470     //  Can not, unfortunately, do this lazily on the (probably never)

   471     //  call to getText(), because getText is const.

   472     if (fSCharIter == NULL) {

   473         fSCharIter = new StringCharacterIterator(newText);

   474     } else {

   475         fSCharIter->setText(newText);

   476     }

   478     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {

   479         // old fCharIter was adopted from the outside.  Delete it.

   480         delete fCharIter;

   481     }

   482     fCharIter = fSCharIter;

   484     this->first();

   485 }

   488 /**

   489  *  Provide a new UText for the input text.  Must reference text with contents identical

   490  *  to the original.

   491  *  Intended for use with text data originating in Java (garbage collected) environments

   492  *  where the data may be moved in memory at arbitrary times.

   493  */

   494 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {

   495     if (U_FAILURE(status)) {

   496         return *this;

   497     }

   498     if (input == NULL) {

   499         status = U_ILLEGAL_ARGUMENT_ERROR;

   500         return *this;

   501     }

   502     int64_t pos = utext_getNativeIndex(fText);

   503     //  Shallow read-only clone of the new UText into the existing input UText

   504     fText = utext_clone(fText, input, FALSE, TRUE, &status);

   505     if (U_FAILURE(status)) {

   506         return *this;

   507     }

   508     utext_setNativeIndex(fText, pos);

   509     if (utext_getNativeIndex(fText) != pos) {

   510         // Sanity check.  The new input utext is supposed to have the exact same

   511         // contents as the old.  If we can't set to the same position, it doesn't.

   512         // The contents underlying the old utext might be invalid at this point,

   513         // so it's not safe to check directly.

   514         status = U_ILLEGAL_ARGUMENT_ERROR;

   515     }

   516     return *this;

   517 }

   520 /**

   521  * Sets the current iteration position to the beginning of the text.

   522  * @return The offset of the beginning of the text.

   523  */

   524 int32_t RuleBasedBreakIterator::first(void) {

   525     reset();

   526     fLastRuleStatusIndex  = 0;

   527     fLastStatusIndexValid = TRUE;

   528     //if (fText == NULL)

   529     //    return BreakIterator::DONE;

   531     utext_setNativeIndex(fText, 0);

   532     return 0;

   533 }

   535 /**

   536  * Sets the current iteration position to the end of the text.

   537  * @return The text's past-the-end offset.

   538  */

   539 int32_t RuleBasedBreakIterator::last(void) {

   540     reset();

   541     if (fText == NULL) {

   542         fLastRuleStatusIndex  = 0;

   543         fLastStatusIndexValid = TRUE;

   544         return BreakIterator::DONE;

   545     }

   547     fLastStatusIndexValid = FALSE;

   548     int32_t pos = (int32_t)utext_nativeLength(fText);

   549     utext_setNativeIndex(fText, pos);

   550     return pos;

   551 }

   553 /**

   554  * Advances the iterator either forward or backward the specified number of steps.

   555  * Negative values move backward, and positive values move forward.  This is

   556  * equivalent to repeatedly calling next() or previous().

   557  * @param n The number of steps to move.  The sign indicates the direction

   558  * (negative is backwards, and positive is forwards).

   559  * @return The character offset of the boundary position n boundaries away from

   560  * the current one.

   561  */

   562 int32_t RuleBasedBreakIterator::next(int32_t n) {

   563     int32_t result = current();

   564     while (n > 0) {

   565         result = next();

   566         --n;

   567     }

   568     while (n < 0) {

   569         result = previous();

   570         ++n;

   571     }

   572     return result;

   573 }

   575 /**

   576  * Advances the iterator to the next boundary position.

   577  * @return The position of the first boundary after this one.

   578  */

   579 int32_t RuleBasedBreakIterator::next(void) {

   580     // if we have cached break positions and we're still in the range

   581     // covered by them, just move one step forward in the cache

   582     if (fCachedBreakPositions != NULL) {

   583         if (fPositionInCache < fNumCachedBreakPositions - 1) {

   584             ++fPositionInCache;

   585             int32_t pos = fCachedBreakPositions[fPositionInCache];

   586             utext_setNativeIndex(fText, pos);

   587             return pos;

   588         }

   589         else {

   590             reset();

   591         }

   592     }

   594     int32_t startPos = current();

   595     int32_t result = handleNext(fData->fForwardTable);

   596     if (fDictionaryCharCount > 0) {

   597         result = checkDictionary(startPos, result, FALSE);

   598     }

   599     return result;

   600 }

   602 /**

   603  * Advances the iterator backwards, to the last boundary preceding this one.

   604  * @return The position of the last boundary position preceding this one.

   605  */

   606 int32_t RuleBasedBreakIterator::previous(void) {

   607     int32_t result;

   608     int32_t startPos;

   610     // if we have cached break positions and we're still in the range

   611     // covered by them, just move one step backward in the cache

   612     if (fCachedBreakPositions != NULL) {

   613         if (fPositionInCache > 0) {

   614             --fPositionInCache;

   615             // If we're at the beginning of the cache, need to reevaluate the

   616             // rule status

   617             if (fPositionInCache <= 0) {

   618                 fLastStatusIndexValid = FALSE;

   619             }

   620             int32_t pos = fCachedBreakPositions[fPositionInCache];

   621             utext_setNativeIndex(fText, pos);

   622             return pos;

   623         }

   624         else {

   625             reset();

   626         }

   627     }

   629     // if we're already sitting at the beginning of the text, return DONE

   630     if (fText == NULL || (startPos = current()) == 0) {

   631         fLastRuleStatusIndex  = 0;

   632         fLastStatusIndexValid = TRUE;

   633         return BreakIterator::DONE;

   634     }

   636     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {

   637         result = handlePrevious(fData->fReverseTable);

   638         if (fDictionaryCharCount > 0) {

   639             result = checkDictionary(result, startPos, TRUE);

   640         }

   641         return result;

   642     }

   644     // old rule syntax

   645     // set things up.  handlePrevious() will back us up to some valid

   646     // break position before the current position (we back our internal

   647     // iterator up one step to prevent handlePrevious() from returning

   648     // the current position), but not necessarily the last one before

   650     // where we started

   652     int32_t start = current();

   654     (void)UTEXT_PREVIOUS32(fText);

   655     int32_t lastResult    = handlePrevious(fData->fReverseTable);

   656     if (lastResult == UBRK_DONE) {

   657         lastResult = 0;

   658         utext_setNativeIndex(fText, 0);

   659     }

   660     result = lastResult;

   661     int32_t lastTag       = 0;

   662     UBool   breakTagValid = FALSE;

   664     // iterate forward from the known break position until we pass our

   665     // starting point.  The last break position before the starting

   666     // point is our return value

   668     for (;;) {

   669         result         = next();

   670         if (result == BreakIterator::DONE || result >= start) {

   671             break;

   672         }

   673         lastResult     = result;

   674         lastTag        = fLastRuleStatusIndex;

   675         breakTagValid  = TRUE;

   676     }

   678     // fLastBreakTag wants to have the value for section of text preceding

   679     // the result position that we are to return (in lastResult.)  If

   680     // the backwards rules overshot and the above loop had to do two or more

   681     // next()s to move up to the desired return position, we will have a valid

   682     // tag value. But, if handlePrevious() took us to exactly the correct result positon,

   683     // we wont have a tag value for that position, which is only set by handleNext().

   685     // set the current iteration position to be the last break position

   686     // before where we started, and then return that value

   687     utext_setNativeIndex(fText, lastResult);

   688     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()

   689     fLastStatusIndexValid = breakTagValid;

   691     // No need to check the dictionary; it will have been handled by

   692     // next()

   694     return lastResult;

   695 }

   697 /**

   698  * Sets the iterator to refer to the first boundary position following

   699  * the specified position.

   700  * @offset The position from which to begin searching for a break position.

   701  * @return The position of the first break after the current position.

   702  */

   703 int32_t RuleBasedBreakIterator::following(int32_t offset) {

   704     // if we have cached break positions and offset is in the range

   705     // covered by them, use them

   706     // TODO: could use binary search

   707     // TODO: what if offset is outside range, but break is not?

   708     if (fCachedBreakPositions != NULL) {

   709         if (offset >= fCachedBreakPositions[0]

   710                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {

   711             fPositionInCache = 0;

   712             // We are guaranteed not to leave the array due to range test above

   713             while (offset >= fCachedBreakPositions[fPositionInCache]) {

   714                 ++fPositionInCache;

   715             }

   716             int32_t pos = fCachedBreakPositions[fPositionInCache];

   717             utext_setNativeIndex(fText, pos);

   718             return pos;

   719         }

   720         else {

   721             reset();

   722         }

   723     }

   725     // if the offset passed in is already past the end of the text,

   726     // just return DONE; if it's before the beginning, return the

   727     // text's starting offset

   728     fLastRuleStatusIndex  = 0;

   729     fLastStatusIndexValid = TRUE;

   730     if (fText == NULL || offset >= utext_nativeLength(fText)) {

   731         last();

   732         return next();

   733     }

   734     else if (offset < 0) {

   735         return first();

   736     }

   738     // otherwise, set our internal iteration position (temporarily)

   739     // to the position passed in.  If this is the _beginning_ position,

   740     // then we can just use next() to get our return value

   742     int32_t result = 0;

   744     if (fData->fSafeRevTable != NULL) {

   745         // new rule syntax

   746         utext_setNativeIndex(fText, offset);

   747         // move forward one codepoint to prepare for moving back to a

   748         // safe point.

   749         // this handles offset being between a supplementary character

   750         (void)UTEXT_NEXT32(fText);

   751         // handlePrevious will move most of the time to < 1 boundary away

   752         handlePrevious(fData->fSafeRevTable);

   753         int32_t result = next();

   754         while (result <= offset) {

   755             result = next();

   756         }

   757         return result;

   758     }

   759     if (fData->fSafeFwdTable != NULL) {

   760         // backup plan if forward safe table is not available

   761         utext_setNativeIndex(fText, offset);

   762         (void)UTEXT_PREVIOUS32(fText);

   763         // handle next will give result >= offset

   764         handleNext(fData->fSafeFwdTable);

   765         // previous will give result 0 or 1 boundary away from offset,

   766         // most of the time

   767         // we have to

   768         int32_t oldresult = previous();

   769         while (oldresult > offset) {

   770             int32_t result = previous();

   771             if (result <= offset) {

   772                 return oldresult;

   773             }

   774             oldresult = result;

   775         }

   776         int32_t result = next();

   777         if (result <= offset) {

   778             return next();

   779         }

   780         return result;

   781     }

   782     // otherwise, we have to sync up first.  Use handlePrevious() to back

   783     // up to a known break position before the specified position (if

   784     // we can determine that the specified position is a break position,

   785     // we don't back up at all).  This may or may not be the last break

   786     // position at or before our starting position.  Advance forward

   787     // from here until we've passed the starting position.  The position

   788     // we stop on will be the first break position after the specified one.

   789     // old rule syntax

   791     utext_setNativeIndex(fText, offset);

   792     if (offset==0 ||

   793         (offset==1  && utext_getNativeIndex(fText)==0)) {

   794         return next();

   795     }

   796     result = previous();

   798     while (result != BreakIterator::DONE && result <= offset) {

   799         result = next();

   800     }

   802     return result;

   803 }

   805 /**

   806  * Sets the iterator to refer to the last boundary position before the

   807  * specified position.

   808  * @offset The position to begin searching for a break from.

   809  * @return The position of the last boundary before the starting position.

   810  */

   811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {

   812     // if we have cached break positions and offset is in the range

   813     // covered by them, use them

   814     if (fCachedBreakPositions != NULL) {

   815         // TODO: binary search?

   816         // TODO: What if offset is outside range, but break is not?

   817         if (offset > fCachedBreakPositions[0]

   818                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {

   819             fPositionInCache = 0;

   820             while (fPositionInCache < fNumCachedBreakPositions

   821                    && offset > fCachedBreakPositions[fPositionInCache])

   822                 ++fPositionInCache;

   823             --fPositionInCache;

   824             // If we're at the beginning of the cache, need to reevaluate the

   825             // rule status

   826             if (fPositionInCache <= 0) {

   827                 fLastStatusIndexValid = FALSE;

   828             }

   829             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);

   830             return fCachedBreakPositions[fPositionInCache];

   831         }

   832         else {

   833             reset();

   834         }

   835     }

   837     // if the offset passed in is already past the end of the text,

   838     // just return DONE; if it's before the beginning, return the

   839     // text's starting offset

   840     if (fText == NULL || offset > utext_nativeLength(fText)) {

   841         // return BreakIterator::DONE;

   842         return last();

   843     }

   844     else if (offset < 0) {

   845         return first();

   846     }

   848     // if we start by updating the current iteration position to the

   849     // position specified by the caller, we can just use previous()

   850     // to carry out this operation

   852     if (fData->fSafeFwdTable != NULL) {

   853         // new rule syntax

   854         utext_setNativeIndex(fText, offset);

   855         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);

   856         if (newOffset != offset) {

   857             // Will come here if specified offset was not a code point boundary AND

   858             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary

   859             //   indices to the containing code point.

   860             // For breakitereator::preceding only, these non-code-point indices need to be moved

   861             //   up to refer to the following codepoint.

   862             (void)UTEXT_NEXT32(fText);

   863             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);

   864         }

   866         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,

   867         //        rather than adjusting the position unconditionally?

   868         //        (Change would interact with safe rules.)

   869         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?

   870         //        affects only preceding(), seems cleaner, but is slightly different.

   871         (void)UTEXT_PREVIOUS32(fText);

   872         handleNext(fData->fSafeFwdTable);

   873         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

   874         while (result >= offset) {

   875             result = previous();

   876         }

   877         return result;

   878     }

   879     if (fData->fSafeRevTable != NULL) {

   880         // backup plan if forward safe table is not available

   881         //  TODO:  check whether this path can be discarded

   882         //         It's probably OK to say that rules must supply both safe tables

   883         //            if they use safe tables at all.  We have certainly never described

   884         //            to anyone how to work with just one safe table.

   885         utext_setNativeIndex(fText, offset);

   886         (void)UTEXT_NEXT32(fText);

   888         // handle previous will give result <= offset

   889         handlePrevious(fData->fSafeRevTable);

   891         // next will give result 0 or 1 boundary away from offset,

   892         // most of the time

   893         // we have to

   894         int32_t oldresult = next();

   895         while (oldresult < offset) {

   896             int32_t result = next();

   897             if (result >= offset) {

   898                 return oldresult;

   899             }

   900             oldresult = result;

   901         }

   902         int32_t result = previous();

   903         if (result >= offset) {

   904             return previous();

   905         }

   906         return result;

   907     }

   909     // old rule syntax

   910     utext_setNativeIndex(fText, offset);

   911     return previous();

   912 }

   914 /**

   915  * Returns true if the specfied position is a boundary position.  As a side

   916  * effect, leaves the iterator pointing to the first boundary position at

   917  * or after "offset".

   918  * @param offset the offset to check.

   919  * @return True if "offset" is a boundary position.

   920  */

   921 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {

   922     // the beginning index of the iterator is always a boundary position by definition

   923     if (offset == 0) {

   924         first();       // For side effects on current position, tag values.

   925         return TRUE;

   926     }

   928     if (offset == (int32_t)utext_nativeLength(fText)) {

   929         last();       // For side effects on current position, tag values.

   930         return TRUE;

   931     }

   933     // out-of-range indexes are never boundary positions

   934     if (offset < 0) {

   935         first();       // For side effects on current position, tag values.

   936         return FALSE;

   937     }

   939     if (offset > utext_nativeLength(fText)) {

   940         last();        // For side effects on current position, tag values.

   941         return FALSE;

   942     }

   944     // otherwise, we can use following() on the position before the specified

   945     // one and return true if the position we get back is the one the user

   946     // specified

   947     utext_previous32From(fText, offset);

   948     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);

   949     UBool    result  = following(backOne) == offset;

   950     return result;

   951 }

   953 /**

   954  * Returns the current iteration position.

   955  * @return The current iteration position.

   956  */

   957 int32_t RuleBasedBreakIterator::current(void) const {

   958     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);

   959     return pos;

   960 }

   962 //=======================================================================

   963 // implementation

   964 //=======================================================================

   966 //

   967 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end

   968 //                 of user text.  A variable with this enum type keeps track of where we

   969 //                 are.  The state machine only fetches user input while in the RUN mode.

   970 //

   971 enum RBBIRunMode {

   972     RBBI_START,     // state machine processing is before first char of input

   973     RBBI_RUN,       // state machine processing is in the user text

   974     RBBI_END        // state machine processing is after end of user text.

   975 };

   978 //-----------------------------------------------------------------------------------

   979 //

   980 //  handleNext(stateTable)

   981 //     This method is the actual implementation of the rbbi next() method.

   982 //     This method initializes the state machine to state 1

   983 //     and advances through the text character by character until we reach the end

   984 //     of the text or the state machine transitions to state 0.  We update our return

   985 //     value every time the state machine passes through an accepting state.

   986 //

   987 //-----------------------------------------------------------------------------------

   988 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {

   989     int32_t             state;

   990     uint16_t            category        = 0;

   991     RBBIRunMode         mode;

   993     RBBIStateTableRow  *row;

   994     UChar32             c;

   995     int32_t             lookaheadStatus = 0;

   996     int32_t             lookaheadTagIdx = 0;

   997     int32_t             result          = 0;

   998     int32_t             initialPosition = 0;

   999     int32_t             lookaheadResult = 0;

  1000     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

  1001     const char         *tableData       = statetable->fTableData;

  1002     uint32_t            tableRowLen     = statetable->fRowLen;

  1004     #ifdef RBBI_DEBUG

  1005         if (fTrace) {

  1006             RBBIDebugPuts("Handle Next   pos   char  state category");

  1007         }

  1008     #endif

  1010     // No matter what, handleNext alway correctly sets the break tag value.

  1011     fLastStatusIndexValid = TRUE;

  1012     fLastRuleStatusIndex = 0;

  1014     // if we're already at the end of the text, return DONE.

  1015     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1016     result          = initialPosition;

  1017     c               = UTEXT_NEXT32(fText);

  1018     if (fData == NULL || c==U_SENTINEL) {

  1019         return BreakIterator::DONE;

  1020     }

  1022     //  Set the initial state for the state machine

  1023     state = START_STATE;

  1024     row = (RBBIStateTableRow *)

  1025             //(statetable->fTableData + (statetable->fRowLen * state));

  1026             (tableData + tableRowLen * state);

  1029     mode     = RBBI_RUN;

  1030     if (statetable->fFlags & RBBI_BOF_REQUIRED) {

  1031         category = 2;

  1032         mode     = RBBI_START;

  1033     }

  1036     // loop until we reach the end of the text or transition to state 0

  1037     //

  1038     for (;;) {

  1039         if (c == U_SENTINEL) {

  1040             // Reached end of input string.

  1041             if (mode == RBBI_END) {

  1042                 // We have already run the loop one last time with the

  1043                 //   character set to the psueudo {eof} value.  Now it is time

  1044                 //   to unconditionally bail out.

  1045                 if (lookaheadResult > result) {

  1046                     // We ran off the end of the string with a pending look-ahead match.

  1047                     // Treat this as if the look-ahead condition had been met, and return

  1048                     //  the match at the / position from the look-ahead rule.

  1049                     result               = lookaheadResult;

  1050                     fLastRuleStatusIndex = lookaheadTagIdx;

  1051                     lookaheadStatus = 0;

  1052                 }

  1053                 break;

  1054             }

  1055             // Run the loop one last time with the fake end-of-input character category.

  1056             mode = RBBI_END;

  1057             category = 1;

  1058         }

  1060         //

  1061         // Get the char category.  An incoming category of 1 or 2 means that

  1062         //      we are preset for doing the beginning or end of input, and

  1063         //      that we shouldn't get a category from an actual text input character.

  1064         //

  1065         if (mode == RBBI_RUN) {

  1066             // look up the current character's character category, which tells us

  1067             // which column in the state table to look at.

  1068             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,

  1069             //        not the size of the character going in, which is a UChar32.

  1070             //

  1071             UTRIE_GET16(&fData->fTrie, c, category);

  1073             // Check the dictionary bit in the character's category.

  1074             //    Counter is only used by dictionary based iterators (subclasses).

  1075             //    Chars that need to be handled by a dictionary have a flag bit set

  1076             //    in their category values.

  1077             //

  1078             if ((category & 0x4000) != 0)  {

  1079                 fDictionaryCharCount++;

  1080                 //  And off the dictionary flag bit.

  1081                 category &= ~0x4000;

  1082             }

  1083         }

  1085        #ifdef RBBI_DEBUG

  1086             if (fTrace) {

  1087                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));

  1088                 if (0x20<=c && c<0x7f) {

  1089                     RBBIDebugPrintf("\"%c\"  ", c);

  1090                 } else {

  1091                     RBBIDebugPrintf("%5x  ", c);

  1092                 }

  1093                 RBBIDebugPrintf("%3d  %3d\n", state, category);

  1094             }

  1095         #endif

  1097         // State Transition - move machine to its next state

  1098         //

  1100         // Note: fNextState is defined as uint16_t[2], but we are casting

  1101         // a generated RBBI table to RBBIStateTableRow and some tables

  1102         // actually have more than 2 categories.

  1103         U_ASSERT(category<fData->fHeader->fCatCount);

  1104         state = row->fNextState[category];  /*Not accessing beyond memory*/

  1105         row = (RBBIStateTableRow *)

  1106             // (statetable->fTableData + (statetable->fRowLen * state));

  1107             (tableData + tableRowLen * state);

  1110         if (row->fAccepting == -1) {

  1111             // Match found, common case.

  1112             if (mode != RBBI_START) {

  1113                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1114             }

  1115             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.

  1116         }

  1118         if (row->fLookAhead != 0) {

  1119             if (lookaheadStatus != 0

  1120                 && row->fAccepting == lookaheadStatus) {

  1121                 // Lookahead match is completed.

  1122                 result               = lookaheadResult;

  1123                 fLastRuleStatusIndex = lookaheadTagIdx;

  1124                 lookaheadStatus      = 0;

  1125                 // TODO:  make a standalone hard break in a rule work.

  1126                 if (lookAheadHardBreak) {

  1127                     UTEXT_SETNATIVEINDEX(fText, result);

  1128                     return result;

  1129                 }

  1130                 // Look-ahead completed, but other rules may match further.  Continue on

  1131                 //  TODO:  junk this feature?  I don't think it's used anywhwere.

  1132                 goto continueOn;

  1133             }

  1135             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1136             lookaheadResult = r;

  1137             lookaheadStatus = row->fLookAhead;

  1138             lookaheadTagIdx = row->fTagIdx;

  1139             goto continueOn;

  1140         }

  1143         if (row->fAccepting != 0) {

  1144             // Because this is an accepting state, any in-progress look-ahead match

  1145             //   is no longer relavant.  Clear out the pending lookahead status.

  1146             lookaheadStatus = 0;           // clear out any pending look-ahead match.

  1147         }

  1149 continueOn:

  1150         if (state == STOP_STATE) {

  1151             // This is the normal exit from the lookup state machine.

  1152             // We have advanced through the string until it is certain that no

  1153             //   longer match is possible, no matter what characters follow.

  1154             break;

  1155         }

  1157         // Advance to the next character.

  1158         // If this is a beginning-of-input loop iteration, don't advance

  1159         //    the input position.  The next iteration will be processing the

  1160         //    first real input character.

  1161         if (mode == RBBI_RUN) {

  1162             c = UTEXT_NEXT32(fText);

  1163         } else {

  1164             if (mode == RBBI_START) {

  1165                 mode = RBBI_RUN;

  1166             }

  1167         }

  1170     }

  1172     // The state machine is done.  Check whether it found a match...

  1174     // If the iterator failed to advance in the match engine, force it ahead by one.

  1175     //   (This really indicates a defect in the break rules.  They should always match

  1176     //    at least one character.)

  1177     if (result == initialPosition) {

  1178         UTEXT_SETNATIVEINDEX(fText, initialPosition);

  1179         UTEXT_NEXT32(fText);

  1180         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1181     }

  1183     // Leave the iterator at our result position.

  1184     UTEXT_SETNATIVEINDEX(fText, result);

  1185     #ifdef RBBI_DEBUG

  1186         if (fTrace) {

  1187             RBBIDebugPrintf("result = %d\n\n", result);

  1188         }

  1189     #endif

  1190     return result;

  1191 }

  1195 //-----------------------------------------------------------------------------------

  1196 //

  1197 //  handlePrevious()

  1198 //

  1199 //      Iterate backwards, according to the logic of the reverse rules.

  1200 //      This version handles the exact style backwards rules.

  1201 //

  1202 //      The logic of this function is very similar to handleNext(), above.

  1203 //

  1204 //-----------------------------------------------------------------------------------

  1205 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {

  1206     int32_t             state;

  1207     uint16_t            category        = 0;

  1208     RBBIRunMode         mode;

  1209     RBBIStateTableRow  *row;

  1210     UChar32             c;

  1211     int32_t             lookaheadStatus = 0;

  1212     int32_t             result          = 0;

  1213     int32_t             initialPosition = 0;

  1214     int32_t             lookaheadResult = 0;

  1215     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

  1217     #ifdef RBBI_DEBUG

  1218         if (fTrace) {

  1219             RBBIDebugPuts("Handle Previous   pos   char  state category");

  1220         }

  1221     #endif

  1223     // handlePrevious() never gets the rule status.

  1224     // Flag the status as invalid; if the user ever asks for status, we will need

  1225     // to back up, then re-find the break position using handleNext(), which does

  1226     // get the status value.

  1227     fLastStatusIndexValid = FALSE;

  1228     fLastRuleStatusIndex = 0;

  1230     // if we're already at the start of the text, return DONE.

  1231     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {

  1232         return BreakIterator::DONE;

  1233     }

  1235     //  Set up the starting char.

  1236     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1237     result          = initialPosition;

  1238     c               = UTEXT_PREVIOUS32(fText);

  1240     //  Set the initial state for the state machine

  1241     state = START_STATE;

  1242     row = (RBBIStateTableRow *)

  1243             (statetable->fTableData + (statetable->fRowLen * state));

  1244     category = 3;

  1245     mode     = RBBI_RUN;

  1246     if (statetable->fFlags & RBBI_BOF_REQUIRED) {

  1247         category = 2;

  1248         mode     = RBBI_START;

  1249     }

  1252     // loop until we reach the start of the text or transition to state 0

  1253     //

  1254     for (;;) {

  1255         if (c == U_SENTINEL) {

  1256             // Reached end of input string.

  1257             if (mode == RBBI_END) {

  1258                 // We have already run the loop one last time with the

  1259                 //   character set to the psueudo {eof} value.  Now it is time

  1260                 //   to unconditionally bail out.

  1261                 if (lookaheadResult < result) {

  1262                     // We ran off the end of the string with a pending look-ahead match.

  1263                     // Treat this as if the look-ahead condition had been met, and return

  1264                     //  the match at the / position from the look-ahead rule.

  1265                     result               = lookaheadResult;

  1266                     lookaheadStatus = 0;

  1267                 } else if (result == initialPosition) {

  1268                     // Ran off start, no match found.

  1269                     // move one index one (towards the start, since we are doing a previous())

  1270                     UTEXT_SETNATIVEINDEX(fText, initialPosition);

  1271                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.

  1272                 }

  1273                 break;

  1274             }

  1275             // Run the loop one last time with the fake end-of-input character category.

  1276             mode = RBBI_END;

  1277             category = 1;

  1278         }

  1280         //

  1281         // Get the char category.  An incoming category of 1 or 2 means that

  1282         //      we are preset for doing the beginning or end of input, and

  1283         //      that we shouldn't get a category from an actual text input character.

  1284         //

  1285         if (mode == RBBI_RUN) {

  1286             // look up the current character's character category, which tells us

  1287             // which column in the state table to look at.

  1288             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,

  1289             //        not the size of the character going in, which is a UChar32.

  1290             //

  1291             UTRIE_GET16(&fData->fTrie, c, category);

  1293             // Check the dictionary bit in the character's category.

  1294             //    Counter is only used by dictionary based iterators (subclasses).

  1295             //    Chars that need to be handled by a dictionary have a flag bit set

  1296             //    in their category values.

  1297             //

  1298             if ((category & 0x4000) != 0)  {

  1299                 fDictionaryCharCount++;

  1300                 //  And off the dictionary flag bit.

  1301                 category &= ~0x4000;

  1302             }

  1303         }

  1305         #ifdef RBBI_DEBUG

  1306             if (fTrace) {

  1307                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));

  1308                 if (0x20<=c && c<0x7f) {

  1309                     RBBIDebugPrintf("\"%c\"  ", c);

  1310                 } else {

  1311                     RBBIDebugPrintf("%5x  ", c);

  1312                 }

  1313                 RBBIDebugPrintf("%3d  %3d\n", state, category);

  1314             }

  1315         #endif

  1317         // State Transition - move machine to its next state

  1318         //

  1320         // Note: fNextState is defined as uint16_t[2], but we are casting

  1321         // a generated RBBI table to RBBIStateTableRow and some tables

  1322         // actually have more than 2 categories.

  1323         U_ASSERT(category<fData->fHeader->fCatCount);

  1324         state = row->fNextState[category];  /*Not accessing beyond memory*/

  1325         row = (RBBIStateTableRow *)

  1326             (statetable->fTableData + (statetable->fRowLen * state));

  1328         if (row->fAccepting == -1) {

  1329             // Match found, common case.

  1330             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1331         }

  1333         if (row->fLookAhead != 0) {

  1334             if (lookaheadStatus != 0

  1335                 && row->fAccepting == lookaheadStatus) {

  1336                 // Lookahead match is completed.

  1337                 result               = lookaheadResult;

  1338                 lookaheadStatus      = 0;

  1339                 // TODO:  make a standalone hard break in a rule work.

  1340                 if (lookAheadHardBreak) {

  1341                     UTEXT_SETNATIVEINDEX(fText, result);

  1342                     return result;

  1343                 }

  1344                 // Look-ahead completed, but other rules may match further.  Continue on

  1345                 //  TODO:  junk this feature?  I don't think it's used anywhwere.

  1346                 goto continueOn;

  1347             }

  1349             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1350             lookaheadResult = r;

  1351             lookaheadStatus = row->fLookAhead;

  1352             goto continueOn;

  1353         }

  1356         if (row->fAccepting != 0) {

  1357             // Because this is an accepting state, any in-progress look-ahead match

  1358             //   is no longer relavant.  Clear out the pending lookahead status.

  1359             lookaheadStatus = 0;

  1360         }

  1362 continueOn:

  1363         if (state == STOP_STATE) {

  1364             // This is the normal exit from the lookup state machine.

  1365             // We have advanced through the string until it is certain that no

  1366             //   longer match is possible, no matter what characters follow.

  1367             break;

  1368         }

  1370         // Move (backwards) to the next character to process.

  1371         // If this is a beginning-of-input loop iteration, don't advance

  1372         //    the input position.  The next iteration will be processing the

  1373         //    first real input character.

  1374         if (mode == RBBI_RUN) {

  1375             c = UTEXT_PREVIOUS32(fText);

  1376         } else {

  1377             if (mode == RBBI_START) {

  1378                 mode = RBBI_RUN;

  1379             }

  1380         }

  1381     }

  1383     // The state machine is done.  Check whether it found a match...

  1385     // If the iterator failed to advance in the match engine, force it ahead by one.

  1386     //   (This really indicates a defect in the break rules.  They should always match

  1387     //    at least one character.)

  1388     if (result == initialPosition) {

  1389         UTEXT_SETNATIVEINDEX(fText, initialPosition);

  1390         UTEXT_PREVIOUS32(fText);

  1391         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1392     }

  1394     // Leave the iterator at our result position.

  1395     UTEXT_SETNATIVEINDEX(fText, result);

  1396     #ifdef RBBI_DEBUG

  1397         if (fTrace) {

  1398             RBBIDebugPrintf("result = %d\n\n", result);

  1399         }

  1400     #endif

  1401     return result;

  1402 }

  1405 void

  1406 RuleBasedBreakIterator::reset()

  1407 {

  1408     if (fCachedBreakPositions) {

  1409         uprv_free(fCachedBreakPositions);

  1410     }

  1411     fCachedBreakPositions = NULL;

  1412     fNumCachedBreakPositions = 0;

  1413     fDictionaryCharCount = 0;

  1414     fPositionInCache = 0;

  1415 }

  1419 //-------------------------------------------------------------------------------

  1420 //

  1421 //   getRuleStatus()   Return the break rule tag associated with the current

  1422 //                     iterator position.  If the iterator arrived at its current

  1423 //                     position by iterating forwards, the value will have been

  1424 //                     cached by the handleNext() function.

  1425 //

  1426 //                     If no cached status value is available, the status is

  1427 //                     found by doing a previous() followed by a next(), which

  1428 //                     leaves the iterator where it started, and computes the

  1429 //                     status while doing the next().

  1430 //

  1431 //-------------------------------------------------------------------------------

  1432 void RuleBasedBreakIterator::makeRuleStatusValid() {

  1433     if (fLastStatusIndexValid == FALSE) {

  1434         //  No cached status is available.

  1435         if (fText == NULL || current() == 0) {

  1436             //  At start of text, or there is no text.  Status is always zero.

  1437             fLastRuleStatusIndex = 0;

  1438             fLastStatusIndexValid = TRUE;

  1439         } else {

  1440             //  Not at start of text.  Find status the tedious way.

  1441             int32_t pa = current();

  1442             previous();

  1443             if (fNumCachedBreakPositions > 0) {

  1444                 reset();                // Blow off the dictionary cache

  1445             }

  1446             int32_t pb = next();

  1447             if (pa != pb) {

  1448                 // note: the if (pa != pb) test is here only to eliminate warnings for

  1449                 //       unused local variables on gcc.  Logically, it isn't needed.

  1450                 U_ASSERT(pa == pb);

  1451             }

  1452         }

  1453     }

  1454     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);

  1455 }

  1458 int32_t  RuleBasedBreakIterator::getRuleStatus() const {

  1459     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;

  1460     nonConstThis->makeRuleStatusValid();

  1462     // fLastRuleStatusIndex indexes to the start of the appropriate status record

  1463     //                                                 (the number of status values.)

  1464     //   This function returns the last (largest) of the array of status values.

  1465     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];

  1466     int32_t  tagVal = fData->fRuleStatusTable[idx];

  1468     return tagVal;

  1469 }

  1474 int32_t RuleBasedBreakIterator::getRuleStatusVec(

  1475              int32_t *fillInVec, int32_t capacity, UErrorCode &status)

  1476 {

  1477     if (U_FAILURE(status)) {

  1478         return 0;

  1479     }

  1481     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;

  1482     nonConstThis->makeRuleStatusValid();

  1483     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];

  1484     int32_t  numValsToCopy = numVals;

  1485     if (numVals > capacity) {

  1486         status = U_BUFFER_OVERFLOW_ERROR;

  1487         numValsToCopy = capacity;

  1488     }

  1489     int i;

  1490     for (i=0; i<numValsToCopy; i++) {

  1491         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];

  1492     }

  1493     return numVals;

  1494 }

  1498 //-------------------------------------------------------------------------------

  1499 //

  1500 //   getBinaryRules        Access to the compiled form of the rules,

  1501 //                         for use by build system tools that save the data

  1502 //                         for standard iterator types.

  1503 //

  1504 //-------------------------------------------------------------------------------

  1505 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {

  1506     const uint8_t  *retPtr = NULL;

  1507     length = 0;

  1509     if (fData != NULL) {

  1510         retPtr = (const uint8_t *)fData->fHeader;

  1511         length = fData->fHeader->fLength;

  1512     }

  1513     return retPtr;

  1514 }

  1517 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,

  1518                                    int32_t &bufferSize,

  1519                                    UErrorCode &status)

  1520 {

  1521     if (U_FAILURE(status)){

  1522         return NULL;

  1523     }

  1525     if (bufferSize == 0) {

  1526         bufferSize = 1;  // preflighting for deprecated functionality

  1527         return NULL;

  1528     }

  1530     BreakIterator *clonedBI = clone();

  1531     if (clonedBI == NULL) {

  1532         status = U_MEMORY_ALLOCATION_ERROR;

  1533     } else {

  1534         status = U_SAFECLONE_ALLOCATED_WARNING;

  1535     }

  1536     return (RuleBasedBreakIterator *)clonedBI;

  1537 }

  1540 //-------------------------------------------------------------------------------

  1541 //

  1542 //  isDictionaryChar      Return true if the category lookup for this char

  1543 //                        indicates that it is in the set of dictionary lookup

  1544 //                        chars.

  1545 //

  1546 //                        This function is intended for use by dictionary based

  1547 //                        break iterators.

  1548 //

  1549 //-------------------------------------------------------------------------------

  1550 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {

  1551     if (fData == NULL) {

  1552         return FALSE;

  1553     }

  1554     uint16_t category;

  1555     UTRIE_GET16(&fData->fTrie, c, category);

  1556     return (category & 0x4000) != 0;

  1557 }*/

  1560 //-------------------------------------------------------------------------------

  1561 //

  1562 //  checkDictionary       This function handles all processing of characters in

  1563 //                        the "dictionary" set. It will determine the appropriate

  1564 //                        course of action, and possibly set up a cache in the

  1565 //                        process.

  1566 //

  1567 //-------------------------------------------------------------------------------

  1568 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,

  1569                             int32_t endPos,

  1570                             UBool reverse) {

  1571     // Reset the old break cache first.

  1572     reset();

  1574     // note: code segment below assumes that dictionary chars are in the

  1575     // startPos-endPos range

  1576     // value returned should be next character in sequence

  1577     if ((endPos - startPos) <= 1) {

  1578         return (reverse ? startPos : endPos);

  1579     }

  1581     // Bug 5532.  The dictionary code will crash if the input text is UTF-8

  1582     //      because native indexes are different from UTF-16 indexes.

  1583     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.

  1584     //      It wont give the right breaks, but it's better than a crash.

  1585     //

  1586     //      Check the type of the UText by checking its pFuncs field, which

  1587     //      is UText's function dispatch table.  It will be the same for all

  1588     //      UTF-8 UTexts and different for any other UText type.

  1589     //

  1590     //      We have no other type of UText available with non-UTF-16 native indexing.

  1591     //      This whole check will go away once the dictionary code is fixed.

  1592     static const void *utext_utf8Funcs;

  1593     if (utext_utf8Funcs == NULL) {

  1594         // Cache the UTF-8 UText function pointer value.

  1595         UErrorCode status = U_ZERO_ERROR;

  1596         UText tempUText = UTEXT_INITIALIZER;

  1597         utext_openUTF8(&tempUText, NULL, 0, &status);

  1598         utext_utf8Funcs = tempUText.pFuncs;

  1599         utext_close(&tempUText);

  1600     }

  1601     if (fText->pFuncs == utext_utf8Funcs) {

  1602         return (reverse ? startPos : endPos);

  1603     }

  1605     // Starting from the starting point, scan towards the proposed result,

  1606     // looking for the first dictionary character (which may be the one

  1607     // we're on, if we're starting in the middle of a range).

  1608     utext_setNativeIndex(fText, reverse ? endPos : startPos);

  1609     if (reverse) {

  1610         UTEXT_PREVIOUS32(fText);

  1611     }

  1613     int32_t rangeStart = startPos;

  1614     int32_t rangeEnd = endPos;

  1616     uint16_t    category;

  1617     int32_t     current;

  1618     UErrorCode  status = U_ZERO_ERROR;

  1619     UStack      breaks(status);

  1620     int32_t     foundBreakCount = 0;

  1621     UChar32     c = utext_current32(fText);

  1623     UTRIE_GET16(&fData->fTrie, c, category);

  1625     // Is the character we're starting on a dictionary character? If so, we

  1626     // need to back up to include the entire run; otherwise the results of

  1627     // the break algorithm will differ depending on where we start. Since

  1628     // the result is cached and there is typically a non-dictionary break

  1629     // within a small number of words, there should be little performance impact.

  1630     if (category & 0x4000) {

  1631         if (reverse) {

  1632             do {

  1633                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.

  1634                 c = utext_current32(fText);

  1635                 UTRIE_GET16(&fData->fTrie, c, category);

  1636             } while (c != U_SENTINEL && (category & 0x4000));

  1637             // Back up to the last dictionary character

  1638             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);

  1639             if (c == U_SENTINEL) {

  1640                 // c = fText->last32();

  1641                 //   TODO:  why was this if needed?

  1642                 c = UTEXT_PREVIOUS32(fText);

  1643             }

  1644             else {

  1645                 c = UTEXT_PREVIOUS32(fText);

  1646             }

  1647         }

  1648         else {

  1649             do {

  1650                 c = UTEXT_PREVIOUS32(fText);

  1651                 UTRIE_GET16(&fData->fTrie, c, category);

  1652             }

  1653             while (c != U_SENTINEL && (category & 0x4000));

  1654             // Back up to the last dictionary character

  1655             if (c == U_SENTINEL) {

  1656                 // c = fText->first32();

  1657                 c = utext_current32(fText);

  1658             }

  1659             else {

  1660                 utext_next32(fText);

  1661                 c = utext_current32(fText);

  1662             }

  1663             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;

  1664         }

  1665         UTRIE_GET16(&fData->fTrie, c, category);

  1666     }

  1668     // Loop through the text, looking for ranges of dictionary characters.

  1669     // For each span, find the appropriate break engine, and ask it to find

  1670     // any breaks within the span.

  1671     // Note: we always do this in the forward direction, so that the break

  1672     // cache is built in the right order.

  1673     if (reverse) {

  1674         utext_setNativeIndex(fText, rangeStart);

  1675         c = utext_current32(fText);

  1676         UTRIE_GET16(&fData->fTrie, c, category);

  1677     }

  1678     while(U_SUCCESS(status)) {

  1679         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {

  1680             utext_next32(fText);           // TODO:  tweak for post-increment operation

  1681             c = utext_current32(fText);

  1682             UTRIE_GET16(&fData->fTrie, c, category);

  1683         }

  1684         if (current >= rangeEnd) {

  1685             break;

  1686         }

  1688         // We now have a dictionary character. Get the appropriate language object

  1689         // to deal with it.

  1690         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

  1692         // Ask the language object if there are any breaks. It will leave the text

  1693         // pointer on the other side of its range, ready to search for the next one.

  1694         if (lbe != NULL) {

  1695             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);

  1696         }

  1698         // Reload the loop variables for the next go-round

  1699         c = utext_current32(fText);

  1700         UTRIE_GET16(&fData->fTrie, c, category);

  1701     }

  1703     // If we found breaks, build a new break cache. The first and last entries must

  1704     // be the original starting and ending position.

  1705     if (foundBreakCount > 0) {

  1706         int32_t totalBreaks = foundBreakCount;

  1707         if (startPos < breaks.elementAti(0)) {

  1708             totalBreaks += 1;

  1709         }

  1710         if (endPos > breaks.peeki()) {

  1711             totalBreaks += 1;

  1712         }

  1713         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));

  1714         if (fCachedBreakPositions != NULL) {

  1715             int32_t out = 0;

  1716             fNumCachedBreakPositions = totalBreaks;

  1717             if (startPos < breaks.elementAti(0)) {

  1718                 fCachedBreakPositions[out++] = startPos;

  1719             }

  1720             for (int32_t i = 0; i < foundBreakCount; ++i) {

  1721                 fCachedBreakPositions[out++] = breaks.elementAti(i);

  1722             }

  1723             if (endPos > fCachedBreakPositions[out-1]) {

  1724                 fCachedBreakPositions[out] = endPos;

  1725             }

  1726             // If there are breaks, then by definition, we are replacing the original

  1727             // proposed break by one of the breaks we found. Use following() and

  1728             // preceding() to do the work. They should never recurse in this case.

  1729             if (reverse) {

  1730                 return preceding(endPos);

  1731             }

  1732             else {

  1733                 return following(startPos);

  1734             }

  1735         }

  1736         // If the allocation failed, just fall through to the "no breaks found" case.

  1737     }

  1739     // If we get here, there were no language-based breaks. Set the text pointer

  1740     // to the original proposed break.

  1741     utext_setNativeIndex(fText, reverse ? startPos : endPos);

  1742     return (reverse ? startPos : endPos);

  1743 }

  1745 // defined in ucln_cmn.h

  1747 U_NAMESPACE_END

  1750 static icu::UStack *gLanguageBreakFactories = NULL;

  1751 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;

  1753 /**

  1754  * Release all static memory held by breakiterator.

  1755  */

  1756 U_CDECL_BEGIN

  1757 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {

  1758     if (gLanguageBreakFactories) {

  1759         delete gLanguageBreakFactories;

  1760         gLanguageBreakFactories = NULL;

  1761     }

  1762     gLanguageBreakFactoriesInitOnce.reset();

  1763     return TRUE;

  1764 }

  1765 U_CDECL_END

  1767 U_CDECL_BEGIN

  1768 static void U_CALLCONV _deleteFactory(void *obj) {

  1769     delete (icu::LanguageBreakFactory *) obj;

  1770 }

  1771 U_CDECL_END

  1772 U_NAMESPACE_BEGIN

  1774 static void U_CALLCONV initLanguageFactories() {

  1775     UErrorCode status = U_ZERO_ERROR;

  1776     U_ASSERT(gLanguageBreakFactories == NULL);

  1777     gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);

  1778     if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {

  1779         ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);

  1780         gLanguageBreakFactories->push(builtIn, status);

  1781 #ifdef U_LOCAL_SERVICE_HOOK

  1782         LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);

  1783         if (extra != NULL) {

  1784             gLanguageBreakFactories->push(extra, status);

  1785         }

  1786 #endif

  1787     }

  1788     ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);

  1789 }

  1792 static const LanguageBreakEngine*

  1793 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)

  1794 {

  1795     umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);

  1796     if (gLanguageBreakFactories == NULL) {

  1797         return NULL;

  1798     }

  1800     int32_t i = gLanguageBreakFactories->size();

  1801     const LanguageBreakEngine *lbe = NULL;

  1802     while (--i >= 0) {

  1803         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));

  1804         lbe = factory->getEngineFor(c, breakType);

  1805         if (lbe != NULL) {

  1806             break;

  1807         }

  1808     }

  1809     return lbe;

  1810 }

  1813 //-------------------------------------------------------------------------------

  1814 //

  1815 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the

  1816 //                          the character c.

  1817 //

  1818 //-------------------------------------------------------------------------------

  1819 const LanguageBreakEngine *

  1820 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {

  1821     const LanguageBreakEngine *lbe = NULL;

  1822     UErrorCode status = U_ZERO_ERROR;

  1824     if (fLanguageBreakEngines == NULL) {

  1825         fLanguageBreakEngines = new UStack(status);

  1826         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {

  1827             delete fLanguageBreakEngines;

  1828             fLanguageBreakEngines = 0;

  1829             return NULL;

  1830         }

  1831     }

  1833     int32_t i = fLanguageBreakEngines->size();

  1834     while (--i >= 0) {

  1835         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));

  1836         if (lbe->handles(c, fBreakType)) {

  1837             return lbe;

  1838         }

  1839     }

  1841     // No existing dictionary took the character. See if a factory wants to

  1842     // give us a new LanguageBreakEngine for this character.

  1843     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);

  1845     // If we got one, use it and push it on our stack.

  1846     if (lbe != NULL) {

  1847         fLanguageBreakEngines->push((void *)lbe, status);

  1848         // Even if we can't remember it, we can keep looking it up, so

  1849         // return it even if the push fails.

  1850         return lbe;

  1851     }

  1853     // No engine is forthcoming for this character. Add it to the

  1854     // reject set. Create the reject break engine if needed.

  1855     if (fUnhandledBreakEngine == NULL) {

  1856         fUnhandledBreakEngine = new UnhandledEngine(status);

  1857         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {

  1858             status = U_MEMORY_ALLOCATION_ERROR;

  1859         }

  1860         // Put it last so that scripts for which we have an engine get tried

  1861         // first.

  1862         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);

  1863         // If we can't insert it, or creation failed, get rid of it

  1864         if (U_FAILURE(status)) {

  1865             delete fUnhandledBreakEngine;

  1866             fUnhandledBreakEngine = 0;

  1867             return NULL;

  1868         }

  1869     }

  1871     // Tell the reject engine about the character; at its discretion, it may

  1872     // add more than just the one character.

  1873     fUnhandledBreakEngine->handleCharacter(c, fBreakType);

  1875     return fUnhandledBreakEngine;

  1876 }

  1880 /*int32_t RuleBasedBreakIterator::getBreakType() const {

  1881     return fBreakType;

  1882 }*/

  1884 void RuleBasedBreakIterator::setBreakType(int32_t type) {

  1885     fBreakType = type;

  1886     reset();

  1887 }

  1889 U_NAMESPACE_END

  1891 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

The Tor Browser / file revision

intl/icu/source/common/rbbi.cpp@6474c204b198

intl/icu/source/common/rbbi.cpp