intl/icu/source/common/rbbi.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 ***************************************************************************
     3 *   Copyright (C) 1999-2013 International Business Machines Corporation
     4 *   and others. All rights reserved.
     5 ***************************************************************************
     6 */
     7 //
     8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
     9 //                   runtime engine and the API implementation for
    10 //                   class RuleBasedBreakIterator
    11 //
    13 #include "utypeinfo.h"  // for 'typeid' to work
    15 #include "unicode/utypes.h"
    17 #if !UCONFIG_NO_BREAK_ITERATION
    19 #include "unicode/rbbi.h"
    20 #include "unicode/schriter.h"
    21 #include "unicode/uchriter.h"
    22 #include "unicode/udata.h"
    23 #include "unicode/uclean.h"
    24 #include "rbbidata.h"
    25 #include "rbbirb.h"
    26 #include "cmemory.h"
    27 #include "cstring.h"
    28 #include "umutex.h"
    29 #include "ucln_cmn.h"
    30 #include "brkeng.h"
    32 #include "uassert.h"
    33 #include "uvector.h"
    35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
    36 #if U_LOCAL_SERVICE_HOOK
    37 #include "localsvc.h"
    38 #endif
    40 #ifdef RBBI_DEBUG
    41 static UBool fTrace = FALSE;
    42 #endif
    44 U_NAMESPACE_BEGIN
    46 // The state number of the starting state
    47 #define START_STATE 1
    49 // The state-transition value indicating "stop"
    50 #define STOP_STATE  0
    53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
    56 //=======================================================================
    57 // constructors
    58 //=======================================================================
    60 /**
    61  * Constructs a RuleBasedBreakIterator that uses the already-created
    62  * tables object that is passed in as a parameter.
    63  */
    64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
    65 {
    66     init();
    67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
    68     if (U_FAILURE(status)) {return;}
    69     if(fData == 0) {
    70         status = U_MEMORY_ALLOCATION_ERROR;
    71         return;
    72     }
    73 }
    75 /**
    76  * Same as above but does not adopt memory
    77  */
    78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
    79 {
    80     init();
    81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
    82     if (U_FAILURE(status)) {return;}
    83     if(fData == 0) {
    84         status = U_MEMORY_ALLOCATION_ERROR;
    85         return;
    86     }
    87 }
    90 //
    91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
    92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
    93 //
    94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
    95                        uint32_t       ruleLength,
    96                        UErrorCode     &status) {
    97     init();
    98     if (U_FAILURE(status)) {
    99         return;
   100     }
   101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
   102         status = U_ILLEGAL_ARGUMENT_ERROR;
   103         return;
   104     }
   105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
   106     if (data->fLength > ruleLength) {
   107         status = U_ILLEGAL_ARGUMENT_ERROR;
   108         return;
   109     }
   110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 
   111     if (U_FAILURE(status)) {return;}
   112     if(fData == 0) {
   113         status = U_MEMORY_ALLOCATION_ERROR;
   114         return;
   115     }
   116 }    
   119 //-------------------------------------------------------------------------------
   120 //
   121 //   Constructor   from a UDataMemory handle to precompiled break rules
   122 //                 stored in an ICU data file.
   123 //
   124 //-------------------------------------------------------------------------------
   125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
   126 {
   127     init();
   128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
   129     if (U_FAILURE(status)) {return;}
   130     if(fData == 0) {
   131         status = U_MEMORY_ALLOCATION_ERROR;
   132         return;
   133     }
   134 }
   138 //-------------------------------------------------------------------------------
   139 //
   140 //   Constructor       from a set of rules supplied as a string.
   141 //
   142 //-------------------------------------------------------------------------------
   143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
   144                                                 UParseError          &parseError,
   145                                                 UErrorCode           &status)
   146 {
   147     init();
   148     if (U_FAILURE(status)) {return;}
   149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
   150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
   151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
   152     //        creates and returns a complete RBBI.  From here, in a constructor, we
   153     //        can't just return the object created by the builder factory, hence
   154     //        the assignment of the factory created object to "this".
   155     if (U_SUCCESS(status)) {
   156         *this = *bi;
   157         delete bi;
   158     }
   159 }
   162 //-------------------------------------------------------------------------------
   163 //
   164 // Default Constructor.      Create an empty shell that can be set up later.
   165 //                           Used when creating a RuleBasedBreakIterator from a set
   166 //                           of rules.
   167 //-------------------------------------------------------------------------------
   168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
   169     init();
   170 }
   173 //-------------------------------------------------------------------------------
   174 //
   175 //   Copy constructor.  Will produce a break iterator with the same behavior,
   176 //                      and which iterates over the same text, as the one passed in.
   177 //
   178 //-------------------------------------------------------------------------------
   179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
   180 : BreakIterator(other)
   181 {
   182     this->init();
   183     *this = other;
   184 }
   187 /**
   188  * Destructor
   189  */
   190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
   191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   192         // fCharIter was adopted from the outside.
   193         delete fCharIter;
   194     }
   195     fCharIter = NULL;
   196     delete fSCharIter;
   197     fCharIter = NULL;
   198     delete fDCharIter;
   199     fDCharIter = NULL;
   201     utext_close(fText);
   203     if (fData != NULL) {
   204         fData->removeReference();
   205         fData = NULL;
   206     }
   207     if (fCachedBreakPositions) {
   208         uprv_free(fCachedBreakPositions);
   209         fCachedBreakPositions = NULL;
   210     }
   211     if (fLanguageBreakEngines) {
   212         delete fLanguageBreakEngines;
   213         fLanguageBreakEngines = NULL;
   214     }
   215     if (fUnhandledBreakEngine) {
   216         delete fUnhandledBreakEngine;
   217         fUnhandledBreakEngine = NULL;
   218     }
   219 }
   221 /**
   222  * Assignment operator.  Sets this iterator to have the same behavior,
   223  * and iterate over the same text, as the one passed in.
   224  */
   225 RuleBasedBreakIterator&
   226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
   227     if (this == &that) {
   228         return *this;
   229     }
   230     reset();    // Delete break cache information
   231     fBreakType = that.fBreakType;
   232     if (fLanguageBreakEngines != NULL) {
   233         delete fLanguageBreakEngines;
   234         fLanguageBreakEngines = NULL;   // Just rebuild for now
   235     }
   236     // TODO: clone fLanguageBreakEngines from "that"
   237     UErrorCode status = U_ZERO_ERROR;
   238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
   240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   241         delete fCharIter;
   242     }
   243     fCharIter = NULL;
   245     if (that.fCharIter != NULL ) {
   246         // This is a little bit tricky - it will intially appear that
   247         //  this->fCharIter is adopted, even if that->fCharIter was
   248         //  not adopted.  That's ok.
   249         fCharIter = that.fCharIter->clone();
   250     }
   252     if (fData != NULL) {
   253         fData->removeReference();
   254         fData = NULL;
   255     }
   256     if (that.fData != NULL) {
   257         fData = that.fData->addReference();
   258     }
   260     return *this;
   261 }
   265 //-----------------------------------------------------------------------------
   266 //
   267 //    init()      Shared initialization routine.   Used by all the constructors.
   268 //                Initializes all fields, leaving the object in a consistent state.
   269 //
   270 //-----------------------------------------------------------------------------
   271 void RuleBasedBreakIterator::init() {
   272     UErrorCode  status    = U_ZERO_ERROR;
   273     fText                 = utext_openUChars(NULL, NULL, 0, &status);
   274     fCharIter             = NULL;
   275     fSCharIter            = NULL;
   276     fDCharIter            = NULL;
   277     fData                 = NULL;
   278     fLastRuleStatusIndex  = 0;
   279     fLastStatusIndexValid = TRUE;
   280     fDictionaryCharCount  = 0;
   281     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
   282                                         //   dictionary behavior for Break Iterators that are
   283                                         //   built from rules.  Even better would be the ability to
   284                                         //   declare the type in the rules.
   286     fCachedBreakPositions    = NULL;
   287     fLanguageBreakEngines    = NULL;
   288     fUnhandledBreakEngine    = NULL;
   289     fNumCachedBreakPositions = 0;
   290     fPositionInCache         = 0;
   292 #ifdef RBBI_DEBUG
   293     static UBool debugInitDone = FALSE;
   294     if (debugInitDone == FALSE) {
   295         char *debugEnv = getenv("U_RBBIDEBUG");
   296         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
   297             fTrace = TRUE;
   298         }
   299         debugInitDone = TRUE;
   300     }
   301 #endif
   302 }
   306 //-----------------------------------------------------------------------------
   307 //
   308 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
   309 //            behavior, and iterating over the same text, as this one.
   310 //            Virtual function: does the right thing with subclasses.
   311 //
   312 //-----------------------------------------------------------------------------
   313 BreakIterator*
   314 RuleBasedBreakIterator::clone(void) const {
   315     return new RuleBasedBreakIterator(*this);
   316 }
   318 /**
   319  * Equality operator.  Returns TRUE if both BreakIterators are of the
   320  * same class, have the same behavior, and iterate over the same text.
   321  */
   322 UBool
   323 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
   324     if (typeid(*this) != typeid(that)) {
   325         return FALSE;
   326     }
   328     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
   330     if (!utext_equals(fText, that2.fText)) {
   331         // The two break iterators are operating on different text,
   332         //   or have a different interation position.
   333         return FALSE;
   334     };
   336     // TODO:  need a check for when in a dictionary region at different offsets.
   338     if (that2.fData == fData ||
   339         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
   340             // The two break iterators are using the same rules.
   341             return TRUE;
   342         }
   343     return FALSE;
   344 }
   346 /**
   347  * Compute a hash code for this BreakIterator
   348  * @return A hash code
   349  */
   350 int32_t
   351 RuleBasedBreakIterator::hashCode(void) const {
   352     int32_t   hash = 0;
   353     if (fData != NULL) {
   354         hash = fData->hashCode();
   355     }
   356     return hash;
   357 }
   360 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
   361     if (U_FAILURE(status)) {
   362         return;
   363     }
   364     reset();
   365     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
   367     // Set up a dummy CharacterIterator to be returned if anyone
   368     //   calls getText().  With input from UText, there is no reasonable
   369     //   way to return a characterIterator over the actual input text.
   370     //   Return one over an empty string instead - this is the closest
   371     //   we can come to signaling a failure.
   372     //   (GetText() is obsolete, this failure is sort of OK)
   373     if (fDCharIter == NULL) {
   374         static const UChar c = 0;
   375         fDCharIter = new UCharCharacterIterator(&c, 0);
   376         if (fDCharIter == NULL) {
   377             status = U_MEMORY_ALLOCATION_ERROR;
   378             return;
   379         }
   380     }
   382     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   383         // existing fCharIter was adopted from the outside.  Delete it now.
   384         delete fCharIter;
   385     }
   386     fCharIter = fDCharIter;
   388     this->first();
   389 }
   392 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
   393     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);  
   394     return result;
   395 }
   399 /**
   400  * Returns the description used to create this iterator
   401  */
   402 const UnicodeString&
   403 RuleBasedBreakIterator::getRules() const {
   404     if (fData != NULL) {
   405         return fData->getRuleSourceString();
   406     } else {
   407         static const UnicodeString *s;
   408         if (s == NULL) {
   409             // TODO:  something more elegant here.
   410             //        perhaps API should return the string by value.
   411             //        Note:  thread unsafe init & leak are semi-ok, better than
   412             //               what was before.  Sould be cleaned up, though.
   413             s = new UnicodeString;
   414         }
   415         return *s;
   416     }
   417 }
   419 //=======================================================================
   420 // BreakIterator overrides
   421 //=======================================================================
   423 /**
   424  * Return a CharacterIterator over the text being analyzed.  
   425  */
   426 CharacterIterator&
   427 RuleBasedBreakIterator::getText() const {
   428     return *fCharIter;
   429 }
   431 /**
   432  * Set the iterator to analyze a new piece of text.  This function resets
   433  * the current iteration position to the beginning of the text.
   434  * @param newText An iterator over the text to analyze.
   435  */
   436 void
   437 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
   438     // If we are holding a CharacterIterator adopted from a 
   439     //   previous call to this function, delete it now.
   440     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   441         delete fCharIter;
   442     }
   444     fCharIter = newText;
   445     UErrorCode status = U_ZERO_ERROR;
   446     reset();
   447     if (newText==NULL || newText->startIndex() != 0) {   
   448         // startIndex !=0 wants to be an error, but there's no way to report it.
   449         // Make the iterator text be an empty string.
   450         fText = utext_openUChars(fText, NULL, 0, &status);
   451     } else {
   452         fText = utext_openCharacterIterator(fText, newText, &status);
   453     }
   454     this->first();
   455 }
   457 /**
   458  * Set the iterator to analyze a new piece of text.  This function resets
   459  * the current iteration position to the beginning of the text.
   460  * @param newText An iterator over the text to analyze.
   461  */
   462 void
   463 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
   464     UErrorCode status = U_ZERO_ERROR;
   465     reset();
   466     fText = utext_openConstUnicodeString(fText, &newText, &status);
   468     // Set up a character iterator on the string.  
   469     //   Needed in case someone calls getText().
   470     //  Can not, unfortunately, do this lazily on the (probably never)
   471     //  call to getText(), because getText is const.
   472     if (fSCharIter == NULL) {
   473         fSCharIter = new StringCharacterIterator(newText);
   474     } else {
   475         fSCharIter->setText(newText);
   476     }
   478     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   479         // old fCharIter was adopted from the outside.  Delete it.
   480         delete fCharIter;
   481     }
   482     fCharIter = fSCharIter;
   484     this->first();
   485 }
   488 /**
   489  *  Provide a new UText for the input text.  Must reference text with contents identical
   490  *  to the original.
   491  *  Intended for use with text data originating in Java (garbage collected) environments
   492  *  where the data may be moved in memory at arbitrary times.
   493  */
   494 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
   495     if (U_FAILURE(status)) {
   496         return *this;
   497     }
   498     if (input == NULL) {
   499         status = U_ILLEGAL_ARGUMENT_ERROR;
   500         return *this;
   501     }
   502     int64_t pos = utext_getNativeIndex(fText);
   503     //  Shallow read-only clone of the new UText into the existing input UText
   504     fText = utext_clone(fText, input, FALSE, TRUE, &status);
   505     if (U_FAILURE(status)) {
   506         return *this;
   507     }
   508     utext_setNativeIndex(fText, pos);
   509     if (utext_getNativeIndex(fText) != pos) {
   510         // Sanity check.  The new input utext is supposed to have the exact same
   511         // contents as the old.  If we can't set to the same position, it doesn't.
   512         // The contents underlying the old utext might be invalid at this point,
   513         // so it's not safe to check directly.
   514         status = U_ILLEGAL_ARGUMENT_ERROR;
   515     }
   516     return *this;
   517 }
   520 /**
   521  * Sets the current iteration position to the beginning of the text.
   522  * @return The offset of the beginning of the text.
   523  */
   524 int32_t RuleBasedBreakIterator::first(void) {
   525     reset();
   526     fLastRuleStatusIndex  = 0;
   527     fLastStatusIndexValid = TRUE;
   528     //if (fText == NULL)
   529     //    return BreakIterator::DONE;
   531     utext_setNativeIndex(fText, 0);
   532     return 0;
   533 }
   535 /**
   536  * Sets the current iteration position to the end of the text.
   537  * @return The text's past-the-end offset.
   538  */
   539 int32_t RuleBasedBreakIterator::last(void) {
   540     reset();
   541     if (fText == NULL) {
   542         fLastRuleStatusIndex  = 0;
   543         fLastStatusIndexValid = TRUE;
   544         return BreakIterator::DONE;
   545     }
   547     fLastStatusIndexValid = FALSE;
   548     int32_t pos = (int32_t)utext_nativeLength(fText);
   549     utext_setNativeIndex(fText, pos);
   550     return pos;
   551 }
   553 /**
   554  * Advances the iterator either forward or backward the specified number of steps.
   555  * Negative values move backward, and positive values move forward.  This is
   556  * equivalent to repeatedly calling next() or previous().
   557  * @param n The number of steps to move.  The sign indicates the direction
   558  * (negative is backwards, and positive is forwards).
   559  * @return The character offset of the boundary position n boundaries away from
   560  * the current one.
   561  */
   562 int32_t RuleBasedBreakIterator::next(int32_t n) {
   563     int32_t result = current();
   564     while (n > 0) {
   565         result = next();
   566         --n;
   567     }
   568     while (n < 0) {
   569         result = previous();
   570         ++n;
   571     }
   572     return result;
   573 }
   575 /**
   576  * Advances the iterator to the next boundary position.
   577  * @return The position of the first boundary after this one.
   578  */
   579 int32_t RuleBasedBreakIterator::next(void) {
   580     // if we have cached break positions and we're still in the range
   581     // covered by them, just move one step forward in the cache
   582     if (fCachedBreakPositions != NULL) {
   583         if (fPositionInCache < fNumCachedBreakPositions - 1) {
   584             ++fPositionInCache;
   585             int32_t pos = fCachedBreakPositions[fPositionInCache];
   586             utext_setNativeIndex(fText, pos);
   587             return pos;
   588         }
   589         else {
   590             reset();
   591         }
   592     }
   594     int32_t startPos = current();
   595     int32_t result = handleNext(fData->fForwardTable);
   596     if (fDictionaryCharCount > 0) {
   597         result = checkDictionary(startPos, result, FALSE);
   598     }
   599     return result;
   600 }
   602 /**
   603  * Advances the iterator backwards, to the last boundary preceding this one.
   604  * @return The position of the last boundary position preceding this one.
   605  */
   606 int32_t RuleBasedBreakIterator::previous(void) {
   607     int32_t result;
   608     int32_t startPos;
   610     // if we have cached break positions and we're still in the range
   611     // covered by them, just move one step backward in the cache
   612     if (fCachedBreakPositions != NULL) {
   613         if (fPositionInCache > 0) {
   614             --fPositionInCache;
   615             // If we're at the beginning of the cache, need to reevaluate the
   616             // rule status
   617             if (fPositionInCache <= 0) {
   618                 fLastStatusIndexValid = FALSE;
   619             }
   620             int32_t pos = fCachedBreakPositions[fPositionInCache];
   621             utext_setNativeIndex(fText, pos);
   622             return pos;
   623         }
   624         else {
   625             reset();
   626         }
   627     }
   629     // if we're already sitting at the beginning of the text, return DONE
   630     if (fText == NULL || (startPos = current()) == 0) {
   631         fLastRuleStatusIndex  = 0;
   632         fLastStatusIndexValid = TRUE;
   633         return BreakIterator::DONE;
   634     }
   636     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
   637         result = handlePrevious(fData->fReverseTable);
   638         if (fDictionaryCharCount > 0) {
   639             result = checkDictionary(result, startPos, TRUE);
   640         }
   641         return result;
   642     }
   644     // old rule syntax
   645     // set things up.  handlePrevious() will back us up to some valid
   646     // break position before the current position (we back our internal
   647     // iterator up one step to prevent handlePrevious() from returning
   648     // the current position), but not necessarily the last one before
   650     // where we started
   652     int32_t start = current();
   654     (void)UTEXT_PREVIOUS32(fText);
   655     int32_t lastResult    = handlePrevious(fData->fReverseTable);
   656     if (lastResult == UBRK_DONE) {
   657         lastResult = 0;
   658         utext_setNativeIndex(fText, 0);
   659     }
   660     result = lastResult;
   661     int32_t lastTag       = 0;
   662     UBool   breakTagValid = FALSE;
   664     // iterate forward from the known break position until we pass our
   665     // starting point.  The last break position before the starting
   666     // point is our return value
   668     for (;;) {
   669         result         = next();
   670         if (result == BreakIterator::DONE || result >= start) {
   671             break;
   672         }
   673         lastResult     = result;
   674         lastTag        = fLastRuleStatusIndex;
   675         breakTagValid  = TRUE;
   676     }
   678     // fLastBreakTag wants to have the value for section of text preceding
   679     // the result position that we are to return (in lastResult.)  If
   680     // the backwards rules overshot and the above loop had to do two or more
   681     // next()s to move up to the desired return position, we will have a valid
   682     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
   683     // we wont have a tag value for that position, which is only set by handleNext().
   685     // set the current iteration position to be the last break position
   686     // before where we started, and then return that value
   687     utext_setNativeIndex(fText, lastResult);
   688     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
   689     fLastStatusIndexValid = breakTagValid;
   691     // No need to check the dictionary; it will have been handled by
   692     // next()
   694     return lastResult;
   695 }
   697 /**
   698  * Sets the iterator to refer to the first boundary position following
   699  * the specified position.
   700  * @offset The position from which to begin searching for a break position.
   701  * @return The position of the first break after the current position.
   702  */
   703 int32_t RuleBasedBreakIterator::following(int32_t offset) {
   704     // if we have cached break positions and offset is in the range
   705     // covered by them, use them
   706     // TODO: could use binary search
   707     // TODO: what if offset is outside range, but break is not?
   708     if (fCachedBreakPositions != NULL) {
   709         if (offset >= fCachedBreakPositions[0]
   710                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
   711             fPositionInCache = 0;
   712             // We are guaranteed not to leave the array due to range test above
   713             while (offset >= fCachedBreakPositions[fPositionInCache]) {
   714                 ++fPositionInCache;
   715             }
   716             int32_t pos = fCachedBreakPositions[fPositionInCache];
   717             utext_setNativeIndex(fText, pos);
   718             return pos;
   719         }
   720         else {
   721             reset();
   722         }
   723     }
   725     // if the offset passed in is already past the end of the text,
   726     // just return DONE; if it's before the beginning, return the
   727     // text's starting offset
   728     fLastRuleStatusIndex  = 0;
   729     fLastStatusIndexValid = TRUE;
   730     if (fText == NULL || offset >= utext_nativeLength(fText)) {
   731         last();
   732         return next();
   733     }
   734     else if (offset < 0) {
   735         return first();
   736     }
   738     // otherwise, set our internal iteration position (temporarily)
   739     // to the position passed in.  If this is the _beginning_ position,
   740     // then we can just use next() to get our return value
   742     int32_t result = 0;
   744     if (fData->fSafeRevTable != NULL) {
   745         // new rule syntax
   746         utext_setNativeIndex(fText, offset);
   747         // move forward one codepoint to prepare for moving back to a
   748         // safe point.
   749         // this handles offset being between a supplementary character
   750         (void)UTEXT_NEXT32(fText);
   751         // handlePrevious will move most of the time to < 1 boundary away
   752         handlePrevious(fData->fSafeRevTable);
   753         int32_t result = next();
   754         while (result <= offset) {
   755             result = next();
   756         }
   757         return result;
   758     }
   759     if (fData->fSafeFwdTable != NULL) {
   760         // backup plan if forward safe table is not available
   761         utext_setNativeIndex(fText, offset);
   762         (void)UTEXT_PREVIOUS32(fText);
   763         // handle next will give result >= offset
   764         handleNext(fData->fSafeFwdTable);
   765         // previous will give result 0 or 1 boundary away from offset,
   766         // most of the time
   767         // we have to
   768         int32_t oldresult = previous();
   769         while (oldresult > offset) {
   770             int32_t result = previous();
   771             if (result <= offset) {
   772                 return oldresult;
   773             }
   774             oldresult = result;
   775         }
   776         int32_t result = next();
   777         if (result <= offset) {
   778             return next();
   779         }
   780         return result;
   781     }
   782     // otherwise, we have to sync up first.  Use handlePrevious() to back
   783     // up to a known break position before the specified position (if
   784     // we can determine that the specified position is a break position,
   785     // we don't back up at all).  This may or may not be the last break
   786     // position at or before our starting position.  Advance forward
   787     // from here until we've passed the starting position.  The position
   788     // we stop on will be the first break position after the specified one.
   789     // old rule syntax
   791     utext_setNativeIndex(fText, offset);
   792     if (offset==0 || 
   793         (offset==1  && utext_getNativeIndex(fText)==0)) {
   794         return next();
   795     }
   796     result = previous();
   798     while (result != BreakIterator::DONE && result <= offset) {
   799         result = next();
   800     }
   802     return result;
   803 }
   805 /**
   806  * Sets the iterator to refer to the last boundary position before the
   807  * specified position.
   808  * @offset The position to begin searching for a break from.
   809  * @return The position of the last boundary before the starting position.
   810  */
   811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
   812     // if we have cached break positions and offset is in the range
   813     // covered by them, use them
   814     if (fCachedBreakPositions != NULL) {
   815         // TODO: binary search?
   816         // TODO: What if offset is outside range, but break is not?
   817         if (offset > fCachedBreakPositions[0]
   818                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
   819             fPositionInCache = 0;
   820             while (fPositionInCache < fNumCachedBreakPositions
   821                    && offset > fCachedBreakPositions[fPositionInCache])
   822                 ++fPositionInCache;
   823             --fPositionInCache;
   824             // If we're at the beginning of the cache, need to reevaluate the
   825             // rule status
   826             if (fPositionInCache <= 0) {
   827                 fLastStatusIndexValid = FALSE;
   828             }
   829             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
   830             return fCachedBreakPositions[fPositionInCache];
   831         }
   832         else {
   833             reset();
   834         }
   835     }
   837     // if the offset passed in is already past the end of the text,
   838     // just return DONE; if it's before the beginning, return the
   839     // text's starting offset
   840     if (fText == NULL || offset > utext_nativeLength(fText)) {
   841         // return BreakIterator::DONE;
   842         return last();
   843     }
   844     else if (offset < 0) {
   845         return first();
   846     }
   848     // if we start by updating the current iteration position to the
   849     // position specified by the caller, we can just use previous()
   850     // to carry out this operation
   852     if (fData->fSafeFwdTable != NULL) {
   853         // new rule syntax
   854         utext_setNativeIndex(fText, offset);
   855         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   856         if (newOffset != offset) {
   857             // Will come here if specified offset was not a code point boundary AND
   858             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
   859             //   indices to the containing code point.
   860             // For breakitereator::preceding only, these non-code-point indices need to be moved
   861             //   up to refer to the following codepoint.
   862             (void)UTEXT_NEXT32(fText);
   863             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   864         }
   866         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
   867         //        rather than adjusting the position unconditionally?
   868         //        (Change would interact with safe rules.)
   869         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
   870         //        affects only preceding(), seems cleaner, but is slightly different.
   871         (void)UTEXT_PREVIOUS32(fText);
   872         handleNext(fData->fSafeFwdTable);
   873         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   874         while (result >= offset) {
   875             result = previous();
   876         }
   877         return result;
   878     }
   879     if (fData->fSafeRevTable != NULL) {
   880         // backup plan if forward safe table is not available
   881         //  TODO:  check whether this path can be discarded
   882         //         It's probably OK to say that rules must supply both safe tables
   883         //            if they use safe tables at all.  We have certainly never described
   884         //            to anyone how to work with just one safe table.
   885         utext_setNativeIndex(fText, offset);
   886         (void)UTEXT_NEXT32(fText);
   888         // handle previous will give result <= offset
   889         handlePrevious(fData->fSafeRevTable);
   891         // next will give result 0 or 1 boundary away from offset,
   892         // most of the time
   893         // we have to
   894         int32_t oldresult = next();
   895         while (oldresult < offset) {
   896             int32_t result = next();
   897             if (result >= offset) {
   898                 return oldresult;
   899             }
   900             oldresult = result;
   901         }
   902         int32_t result = previous();
   903         if (result >= offset) {
   904             return previous();
   905         }
   906         return result;
   907     }
   909     // old rule syntax
   910     utext_setNativeIndex(fText, offset);
   911     return previous();
   912 }
   914 /**
   915  * Returns true if the specfied position is a boundary position.  As a side
   916  * effect, leaves the iterator pointing to the first boundary position at
   917  * or after "offset".
   918  * @param offset the offset to check.
   919  * @return True if "offset" is a boundary position.
   920  */
   921 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
   922     // the beginning index of the iterator is always a boundary position by definition
   923     if (offset == 0) {
   924         first();       // For side effects on current position, tag values.
   925         return TRUE;
   926     }
   928     if (offset == (int32_t)utext_nativeLength(fText)) {
   929         last();       // For side effects on current position, tag values.
   930         return TRUE;
   931     }
   933     // out-of-range indexes are never boundary positions
   934     if (offset < 0) {
   935         first();       // For side effects on current position, tag values.
   936         return FALSE;
   937     }
   939     if (offset > utext_nativeLength(fText)) {
   940         last();        // For side effects on current position, tag values.
   941         return FALSE;
   942     }
   944     // otherwise, we can use following() on the position before the specified
   945     // one and return true if the position we get back is the one the user
   946     // specified
   947     utext_previous32From(fText, offset);
   948     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   949     UBool    result  = following(backOne) == offset;
   950     return result;
   951 }
   953 /**
   954  * Returns the current iteration position.
   955  * @return The current iteration position.
   956  */
   957 int32_t RuleBasedBreakIterator::current(void) const {
   958     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   959     return pos;
   960 }
   962 //=======================================================================
   963 // implementation
   964 //=======================================================================
   966 //
   967 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
   968 //                 of user text.  A variable with this enum type keeps track of where we
   969 //                 are.  The state machine only fetches user input while in the RUN mode.
   970 //
   971 enum RBBIRunMode {
   972     RBBI_START,     // state machine processing is before first char of input
   973     RBBI_RUN,       // state machine processing is in the user text
   974     RBBI_END        // state machine processing is after end of user text.
   975 };
   978 //-----------------------------------------------------------------------------------
   979 //
   980 //  handleNext(stateTable)
   981 //     This method is the actual implementation of the rbbi next() method. 
   982 //     This method initializes the state machine to state 1
   983 //     and advances through the text character by character until we reach the end
   984 //     of the text or the state machine transitions to state 0.  We update our return
   985 //     value every time the state machine passes through an accepting state.
   986 //
   987 //-----------------------------------------------------------------------------------
   988 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
   989     int32_t             state;
   990     uint16_t            category        = 0;
   991     RBBIRunMode         mode;
   993     RBBIStateTableRow  *row;
   994     UChar32             c;
   995     int32_t             lookaheadStatus = 0;
   996     int32_t             lookaheadTagIdx = 0;
   997     int32_t             result          = 0;
   998     int32_t             initialPosition = 0;
   999     int32_t             lookaheadResult = 0;
  1000     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
  1001     const char         *tableData       = statetable->fTableData;
  1002     uint32_t            tableRowLen     = statetable->fRowLen;
  1004     #ifdef RBBI_DEBUG
  1005         if (fTrace) {
  1006             RBBIDebugPuts("Handle Next   pos   char  state category");
  1008     #endif
  1010     // No matter what, handleNext alway correctly sets the break tag value.
  1011     fLastStatusIndexValid = TRUE;
  1012     fLastRuleStatusIndex = 0;
  1014     // if we're already at the end of the text, return DONE.
  1015     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 
  1016     result          = initialPosition;
  1017     c               = UTEXT_NEXT32(fText);
  1018     if (fData == NULL || c==U_SENTINEL) {
  1019         return BreakIterator::DONE;
  1022     //  Set the initial state for the state machine
  1023     state = START_STATE;
  1024     row = (RBBIStateTableRow *)
  1025             //(statetable->fTableData + (statetable->fRowLen * state));
  1026             (tableData + tableRowLen * state);
  1029     mode     = RBBI_RUN;
  1030     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
  1031         category = 2;
  1032         mode     = RBBI_START;
  1036     // loop until we reach the end of the text or transition to state 0
  1037     //
  1038     for (;;) {
  1039         if (c == U_SENTINEL) {
  1040             // Reached end of input string.
  1041             if (mode == RBBI_END) {
  1042                 // We have already run the loop one last time with the 
  1043                 //   character set to the psueudo {eof} value.  Now it is time
  1044                 //   to unconditionally bail out.
  1045                 if (lookaheadResult > result) {
  1046                     // We ran off the end of the string with a pending look-ahead match.
  1047                     // Treat this as if the look-ahead condition had been met, and return
  1048                     //  the match at the / position from the look-ahead rule.
  1049                     result               = lookaheadResult;
  1050                     fLastRuleStatusIndex = lookaheadTagIdx;
  1051                     lookaheadStatus = 0;
  1053                 break;
  1055             // Run the loop one last time with the fake end-of-input character category.
  1056             mode = RBBI_END;
  1057             category = 1;
  1060         //
  1061         // Get the char category.  An incoming category of 1 or 2 means that
  1062         //      we are preset for doing the beginning or end of input, and
  1063         //      that we shouldn't get a category from an actual text input character.
  1064         //
  1065         if (mode == RBBI_RUN) {
  1066             // look up the current character's character category, which tells us
  1067             // which column in the state table to look at.
  1068             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
  1069             //        not the size of the character going in, which is a UChar32.
  1070             //
  1071             UTRIE_GET16(&fData->fTrie, c, category);
  1073             // Check the dictionary bit in the character's category.
  1074             //    Counter is only used by dictionary based iterators (subclasses).
  1075             //    Chars that need to be handled by a dictionary have a flag bit set
  1076             //    in their category values.
  1077             //
  1078             if ((category & 0x4000) != 0)  {
  1079                 fDictionaryCharCount++;
  1080                 //  And off the dictionary flag bit.
  1081                 category &= ~0x4000;
  1085        #ifdef RBBI_DEBUG
  1086             if (fTrace) {
  1087                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
  1088                 if (0x20<=c && c<0x7f) {
  1089                     RBBIDebugPrintf("\"%c\"  ", c);
  1090                 } else {
  1091                     RBBIDebugPrintf("%5x  ", c);
  1093                 RBBIDebugPrintf("%3d  %3d\n", state, category);
  1095         #endif
  1097         // State Transition - move machine to its next state
  1098         //
  1100         // Note: fNextState is defined as uint16_t[2], but we are casting
  1101         // a generated RBBI table to RBBIStateTableRow and some tables
  1102         // actually have more than 2 categories.
  1103         U_ASSERT(category<fData->fHeader->fCatCount);
  1104         state = row->fNextState[category];  /*Not accessing beyond memory*/
  1105         row = (RBBIStateTableRow *)
  1106             // (statetable->fTableData + (statetable->fRowLen * state));
  1107             (tableData + tableRowLen * state);
  1110         if (row->fAccepting == -1) {
  1111             // Match found, common case.
  1112             if (mode != RBBI_START) {
  1113                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1115             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
  1118         if (row->fLookAhead != 0) {
  1119             if (lookaheadStatus != 0
  1120                 && row->fAccepting == lookaheadStatus) {
  1121                 // Lookahead match is completed.  
  1122                 result               = lookaheadResult;
  1123                 fLastRuleStatusIndex = lookaheadTagIdx;
  1124                 lookaheadStatus      = 0;
  1125                 // TODO:  make a standalone hard break in a rule work.
  1126                 if (lookAheadHardBreak) {
  1127                     UTEXT_SETNATIVEINDEX(fText, result);
  1128                     return result;
  1130                 // Look-ahead completed, but other rules may match further.  Continue on
  1131                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
  1132                 goto continueOn;
  1135             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1136             lookaheadResult = r;
  1137             lookaheadStatus = row->fLookAhead;
  1138             lookaheadTagIdx = row->fTagIdx;
  1139             goto continueOn;
  1143         if (row->fAccepting != 0) {
  1144             // Because this is an accepting state, any in-progress look-ahead match
  1145             //   is no longer relavant.  Clear out the pending lookahead status.
  1146             lookaheadStatus = 0;           // clear out any pending look-ahead match.
  1149 continueOn:
  1150         if (state == STOP_STATE) {
  1151             // This is the normal exit from the lookup state machine.
  1152             // We have advanced through the string until it is certain that no
  1153             //   longer match is possible, no matter what characters follow.
  1154             break;
  1157         // Advance to the next character.  
  1158         // If this is a beginning-of-input loop iteration, don't advance
  1159         //    the input position.  The next iteration will be processing the
  1160         //    first real input character.
  1161         if (mode == RBBI_RUN) {
  1162             c = UTEXT_NEXT32(fText);
  1163         } else {
  1164             if (mode == RBBI_START) {
  1165                 mode = RBBI_RUN;
  1172     // The state machine is done.  Check whether it found a match...
  1174     // If the iterator failed to advance in the match engine, force it ahead by one.
  1175     //   (This really indicates a defect in the break rules.  They should always match
  1176     //    at least one character.)
  1177     if (result == initialPosition) {
  1178         UTEXT_SETNATIVEINDEX(fText, initialPosition);
  1179         UTEXT_NEXT32(fText);
  1180         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1183     // Leave the iterator at our result position.
  1184     UTEXT_SETNATIVEINDEX(fText, result);
  1185     #ifdef RBBI_DEBUG
  1186         if (fTrace) {
  1187             RBBIDebugPrintf("result = %d\n\n", result);
  1189     #endif
  1190     return result;
  1195 //-----------------------------------------------------------------------------------
  1196 //
  1197 //  handlePrevious()
  1198 //
  1199 //      Iterate backwards, according to the logic of the reverse rules.
  1200 //      This version handles the exact style backwards rules.
  1201 //
  1202 //      The logic of this function is very similar to handleNext(), above.
  1203 //
  1204 //-----------------------------------------------------------------------------------
  1205 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
  1206     int32_t             state;
  1207     uint16_t            category        = 0;
  1208     RBBIRunMode         mode;
  1209     RBBIStateTableRow  *row;
  1210     UChar32             c;
  1211     int32_t             lookaheadStatus = 0;
  1212     int32_t             result          = 0;
  1213     int32_t             initialPosition = 0;
  1214     int32_t             lookaheadResult = 0;
  1215     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
  1217     #ifdef RBBI_DEBUG
  1218         if (fTrace) {
  1219             RBBIDebugPuts("Handle Previous   pos   char  state category");
  1221     #endif
  1223     // handlePrevious() never gets the rule status.
  1224     // Flag the status as invalid; if the user ever asks for status, we will need
  1225     // to back up, then re-find the break position using handleNext(), which does
  1226     // get the status value.
  1227     fLastStatusIndexValid = FALSE;
  1228     fLastRuleStatusIndex = 0;
  1230     // if we're already at the start of the text, return DONE.
  1231     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
  1232         return BreakIterator::DONE;
  1235     //  Set up the starting char.
  1236     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1237     result          = initialPosition;
  1238     c               = UTEXT_PREVIOUS32(fText);
  1240     //  Set the initial state for the state machine
  1241     state = START_STATE;
  1242     row = (RBBIStateTableRow *)
  1243             (statetable->fTableData + (statetable->fRowLen * state));
  1244     category = 3;
  1245     mode     = RBBI_RUN;
  1246     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
  1247         category = 2;
  1248         mode     = RBBI_START;
  1252     // loop until we reach the start of the text or transition to state 0
  1253     //
  1254     for (;;) {
  1255         if (c == U_SENTINEL) {
  1256             // Reached end of input string.
  1257             if (mode == RBBI_END) {
  1258                 // We have already run the loop one last time with the 
  1259                 //   character set to the psueudo {eof} value.  Now it is time
  1260                 //   to unconditionally bail out.
  1261                 if (lookaheadResult < result) {
  1262                     // We ran off the end of the string with a pending look-ahead match.
  1263                     // Treat this as if the look-ahead condition had been met, and return
  1264                     //  the match at the / position from the look-ahead rule.
  1265                     result               = lookaheadResult;
  1266                     lookaheadStatus = 0;
  1267                 } else if (result == initialPosition) {
  1268                     // Ran off start, no match found.
  1269                     // move one index one (towards the start, since we are doing a previous())
  1270                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
  1271                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
  1273                 break;
  1275             // Run the loop one last time with the fake end-of-input character category.
  1276             mode = RBBI_END;
  1277             category = 1;
  1280         //
  1281         // Get the char category.  An incoming category of 1 or 2 means that
  1282         //      we are preset for doing the beginning or end of input, and
  1283         //      that we shouldn't get a category from an actual text input character.
  1284         //
  1285         if (mode == RBBI_RUN) {
  1286             // look up the current character's character category, which tells us
  1287             // which column in the state table to look at.
  1288             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
  1289             //        not the size of the character going in, which is a UChar32.
  1290             //
  1291             UTRIE_GET16(&fData->fTrie, c, category);
  1293             // Check the dictionary bit in the character's category.
  1294             //    Counter is only used by dictionary based iterators (subclasses).
  1295             //    Chars that need to be handled by a dictionary have a flag bit set
  1296             //    in their category values.
  1297             //
  1298             if ((category & 0x4000) != 0)  {
  1299                 fDictionaryCharCount++;
  1300                 //  And off the dictionary flag bit.
  1301                 category &= ~0x4000;
  1305         #ifdef RBBI_DEBUG
  1306             if (fTrace) {
  1307                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
  1308                 if (0x20<=c && c<0x7f) {
  1309                     RBBIDebugPrintf("\"%c\"  ", c);
  1310                 } else {
  1311                     RBBIDebugPrintf("%5x  ", c);
  1313                 RBBIDebugPrintf("%3d  %3d\n", state, category);
  1315         #endif
  1317         // State Transition - move machine to its next state
  1318         //
  1320         // Note: fNextState is defined as uint16_t[2], but we are casting
  1321         // a generated RBBI table to RBBIStateTableRow and some tables
  1322         // actually have more than 2 categories.
  1323         U_ASSERT(category<fData->fHeader->fCatCount);
  1324         state = row->fNextState[category];  /*Not accessing beyond memory*/
  1325         row = (RBBIStateTableRow *)
  1326             (statetable->fTableData + (statetable->fRowLen * state));
  1328         if (row->fAccepting == -1) {
  1329             // Match found, common case.
  1330             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1333         if (row->fLookAhead != 0) {
  1334             if (lookaheadStatus != 0
  1335                 && row->fAccepting == lookaheadStatus) {
  1336                 // Lookahead match is completed.  
  1337                 result               = lookaheadResult;
  1338                 lookaheadStatus      = 0;
  1339                 // TODO:  make a standalone hard break in a rule work.
  1340                 if (lookAheadHardBreak) {
  1341                     UTEXT_SETNATIVEINDEX(fText, result);
  1342                     return result;
  1344                 // Look-ahead completed, but other rules may match further.  Continue on
  1345                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
  1346                 goto continueOn;
  1349             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1350             lookaheadResult = r;
  1351             lookaheadStatus = row->fLookAhead;
  1352             goto continueOn;
  1356         if (row->fAccepting != 0) {
  1357             // Because this is an accepting state, any in-progress look-ahead match
  1358             //   is no longer relavant.  Clear out the pending lookahead status.
  1359             lookaheadStatus = 0;    
  1362 continueOn:
  1363         if (state == STOP_STATE) {
  1364             // This is the normal exit from the lookup state machine.
  1365             // We have advanced through the string until it is certain that no
  1366             //   longer match is possible, no matter what characters follow.
  1367             break;
  1370         // Move (backwards) to the next character to process.  
  1371         // If this is a beginning-of-input loop iteration, don't advance
  1372         //    the input position.  The next iteration will be processing the
  1373         //    first real input character.
  1374         if (mode == RBBI_RUN) {
  1375             c = UTEXT_PREVIOUS32(fText);
  1376         } else {            
  1377             if (mode == RBBI_START) {
  1378                 mode = RBBI_RUN;
  1383     // The state machine is done.  Check whether it found a match...
  1385     // If the iterator failed to advance in the match engine, force it ahead by one.
  1386     //   (This really indicates a defect in the break rules.  They should always match
  1387     //    at least one character.)
  1388     if (result == initialPosition) {
  1389         UTEXT_SETNATIVEINDEX(fText, initialPosition);
  1390         UTEXT_PREVIOUS32(fText);
  1391         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1394     // Leave the iterator at our result position.
  1395     UTEXT_SETNATIVEINDEX(fText, result);
  1396     #ifdef RBBI_DEBUG
  1397         if (fTrace) {
  1398             RBBIDebugPrintf("result = %d\n\n", result);
  1400     #endif
  1401     return result;
  1405 void
  1406 RuleBasedBreakIterator::reset()
  1408     if (fCachedBreakPositions) {
  1409         uprv_free(fCachedBreakPositions);
  1411     fCachedBreakPositions = NULL;
  1412     fNumCachedBreakPositions = 0;
  1413     fDictionaryCharCount = 0;
  1414     fPositionInCache = 0;
  1419 //-------------------------------------------------------------------------------
  1420 //
  1421 //   getRuleStatus()   Return the break rule tag associated with the current
  1422 //                     iterator position.  If the iterator arrived at its current
  1423 //                     position by iterating forwards, the value will have been
  1424 //                     cached by the handleNext() function.
  1425 //
  1426 //                     If no cached status value is available, the status is
  1427 //                     found by doing a previous() followed by a next(), which
  1428 //                     leaves the iterator where it started, and computes the
  1429 //                     status while doing the next().
  1430 //
  1431 //-------------------------------------------------------------------------------
  1432 void RuleBasedBreakIterator::makeRuleStatusValid() {
  1433     if (fLastStatusIndexValid == FALSE) {
  1434         //  No cached status is available.
  1435         if (fText == NULL || current() == 0) {
  1436             //  At start of text, or there is no text.  Status is always zero.
  1437             fLastRuleStatusIndex = 0;
  1438             fLastStatusIndexValid = TRUE;
  1439         } else {
  1440             //  Not at start of text.  Find status the tedious way.
  1441             int32_t pa = current();
  1442             previous();
  1443             if (fNumCachedBreakPositions > 0) {
  1444                 reset();                // Blow off the dictionary cache
  1446             int32_t pb = next();
  1447             if (pa != pb) {
  1448                 // note: the if (pa != pb) test is here only to eliminate warnings for
  1449                 //       unused local variables on gcc.  Logically, it isn't needed.
  1450                 U_ASSERT(pa == pb);
  1454     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
  1458 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
  1459     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
  1460     nonConstThis->makeRuleStatusValid();
  1462     // fLastRuleStatusIndex indexes to the start of the appropriate status record
  1463     //                                                 (the number of status values.)
  1464     //   This function returns the last (largest) of the array of status values.
  1465     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
  1466     int32_t  tagVal = fData->fRuleStatusTable[idx];
  1468     return tagVal;
  1474 int32_t RuleBasedBreakIterator::getRuleStatusVec(
  1475              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
  1477     if (U_FAILURE(status)) {
  1478         return 0;
  1481     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
  1482     nonConstThis->makeRuleStatusValid();
  1483     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
  1484     int32_t  numValsToCopy = numVals;
  1485     if (numVals > capacity) {
  1486         status = U_BUFFER_OVERFLOW_ERROR;
  1487         numValsToCopy = capacity;
  1489     int i;
  1490     for (i=0; i<numValsToCopy; i++) {
  1491         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
  1493     return numVals;
  1498 //-------------------------------------------------------------------------------
  1499 //
  1500 //   getBinaryRules        Access to the compiled form of the rules,
  1501 //                         for use by build system tools that save the data
  1502 //                         for standard iterator types.
  1503 //
  1504 //-------------------------------------------------------------------------------
  1505 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
  1506     const uint8_t  *retPtr = NULL;
  1507     length = 0;
  1509     if (fData != NULL) {
  1510         retPtr = (const uint8_t *)fData->fHeader;
  1511         length = fData->fHeader->fLength;
  1513     return retPtr;
  1517 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
  1518                                    int32_t &bufferSize,
  1519                                    UErrorCode &status)
  1521     if (U_FAILURE(status)){
  1522         return NULL;
  1525     if (bufferSize == 0) {
  1526         bufferSize = 1;  // preflighting for deprecated functionality
  1527         return NULL;
  1530     BreakIterator *clonedBI = clone();
  1531     if (clonedBI == NULL) {
  1532         status = U_MEMORY_ALLOCATION_ERROR;
  1533     } else {
  1534         status = U_SAFECLONE_ALLOCATED_WARNING;
  1536     return (RuleBasedBreakIterator *)clonedBI;
  1540 //-------------------------------------------------------------------------------
  1541 //
  1542 //  isDictionaryChar      Return true if the category lookup for this char
  1543 //                        indicates that it is in the set of dictionary lookup
  1544 //                        chars.
  1545 //
  1546 //                        This function is intended for use by dictionary based
  1547 //                        break iterators.
  1548 //
  1549 //-------------------------------------------------------------------------------
  1550 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
  1551     if (fData == NULL) {
  1552         return FALSE;
  1554     uint16_t category;
  1555     UTRIE_GET16(&fData->fTrie, c, category);
  1556     return (category & 0x4000) != 0;
  1557 }*/
  1560 //-------------------------------------------------------------------------------
  1561 //
  1562 //  checkDictionary       This function handles all processing of characters in
  1563 //                        the "dictionary" set. It will determine the appropriate
  1564 //                        course of action, and possibly set up a cache in the
  1565 //                        process.
  1566 //
  1567 //-------------------------------------------------------------------------------
  1568 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
  1569                             int32_t endPos,
  1570                             UBool reverse) {
  1571     // Reset the old break cache first.
  1572     reset();
  1574     // note: code segment below assumes that dictionary chars are in the 
  1575     // startPos-endPos range
  1576     // value returned should be next character in sequence
  1577     if ((endPos - startPos) <= 1) {
  1578         return (reverse ? startPos : endPos);
  1581     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
  1582     //      because native indexes are different from UTF-16 indexes.
  1583     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
  1584     //      It wont give the right breaks, but it's better than a crash.
  1585     //
  1586     //      Check the type of the UText by checking its pFuncs field, which
  1587     //      is UText's function dispatch table.  It will be the same for all
  1588     //      UTF-8 UTexts and different for any other UText type.
  1589     //
  1590     //      We have no other type of UText available with non-UTF-16 native indexing.
  1591     //      This whole check will go away once the dictionary code is fixed.
  1592     static const void *utext_utf8Funcs;
  1593     if (utext_utf8Funcs == NULL) {
  1594         // Cache the UTF-8 UText function pointer value.
  1595         UErrorCode status = U_ZERO_ERROR;
  1596         UText tempUText = UTEXT_INITIALIZER; 
  1597         utext_openUTF8(&tempUText, NULL, 0, &status);
  1598         utext_utf8Funcs = tempUText.pFuncs;
  1599         utext_close(&tempUText);
  1601     if (fText->pFuncs == utext_utf8Funcs) {
  1602         return (reverse ? startPos : endPos);
  1605     // Starting from the starting point, scan towards the proposed result,
  1606     // looking for the first dictionary character (which may be the one
  1607     // we're on, if we're starting in the middle of a range).
  1608     utext_setNativeIndex(fText, reverse ? endPos : startPos);
  1609     if (reverse) {
  1610         UTEXT_PREVIOUS32(fText);
  1613     int32_t rangeStart = startPos;
  1614     int32_t rangeEnd = endPos;
  1616     uint16_t    category;
  1617     int32_t     current;
  1618     UErrorCode  status = U_ZERO_ERROR;
  1619     UStack      breaks(status);
  1620     int32_t     foundBreakCount = 0;
  1621     UChar32     c = utext_current32(fText);
  1623     UTRIE_GET16(&fData->fTrie, c, category);
  1625     // Is the character we're starting on a dictionary character? If so, we
  1626     // need to back up to include the entire run; otherwise the results of
  1627     // the break algorithm will differ depending on where we start. Since
  1628     // the result is cached and there is typically a non-dictionary break
  1629     // within a small number of words, there should be little performance impact.
  1630     if (category & 0x4000) {
  1631         if (reverse) {
  1632             do {
  1633                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
  1634                 c = utext_current32(fText);
  1635                 UTRIE_GET16(&fData->fTrie, c, category);
  1636             } while (c != U_SENTINEL && (category & 0x4000));
  1637             // Back up to the last dictionary character
  1638             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1639             if (c == U_SENTINEL) {
  1640                 // c = fText->last32();
  1641                 //   TODO:  why was this if needed?
  1642                 c = UTEXT_PREVIOUS32(fText);
  1644             else {
  1645                 c = UTEXT_PREVIOUS32(fText);
  1648         else {
  1649             do {
  1650                 c = UTEXT_PREVIOUS32(fText);
  1651                 UTRIE_GET16(&fData->fTrie, c, category);
  1653             while (c != U_SENTINEL && (category & 0x4000));
  1654             // Back up to the last dictionary character
  1655             if (c == U_SENTINEL) {
  1656                 // c = fText->first32();
  1657                 c = utext_current32(fText);
  1659             else {
  1660                 utext_next32(fText);
  1661                 c = utext_current32(fText);
  1663             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
  1665         UTRIE_GET16(&fData->fTrie, c, category);
  1668     // Loop through the text, looking for ranges of dictionary characters.
  1669     // For each span, find the appropriate break engine, and ask it to find
  1670     // any breaks within the span.
  1671     // Note: we always do this in the forward direction, so that the break
  1672     // cache is built in the right order.
  1673     if (reverse) {
  1674         utext_setNativeIndex(fText, rangeStart);
  1675         c = utext_current32(fText);
  1676         UTRIE_GET16(&fData->fTrie, c, category);
  1678     while(U_SUCCESS(status)) {
  1679         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
  1680             utext_next32(fText);           // TODO:  tweak for post-increment operation
  1681             c = utext_current32(fText);
  1682             UTRIE_GET16(&fData->fTrie, c, category);
  1684         if (current >= rangeEnd) {
  1685             break;
  1688         // We now have a dictionary character. Get the appropriate language object
  1689         // to deal with it.
  1690         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
  1692         // Ask the language object if there are any breaks. It will leave the text
  1693         // pointer on the other side of its range, ready to search for the next one.
  1694         if (lbe != NULL) {
  1695             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
  1698         // Reload the loop variables for the next go-round
  1699         c = utext_current32(fText);
  1700         UTRIE_GET16(&fData->fTrie, c, category);
  1703     // If we found breaks, build a new break cache. The first and last entries must
  1704     // be the original starting and ending position.
  1705     if (foundBreakCount > 0) {
  1706         int32_t totalBreaks = foundBreakCount;
  1707         if (startPos < breaks.elementAti(0)) {
  1708             totalBreaks += 1;
  1710         if (endPos > breaks.peeki()) {
  1711             totalBreaks += 1;
  1713         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
  1714         if (fCachedBreakPositions != NULL) {
  1715             int32_t out = 0;
  1716             fNumCachedBreakPositions = totalBreaks;
  1717             if (startPos < breaks.elementAti(0)) {
  1718                 fCachedBreakPositions[out++] = startPos;
  1720             for (int32_t i = 0; i < foundBreakCount; ++i) {
  1721                 fCachedBreakPositions[out++] = breaks.elementAti(i);
  1723             if (endPos > fCachedBreakPositions[out-1]) {
  1724                 fCachedBreakPositions[out] = endPos;
  1726             // If there are breaks, then by definition, we are replacing the original
  1727             // proposed break by one of the breaks we found. Use following() and
  1728             // preceding() to do the work. They should never recurse in this case.
  1729             if (reverse) {
  1730                 return preceding(endPos);
  1732             else {
  1733                 return following(startPos);
  1736         // If the allocation failed, just fall through to the "no breaks found" case.
  1739     // If we get here, there were no language-based breaks. Set the text pointer
  1740     // to the original proposed break.
  1741     utext_setNativeIndex(fText, reverse ? startPos : endPos);
  1742     return (reverse ? startPos : endPos);
  1745 // defined in ucln_cmn.h
  1747 U_NAMESPACE_END
  1750 static icu::UStack *gLanguageBreakFactories = NULL;
  1751 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
  1753 /**
  1754  * Release all static memory held by breakiterator.  
  1755  */
  1756 U_CDECL_BEGIN
  1757 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
  1758     if (gLanguageBreakFactories) {
  1759         delete gLanguageBreakFactories;
  1760         gLanguageBreakFactories = NULL;
  1762     gLanguageBreakFactoriesInitOnce.reset();
  1763     return TRUE;
  1765 U_CDECL_END
  1767 U_CDECL_BEGIN
  1768 static void U_CALLCONV _deleteFactory(void *obj) {
  1769     delete (icu::LanguageBreakFactory *) obj;
  1771 U_CDECL_END
  1772 U_NAMESPACE_BEGIN
  1774 static void U_CALLCONV initLanguageFactories() {
  1775     UErrorCode status = U_ZERO_ERROR;
  1776     U_ASSERT(gLanguageBreakFactories == NULL);
  1777     gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
  1778     if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
  1779         ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
  1780         gLanguageBreakFactories->push(builtIn, status);
  1781 #ifdef U_LOCAL_SERVICE_HOOK
  1782         LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
  1783         if (extra != NULL) {
  1784             gLanguageBreakFactories->push(extra, status);
  1786 #endif
  1788     ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
  1792 static const LanguageBreakEngine*
  1793 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
  1795     umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
  1796     if (gLanguageBreakFactories == NULL) {
  1797         return NULL;
  1800     int32_t i = gLanguageBreakFactories->size();
  1801     const LanguageBreakEngine *lbe = NULL;
  1802     while (--i >= 0) {
  1803         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
  1804         lbe = factory->getEngineFor(c, breakType);
  1805         if (lbe != NULL) {
  1806             break;
  1809     return lbe;
  1813 //-------------------------------------------------------------------------------
  1814 //
  1815 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
  1816 //                          the character c.
  1817 //
  1818 //-------------------------------------------------------------------------------
  1819 const LanguageBreakEngine *
  1820 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
  1821     const LanguageBreakEngine *lbe = NULL;
  1822     UErrorCode status = U_ZERO_ERROR;
  1824     if (fLanguageBreakEngines == NULL) {
  1825         fLanguageBreakEngines = new UStack(status);
  1826         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
  1827             delete fLanguageBreakEngines;
  1828             fLanguageBreakEngines = 0;
  1829             return NULL;
  1833     int32_t i = fLanguageBreakEngines->size();
  1834     while (--i >= 0) {
  1835         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
  1836         if (lbe->handles(c, fBreakType)) {
  1837             return lbe;
  1841     // No existing dictionary took the character. See if a factory wants to
  1842     // give us a new LanguageBreakEngine for this character.
  1843     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
  1845     // If we got one, use it and push it on our stack.
  1846     if (lbe != NULL) {
  1847         fLanguageBreakEngines->push((void *)lbe, status);
  1848         // Even if we can't remember it, we can keep looking it up, so
  1849         // return it even if the push fails.
  1850         return lbe;
  1853     // No engine is forthcoming for this character. Add it to the
  1854     // reject set. Create the reject break engine if needed.
  1855     if (fUnhandledBreakEngine == NULL) {
  1856         fUnhandledBreakEngine = new UnhandledEngine(status);
  1857         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
  1858             status = U_MEMORY_ALLOCATION_ERROR;
  1860         // Put it last so that scripts for which we have an engine get tried
  1861         // first.
  1862         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
  1863         // If we can't insert it, or creation failed, get rid of it
  1864         if (U_FAILURE(status)) {
  1865             delete fUnhandledBreakEngine;
  1866             fUnhandledBreakEngine = 0;
  1867             return NULL;
  1871     // Tell the reject engine about the character; at its discretion, it may
  1872     // add more than just the one character.
  1873     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
  1875     return fUnhandledBreakEngine;
  1880 /*int32_t RuleBasedBreakIterator::getBreakType() const {
  1881     return fBreakType;
  1882 }*/
  1884 void RuleBasedBreakIterator::setBreakType(int32_t type) {
  1885     fBreakType = type;
  1886     reset();
  1889 U_NAMESPACE_END
  1891 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial