intl/icu/source/common/rbbi.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/rbbi.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1891 @@
     1.4 +/*
     1.5 +***************************************************************************
     1.6 +*   Copyright (C) 1999-2013 International Business Machines Corporation
     1.7 +*   and others. All rights reserved.
     1.8 +***************************************************************************
     1.9 +*/
    1.10 +//
    1.11 +//  file:  rbbi.cpp  Contains the implementation of the rule based break iterator
    1.12 +//                   runtime engine and the API implementation for
    1.13 +//                   class RuleBasedBreakIterator
    1.14 +//
    1.15 +
    1.16 +#include "utypeinfo.h"  // for 'typeid' to work
    1.17 +
    1.18 +#include "unicode/utypes.h"
    1.19 +
    1.20 +#if !UCONFIG_NO_BREAK_ITERATION
    1.21 +
    1.22 +#include "unicode/rbbi.h"
    1.23 +#include "unicode/schriter.h"
    1.24 +#include "unicode/uchriter.h"
    1.25 +#include "unicode/udata.h"
    1.26 +#include "unicode/uclean.h"
    1.27 +#include "rbbidata.h"
    1.28 +#include "rbbirb.h"
    1.29 +#include "cmemory.h"
    1.30 +#include "cstring.h"
    1.31 +#include "umutex.h"
    1.32 +#include "ucln_cmn.h"
    1.33 +#include "brkeng.h"
    1.34 +
    1.35 +#include "uassert.h"
    1.36 +#include "uvector.h"
    1.37 +
    1.38 +// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
    1.39 +#if U_LOCAL_SERVICE_HOOK
    1.40 +#include "localsvc.h"
    1.41 +#endif
    1.42 +
    1.43 +#ifdef RBBI_DEBUG
    1.44 +static UBool fTrace = FALSE;   // debug-only switch; init() turns it on when U_RBBIDEBUG contains "trace"
    1.45 +#endif
    1.46 +
    1.47 +U_NAMESPACE_BEGIN
    1.48 +
    1.49 +// State number in which the state machine starts.
    1.50 +#define START_STATE 1
    1.51 +
    1.52 +// State-transition value that means "stop".
    1.53 +#define STOP_STATE  0
    1.54 +
    1.55 +// NOTE(review): presumably expands to the class-ID / RTTI boilerplate for this class -- confirm in uobject.h.
    1.56 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
    1.57 +
    1.58 +
    1.59 +//=======================================================================
    1.60 +// constructors
    1.61 +//=======================================================================
    1.62 +
    1.63 +/**
    1.64 + * Constructs a RuleBasedBreakIterator around the already-built rule data header "data",
    1.65 + * wrapping it in a new RBBIDataWrapper.  "status" reports a wrapper or allocation failure.
    1.66 + */
    1.67 +RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
    1.68 +{
    1.69 +    init();                                    // put every member into its default state first
    1.70 +    fData = new RBBIDataWrapper(data, status); // status checked in constructor
    1.71 +    if (U_FAILURE(status)) {return;}           // status is a failure code: nothing more to do
    1.72 +    if(fData == 0) {                           // the allocation itself failed
    1.73 +        status = U_MEMORY_ALLOCATION_ERROR;
    1.74 +        return;
    1.75 +    }
    1.76 +}
    1.77 +
    1.78 +/**
    1.79 + * Constructs from an already-built rule data header without adopting its memory (RBBIDataWrapper::kDontAdopt); "status" reports failure.
    1.80 + */
    1.81 +RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
    1.82 +{
    1.83 +    init();                                    // put every member into its default state first
    1.84 +    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
    1.85 +    if (U_FAILURE(status)) {return;}           // status is a failure code: nothing more to do
    1.86 +    if(fData == 0) {                           // the allocation itself failed
    1.87 +        status = U_MEMORY_ALLOCATION_ERROR;
    1.88 +        return;
    1.89 +    }
    1.90 +}
    1.91 +
    1.92 +
    1.93 +//
    1.94 +//  Construct from precompiled binary rules (tables).  This constructor is public API,
    1.95 +//  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
    1.96 +//  Sets U_ILLEGAL_ARGUMENT_ERROR for a NULL or too-short buffer; the buffer is wrapped with kDontAdopt.
    1.97 +RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
    1.98 +                       uint32_t       ruleLength,
    1.99 +                       UErrorCode     &status) {
   1.100 +    init();
   1.101 +    if (U_FAILURE(status)) {
   1.102 +        return;
   1.103 +    }
   1.104 +    if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {   // must hold at least a complete header
   1.105 +        status = U_ILLEGAL_ARGUMENT_ERROR;
   1.106 +        return;
   1.107 +    }
   1.108 +    const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;   // view the raw bytes as a data header
   1.109 +    if (data->fLength > ruleLength) {                                     // header claims more data than was supplied
   1.110 +        status = U_ILLEGAL_ARGUMENT_ERROR;
   1.111 +        return;
   1.112 +    }
   1.113 +    fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 
   1.114 +    if (U_FAILURE(status)) {return;}
   1.115 +    if(fData == 0) {                                                      // the allocation itself failed
   1.116 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.117 +        return;
   1.118 +    }
   1.119 +}    
   1.120 +
   1.121 +
   1.122 +//-------------------------------------------------------------------------------
   1.123 +//
   1.124 +//   Constructor   from a UDataMemory handle to precompiled break rules
   1.125 +//                 stored in an ICU data file.
   1.126 +//                 "status" receives any wrapper error, or U_MEMORY_ALLOCATION_ERROR.
   1.127 +//-------------------------------------------------------------------------------
   1.128 +RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
   1.129 +{
   1.130 +    init();                                   // put every member into its default state first
   1.131 +    fData = new RBBIDataWrapper(udm, status); // status checked in constructor
   1.132 +    if (U_FAILURE(status)) {return;}
   1.133 +    if(fData == 0) {                          // the allocation itself failed
   1.134 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.135 +        return;
   1.136 +    }
   1.137 +}
   1.138 +
   1.139 +
   1.140 +
   1.141 +//-------------------------------------------------------------------------------
   1.142 +//
   1.143 +//   Constructor       from a set of rules supplied as a string.
   1.144 +//                     Errors are reported through "status"; "parseError" is handed to the rule builder.
   1.145 +//-------------------------------------------------------------------------------
   1.146 +RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
   1.147 +                                                UParseError          &parseError,
   1.148 +                                                UErrorCode           &status)
   1.149 +{
   1.150 +    init();
   1.151 +    if (U_FAILURE(status)) {return;}   // leave the default-initialized shell untouched
   1.152 +    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
   1.153 +        RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
   1.154 +    // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
   1.155 +    //        creates and returns a complete RBBI.  From here, in a constructor, we
   1.156 +    //        can't just return the object created by the builder factory, hence
   1.157 +    //        the assignment of the factory created object to "this".
   1.158 +    if (U_SUCCESS(status)) {
   1.159 +        *this = *bi;                   // operator= clones the text and takes a reference on the rule data
   1.160 +        delete bi;                     // the factory-built temporary is no longer needed
   1.161 +    }
   1.162 +}
   1.163 +
   1.164 +
   1.165 +//-------------------------------------------------------------------------------
   1.166 +//
   1.167 +// Default Constructor.      Create an empty shell that can be set up later.
   1.168 +//                           Used when creating a RuleBasedBreakIterator from a set
   1.169 +//                           of rules.
   1.170 +//-------------------------------------------------------------------------------
   1.171 +RuleBasedBreakIterator::RuleBasedBreakIterator() {
   1.172 +    init();   // fData stays NULL here; no rule data is attached by this constructor
   1.173 +}
   1.174 +
   1.175 +
   1.176 +//-------------------------------------------------------------------------------
   1.177 +//
   1.178 +//   Copy constructor.  Will produce a break iterator with the same behavior,
   1.179 +//                      and which iterates over the same text, as the one passed in.
   1.180 +//                      Implemented as init() followed by operator=.
   1.181 +//-------------------------------------------------------------------------------
   1.182 +RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
   1.183 +: BreakIterator(other)
   1.184 +{
   1.185 +    this->init();     // start from the default state so operator= sees initialized members
   1.186 +    *this = other;    // the assignment operator performs the actual copy
   1.187 +}
   1.188 +
   1.189 +
   1.190 +/**
   1.191 + * Destructor.  Frees the character iterators, closes fText, drops the reference on fData, and frees the break cache and break engines.
   1.192 + */
   1.193 +RuleBasedBreakIterator::~RuleBasedBreakIterator() {
   1.194 +    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   1.195 +        // fCharIter was adopted from the outside.
   1.196 +        delete fCharIter;
   1.197 +    }
   1.198 +    fCharIter = NULL;
   1.199 +    delete fSCharIter;
   1.200 +    fSCharIter = NULL;   // clear the pointer that was just deleted
   1.201 +    delete fDCharIter;
   1.202 +    fDCharIter = NULL;
   1.203 +    
   1.204 +    utext_close(fText);
   1.205 +
   1.206 +    if (fData != NULL) {
   1.207 +        fData->removeReference();   // give up this iterator's reference on the rule data
   1.208 +        fData = NULL;
   1.209 +    }
   1.210 +    if (fCachedBreakPositions) {
   1.211 +        uprv_free(fCachedBreakPositions);
   1.212 +        fCachedBreakPositions = NULL;
   1.213 +    }
   1.214 +    if (fLanguageBreakEngines) {
   1.215 +        delete fLanguageBreakEngines;
   1.216 +        fLanguageBreakEngines = NULL;
   1.217 +    }
   1.218 +    if (fUnhandledBreakEngine) {
   1.219 +        delete fUnhandledBreakEngine;
   1.220 +        fUnhandledBreakEngine = NULL;
   1.221 +    }
   1.222 +}
   1.223 +
   1.224 +/**
   1.225 + * Assignment operator.  Sets this iterator to have the same behavior,
   1.226 + * and iterate over the same text, as the one passed in.
   1.227 + */
   1.228 +RuleBasedBreakIterator&
   1.229 +RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
   1.230 +    if (this == &that) {                // self-assignment: nothing to do
   1.231 +        return *this;
   1.232 +    }
   1.233 +    reset();    // Delete break cache information
   1.234 +    fBreakType = that.fBreakType;
   1.235 +    if (fLanguageBreakEngines != NULL) {
   1.236 +        delete fLanguageBreakEngines;
   1.237 +        fLanguageBreakEngines = NULL;   // Just rebuild for now
   1.238 +    }
   1.239 +    // TODO: clone fLanguageBreakEngines from "that"
   1.240 +    UErrorCode status = U_ZERO_ERROR;   // local only; the result of the clone below is not examined
   1.241 +    fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
   1.242 +
   1.243 +    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {   // only an adopted iterator is deleted here
   1.244 +        delete fCharIter;
   1.245 +    }
   1.246 +    fCharIter = NULL;
   1.247 +
   1.248 +    if (that.fCharIter != NULL ) {
   1.249 +        // This is a little bit tricky - it will initially appear that
   1.250 +        //  this->fCharIter is adopted, even if that->fCharIter was
   1.251 +        //  not adopted.  That's ok.
   1.252 +        fCharIter = that.fCharIter->clone();
   1.253 +    }
   1.254 +
   1.255 +    if (fData != NULL) {
   1.256 +        fData->removeReference();       // release the rule data held so far
   1.257 +        fData = NULL;
   1.258 +    }
   1.259 +    if (that.fData != NULL) {
   1.260 +        fData = that.fData->addReference();   // take a reference on the other iterator's rule data
   1.261 +    }
   1.262 +
   1.263 +    return *this;
   1.264 +}
   1.265 +
   1.266 +
   1.267 +
   1.268 +//-----------------------------------------------------------------------------
   1.269 +//
   1.270 +//    init()      Shared initialization routine.   Used by all the constructors.
   1.271 +//                Initializes all fields, leaving the object in a consistent state.
   1.272 +//                In RBBI_DEBUG builds it also reads the U_RBBIDEBUG environment variable once.
   1.273 +//-----------------------------------------------------------------------------
   1.274 +void RuleBasedBreakIterator::init() {
   1.275 +    UErrorCode  status    = U_ZERO_ERROR;   // local only; not examined after the call below
   1.276 +    fText                 = utext_openUChars(NULL, NULL, 0, &status);   // UText over zero-length text
   1.277 +    fCharIter             = NULL;
   1.278 +    fSCharIter            = NULL;
   1.279 +    fDCharIter            = NULL;
   1.280 +    fData                 = NULL;
   1.281 +    fLastRuleStatusIndex  = 0;
   1.282 +    fLastStatusIndexValid = TRUE;
   1.283 +    fDictionaryCharCount  = 0;
   1.284 +    fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
   1.285 +                                        //   dictionary behavior for Break Iterators that are
   1.286 +                                        //   built from rules.  Even better would be the ability to
   1.287 +                                        //   declare the type in the rules.
   1.288 +
   1.289 +    fCachedBreakPositions    = NULL;
   1.290 +    fLanguageBreakEngines    = NULL;
   1.291 +    fUnhandledBreakEngine    = NULL;
   1.292 +    fNumCachedBreakPositions = 0;
   1.293 +    fPositionInCache         = 0;
   1.294 +
   1.295 +#ifdef RBBI_DEBUG
   1.296 +    static UBool debugInitDone = FALSE;   // guards the environment lookup so it runs only once
   1.297 +    if (debugInitDone == FALSE) {
   1.298 +        char *debugEnv = getenv("U_RBBIDEBUG");
   1.299 +        if (debugEnv && uprv_strstr(debugEnv, "trace")) {   // enable tracing when the value contains "trace"
   1.300 +            fTrace = TRUE;
   1.301 +        }
   1.302 +        debugInitDone = TRUE;
   1.303 +    }
   1.304 +#endif
   1.305 +}
   1.306 +
   1.307 +
   1.308 +
   1.309 +//-----------------------------------------------------------------------------
   1.310 +//
   1.311 +//    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
   1.312 +//            behavior, and iterating over the same text, as this one.
   1.313 +//            Virtual function: does the right thing with subclasses.
   1.314 +//            The copy is allocated with new, via the copy constructor.
   1.315 +//-----------------------------------------------------------------------------
   1.316 +BreakIterator*
   1.317 +RuleBasedBreakIterator::clone(void) const {
   1.318 +    return new RuleBasedBreakIterator(*this);
   1.319 +}
   1.320 +
   1.321 +/**
   1.322 + * Equality operator.  Returns TRUE if both BreakIterators are of the
   1.323 + * same class, have the same behavior, and iterate over the same text.
   1.324 + */
   1.325 +UBool
   1.326 +RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
   1.327 +    if (typeid(*this) != typeid(that)) {   // different dynamic types never compare equal
   1.328 +        return FALSE;
   1.329 +    }
   1.330 +
   1.331 +    const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;   // safe after the typeid check
   1.332 +
   1.333 +    if (!utext_equals(fText, that2.fText)) {
   1.334 +        // The two break iterators are operating on different text,
   1.335 +        //   or have a different iteration position.
   1.336 +        return FALSE;
   1.337 +    };
   1.338 +
   1.339 +    // TODO:  need a check for when in a dictionary region at different offsets.
   1.340 +
   1.341 +    if (that2.fData == fData ||
   1.342 +        (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
   1.343 +            // The two break iterators are using the same rules.
   1.344 +            return TRUE;
   1.345 +        }
   1.346 +    return FALSE;
   1.347 +}
   1.348 +
   1.349 +/**
   1.350 + * Hash code for this BreakIterator, taken from its rule data.
   1.351 + * @return The rule data's hash code, or 0 when no rule data is attached.
   1.352 + */
   1.353 +int32_t
   1.354 +RuleBasedBreakIterator::hashCode(void) const {
   1.355 +    // Without rule data there is nothing to hash.
   1.356 +    if (fData == NULL) {
   1.357 +        return 0;
   1.358 +    }
   1.359 +    return fData->hashCode();
   1.360 +}
   1.361 +
   1.362 +// Sets the text to iterate over from a UText and moves to the first position; does nothing if "status" is already a failure.
   1.363 +void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
   1.364 +    if (U_FAILURE(status)) {
   1.365 +        return;
   1.366 +    }
   1.367 +    reset();
   1.368 +    fText = utext_clone(fText, ut, FALSE, TRUE, &status);   // a failure here is left in "status"; execution continues
   1.369 +
   1.370 +    // Set up a dummy CharacterIterator to be returned if anyone
   1.371 +    //   calls getText().  With input from UText, there is no reasonable
   1.372 +    //   way to return a characterIterator over the actual input text.
   1.373 +    //   Return one over an empty string instead - this is the closest
   1.374 +    //   we can come to signaling a failure.
   1.375 +    //   (GetText() is obsolete, this failure is sort of OK)
   1.376 +    if (fDCharIter == NULL) {
   1.377 +        static const UChar c = 0;
   1.378 +        fDCharIter = new UCharCharacterIterator(&c, 0);   // zero-length iterator, created once and then reused
   1.379 +        if (fDCharIter == NULL) {
   1.380 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.381 +            return;
   1.382 +        }
   1.383 +    }
   1.384 +
   1.385 +    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   1.386 +        // existing fCharIter was adopted from the outside.  Delete it now.
   1.387 +        delete fCharIter;
   1.388 +    }
   1.389 +    fCharIter = fDCharIter;
   1.390 +
   1.391 +    this->first();
   1.392 +}
   1.393 +
   1.394 +// Returns a clone of the iterator's UText, made with utext_clone() into "fillIn"; errors are reported through "status".
   1.395 +UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
   1.396 +    UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);  
   1.397 +    return result;
   1.398 +}
   1.399 +
   1.400 +
   1.401 +
   1.402 +/**
   1.403 + * Returns the description used to create this iterator, or a shared empty string when no rule data is attached.
   1.404 + */
   1.405 +const UnicodeString&
   1.406 +RuleBasedBreakIterator::getRules() const {
   1.407 +    if (fData != NULL) {
   1.408 +        return fData->getRuleSourceString();
   1.409 +    } else {
   1.410 +        static const UnicodeString *s;   // allocated on first use below and never freed
   1.411 +        if (s == NULL) {
   1.412 +            // TODO:  something more elegant here.
   1.413 +            //        perhaps API should return the string by value.
   1.414 +            //        Note:  thread unsafe init & leak are semi-ok, better than
   1.415 +            //               what was before.  Should be cleaned up, though.
   1.416 +            s = new UnicodeString;
   1.417 +        }
   1.418 +        return *s;
   1.419 +    }
   1.420 +}
   1.421 +
   1.422 +//=======================================================================
   1.423 +// BreakIterator overrides
   1.424 +//=======================================================================
   1.425 +
   1.426 +/**
   1.427 + * Return a CharacterIterator over the text being analyzed.  fCharIter is dereferenced unchecked; init() leaves it NULL until text is set.
   1.428 + */
   1.429 +CharacterIterator&
   1.430 +RuleBasedBreakIterator::getText() const {
   1.431 +    return *fCharIter;
   1.432 +}
   1.433 +
   1.434 +/**
   1.435 + * Set the iterator to analyze a new piece of text.  This function resets
   1.436 + * the current iteration position to the beginning of the text.
   1.437 + * @param newText An iterator over the text to analyze; stored in fCharIter.  NULL or a nonzero startIndex() yields empty text.
   1.438 + */
   1.439 +void
   1.440 +RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
   1.441 +    // If we are holding a CharacterIterator adopted from a
   1.442 +    //   previous call to this function, delete it now.
   1.443 +    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   1.444 +        delete fCharIter;
   1.445 +    }
   1.446 +
   1.447 +    fCharIter = newText;
   1.448 +    UErrorCode status = U_ZERO_ERROR;   // local only; this function has no way to report it
   1.449 +    reset();
   1.450 +    if (newText==NULL || newText->startIndex() != 0) {   
   1.451 +        // startIndex !=0 wants to be an error, but there's no way to report it.
   1.452 +        // Make the iterator text be an empty string.
   1.453 +        fText = utext_openUChars(fText, NULL, 0, &status);
   1.454 +    } else {
   1.455 +        fText = utext_openCharacterIterator(fText, newText, &status);
   1.456 +    }
   1.457 +    this->first();
   1.458 +}
   1.459 +
   1.460 +/**
   1.461 + * Set the iterator to analyze a new piece of text.  This function resets
   1.462 + * the current iteration position to the beginning of the text.
   1.463 + * @param newText The string to analyze; fText is opened over it by pointer (&newText).
   1.464 + */
   1.465 +void
   1.466 +RuleBasedBreakIterator::setText(const UnicodeString& newText) {
   1.467 +    UErrorCode status = U_ZERO_ERROR;   // local only; this function has no way to report it
   1.468 +    reset();
   1.469 +    fText = utext_openConstUnicodeString(fText, &newText, &status);
   1.470 +
   1.471 +    // Set up a character iterator on the string.
   1.472 +    //   Needed in case someone calls getText().
   1.473 +    //  Can not, unfortunately, do this lazily on the (probably never)
   1.474 +    //  call to getText(), because getText is const.
   1.475 +    if (fSCharIter == NULL) {
   1.476 +        fSCharIter = new StringCharacterIterator(newText);   // NOTE(review): result is not checked for NULL
   1.477 +    } else {
   1.478 +        fSCharIter->setText(newText);                        // reuse the iterator from an earlier call
   1.479 +    }
   1.480 +
   1.481 +    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
   1.482 +        // old fCharIter was adopted from the outside.  Delete it.
   1.483 +        delete fCharIter;
   1.484 +    }
   1.485 +    fCharIter = fSCharIter;
   1.486 +
   1.487 +    this->first();
   1.488 +}
   1.489 +
   1.490 +
   1.491 +/**
   1.492 + *  Provide a new UText for the input text.  Must reference text with contents identical
   1.493 + *  to the original.  The current native index is carried over to the new UText.
   1.494 + *  Intended for use with text data originating in Java (garbage collected) environments
   1.495 + *  where the data may be moved in memory at arbitrary times.
   1.496 + */
   1.497 +RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
   1.498 +    if (U_FAILURE(status)) {
   1.499 +        return *this;
   1.500 +    }
   1.501 +    if (input == NULL) {
   1.502 +        status = U_ILLEGAL_ARGUMENT_ERROR;
   1.503 +        return *this;
   1.504 +    }
   1.505 +    int64_t pos = utext_getNativeIndex(fText);   // remember the position before fText is replaced
   1.506 +    //  Shallow read-only clone of the new UText into the existing input UText
   1.507 +    fText = utext_clone(fText, input, FALSE, TRUE, &status);
   1.508 +    if (U_FAILURE(status)) {
   1.509 +        return *this;
   1.510 +    }
   1.511 +    utext_setNativeIndex(fText, pos);            // restore the remembered position in the new text
   1.512 +    if (utext_getNativeIndex(fText) != pos) {
   1.513 +        // Sanity check.  The new input utext is supposed to have the exact same
   1.514 +        // contents as the old.  If we can't set to the same position, it doesn't.
   1.515 +        // The contents underlying the old utext might be invalid at this point,
   1.516 +        // so it's not safe to check directly.
   1.517 +        status = U_ILLEGAL_ARGUMENT_ERROR;
   1.518 +    }
   1.519 +    return *this;
   1.520 +}
   1.521 +
   1.522 +
   1.523 +/**
   1.524 + * Sets the current iteration position to the beginning of the text.
   1.525 + * @return The offset of the beginning of the text (always 0).
   1.526 + */
   1.527 +int32_t RuleBasedBreakIterator::first(void) {
   1.528 +    reset();
   1.529 +    fLastRuleStatusIndex  = 0;
   1.530 +    fLastStatusIndexValid = TRUE;
   1.531 +    //if (fText == NULL)
   1.532 +    //    return BreakIterator::DONE;
   1.533 +
   1.534 +    utext_setNativeIndex(fText, 0);   // move the underlying UText to offset 0
   1.535 +    return 0;
   1.536 +}
   1.537 +
   1.538 +/**
   1.539 + * Sets the current iteration position to the end of the text.
   1.540 + * @return The text's past-the-end offset, or BreakIterator::DONE when fText is NULL.
   1.541 + */
   1.542 +int32_t RuleBasedBreakIterator::last(void) {
   1.543 +    reset();
   1.544 +    if (fText == NULL) {
   1.545 +        fLastRuleStatusIndex  = 0;
   1.546 +        fLastStatusIndexValid = TRUE;
   1.547 +        return BreakIterator::DONE;
   1.548 +    }
   1.549 +
   1.550 +    fLastStatusIndexValid = FALSE;                      // the rule status index is marked not valid here
   1.551 +    int32_t pos = (int32_t)utext_nativeLength(fText);   // native length narrowed to 32 bits
   1.552 +    utext_setNativeIndex(fText, pos);
   1.553 +    return pos;
   1.554 +}
   1.555 +
   1.556 +/**
   1.557 + * Moves the iterator by the given number of boundaries, in either direction.
   1.558 + * A positive count steps forward with next(); a negative count steps backward
   1.559 + * with previous(); a count of zero leaves the iterator where it is.
   1.560 + * @param n The number of steps to move.  The sign indicates the direction
   1.561 + * (negative is backwards, and positive is forwards).
   1.562 + * @return The offset returned by the last step taken, or the current
   1.563 + * position when n is zero.
   1.564 + */
   1.565 +int32_t RuleBasedBreakIterator::next(int32_t n) {
   1.566 +    int32_t boundary = current();
   1.567 +    // Positive n: step forward n times.
   1.568 +    for (; n > 0; --n) {
   1.569 +        boundary = next();
   1.570 +    }
   1.571 +    // Negative n: step backward -n times.
   1.572 +    for (; n < 0; ++n) {
   1.573 +        boundary = previous();
   1.574 +    }
   1.575 +    return boundary;
   1.576 +}
   1.577 +
   1.578 +/**
   1.579 + * Advances the iterator to the next boundary position.
   1.580 + * @return The position of the first boundary after this one.
   1.581 + */
   1.582 +int32_t RuleBasedBreakIterator::next(void) {
   1.583 +    // if we have cached break positions and we're still in the range
   1.584 +    // covered by them, just move one step forward in the cache
   1.585 +    if (fCachedBreakPositions != NULL) {
   1.586 +        if (fPositionInCache < fNumCachedBreakPositions - 1) {
   1.587 +            ++fPositionInCache;
   1.588 +            int32_t pos = fCachedBreakPositions[fPositionInCache];
   1.589 +            utext_setNativeIndex(fText, pos);
   1.590 +            return pos;
   1.591 +        }
   1.592 +        else {
   1.593 +            reset();   // already at the last cached position: call reset() and fall through to the state machine
   1.594 +        }
   1.595 +    }
   1.596 +
   1.597 +    int32_t startPos = current();
   1.598 +    int32_t result = handleNext(fData->fForwardTable);   // fData is dereferenced without a NULL check
   1.599 +    if (fDictionaryCharCount > 0) {
   1.600 +        result = checkDictionary(startPos, result, FALSE);   // result may be replaced by checkDictionary()
   1.601 +    }
   1.602 +    return result;
   1.603 +}
   1.604 +
   1.605 +/**
   1.606 + * Advances the iterator backwards, to the last boundary preceding this one.
   1.607 + * @return The position of the last boundary position preceding this one, or BreakIterator::DONE when already at offset 0.
   1.608 + */
   1.609 +int32_t RuleBasedBreakIterator::previous(void) {
   1.610 +    int32_t result;
   1.611 +    int32_t startPos;
   1.612 +
   1.613 +    // if we have cached break positions and we're still in the range
   1.614 +    // covered by them, just move one step backward in the cache
   1.615 +    if (fCachedBreakPositions != NULL) {
   1.616 +        if (fPositionInCache > 0) {
   1.617 +            --fPositionInCache;
   1.618 +            // If we're at the beginning of the cache, need to reevaluate the
   1.619 +            // rule status
   1.620 +            if (fPositionInCache <= 0) {
   1.621 +                fLastStatusIndexValid = FALSE;
   1.622 +            }
   1.623 +            int32_t pos = fCachedBreakPositions[fPositionInCache];
   1.624 +            utext_setNativeIndex(fText, pos);
   1.625 +            return pos;
   1.626 +        }
   1.627 +        else {
   1.628 +            reset();   // already at the first cached position: call reset() and fall through
   1.629 +        }
   1.630 +    }
   1.631 +
   1.632 +    // if we're already sitting at the beginning of the text, return DONE
   1.633 +    if (fText == NULL || (startPos = current()) == 0) {   // startPos is assigned as a side effect of the test
   1.634 +        fLastRuleStatusIndex  = 0;
   1.635 +        fLastStatusIndexValid = TRUE;
   1.636 +        return BreakIterator::DONE;
   1.637 +    }
   1.638 +
   1.639 +    if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {   // a safe table is present: use the reverse table directly
   1.640 +        result = handlePrevious(fData->fReverseTable);
   1.641 +        if (fDictionaryCharCount > 0) {
   1.642 +            result = checkDictionary(result, startPos, TRUE);
   1.643 +        }
   1.644 +        return result;
   1.645 +    }
   1.646 +
   1.647 +    // old rule syntax
   1.648 +    // set things up.  handlePrevious() will back us up to some valid
   1.649 +    // break position before the current position (we back our internal
   1.650 +    // iterator up one step to prevent handlePrevious() from returning
   1.651 +    // the current position), but not necessarily the last one before
   1.652 +
   1.653 +    // where we started
   1.654 +
   1.655 +    int32_t start = current();   // another current() call; startPos above was read the same way
   1.656 +
   1.657 +    (void)UTEXT_PREVIOUS32(fText);
   1.658 +    int32_t lastResult    = handlePrevious(fData->fReverseTable);
   1.659 +    if (lastResult == UBRK_DONE) {   // no earlier boundary found: fall back to offset 0
   1.660 +        lastResult = 0;
   1.661 +        utext_setNativeIndex(fText, 0);
   1.662 +    }
   1.663 +    result = lastResult;
   1.664 +    int32_t lastTag       = 0;
   1.665 +    UBool   breakTagValid = FALSE;
   1.666 +
   1.667 +    // iterate forward from the known break position until we pass our
   1.668 +    // starting point.  The last break position before the starting
   1.669 +    // point is our return value
   1.670 +
   1.671 +    for (;;) {
   1.672 +        result         = next();
   1.673 +        if (result == BreakIterator::DONE || result >= start) {
   1.674 +            break;
   1.675 +        }
   1.676 +        lastResult     = result;
   1.677 +        lastTag        = fLastRuleStatusIndex;
   1.678 +        breakTagValid  = TRUE;
   1.679 +    }
   1.680 +
   1.681 +    // fLastRuleStatusIndex wants to have the value for section of text preceding
   1.682 +    // the result position that we are to return (in lastResult.)  If
   1.683 +    // the backwards rules overshot and the above loop had to do two or more
   1.684 +    // next()s to move up to the desired return position, we will have a valid
   1.685 +    // tag value. But, if handlePrevious() took us to exactly the correct result position,
   1.686 +    // we won't have a tag value for that position, which is only set by handleNext().
   1.687 +
   1.688 +    // set the current iteration position to be the last break position
   1.689 +    // before where we started, and then return that value
   1.690 +    utext_setNativeIndex(fText, lastResult);
   1.691 +    fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
   1.692 +    fLastStatusIndexValid = breakTagValid;
   1.693 +
   1.694 +    // No need to check the dictionary; it will have been handled by
   1.695 +    // next()
   1.696 +
   1.697 +    return lastResult;
   1.698 +}
   1.699 +
   1.700 +/**
   1.701 + * Sets the iterator to refer to the first boundary position following
   1.702 + * the specified position.
   1.703 + * @offset The position from which to begin searching for a break position.
   1.704 + * @return The position of the first break after the current position.
   1.705 + */
   1.706 +int32_t RuleBasedBreakIterator::following(int32_t offset) {
   1.707 +    // if we have cached break positions and offset is in the range
   1.708 +    // covered by them, use them
   1.709 +    // TODO: could use binary search
   1.710 +    // TODO: what if offset is outside range, but break is not?
   1.711 +    if (fCachedBreakPositions != NULL) {
   1.712 +        if (offset >= fCachedBreakPositions[0]
   1.713 +                && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
   1.714 +            fPositionInCache = 0;
   1.715 +            // We are guaranteed not to leave the array due to range test above
   1.716 +            while (offset >= fCachedBreakPositions[fPositionInCache]) {
   1.717 +                ++fPositionInCache;
   1.718 +            }
   1.719 +            int32_t pos = fCachedBreakPositions[fPositionInCache];
   1.720 +            utext_setNativeIndex(fText, pos);
   1.721 +            return pos;
   1.722 +        }
   1.723 +        else {
   1.724 +            reset();
   1.725 +        }
   1.726 +    }
   1.727 +
   1.728 +    // if the offset passed in is already past the end of the text,
   1.729 +    // just return DONE; if it's before the beginning, return the
   1.730 +    // text's starting offset
   1.731 +    fLastRuleStatusIndex  = 0;
   1.732 +    fLastStatusIndexValid = TRUE;
   1.733 +    if (fText == NULL || offset >= utext_nativeLength(fText)) {
   1.734 +        last();
   1.735 +        return next();
   1.736 +    }
   1.737 +    else if (offset < 0) {
   1.738 +        return first();
   1.739 +    }
   1.740 +
   1.741 +    // otherwise, set our internal iteration position (temporarily)
   1.742 +    // to the position passed in.  If this is the _beginning_ position,
   1.743 +    // then we can just use next() to get our return value
   1.744 +
   1.745 +    int32_t result = 0;
   1.746 +
   1.747 +    if (fData->fSafeRevTable != NULL) {
   1.748 +        // new rule syntax
   1.749 +        utext_setNativeIndex(fText, offset);
   1.750 +        // move forward one codepoint to prepare for moving back to a
   1.751 +        // safe point.
   1.752 +        // this handles offset being between a supplementary character
   1.753 +        (void)UTEXT_NEXT32(fText);
   1.754 +        // handlePrevious will move most of the time to < 1 boundary away
   1.755 +        handlePrevious(fData->fSafeRevTable);
   1.756 +        int32_t result = next();
   1.757 +        while (result <= offset) {
   1.758 +            result = next();
   1.759 +        }
   1.760 +        return result;
   1.761 +    }
   1.762 +    if (fData->fSafeFwdTable != NULL) {
   1.763 +        // backup plan if forward safe table is not available
   1.764 +        utext_setNativeIndex(fText, offset);
   1.765 +        (void)UTEXT_PREVIOUS32(fText);
   1.766 +        // handle next will give result >= offset
   1.767 +        handleNext(fData->fSafeFwdTable);
   1.768 +        // previous will give result 0 or 1 boundary away from offset,
   1.769 +        // most of the time
   1.770 +        // we have to
   1.771 +        int32_t oldresult = previous();
   1.772 +        while (oldresult > offset) {
   1.773 +            int32_t result = previous();
   1.774 +            if (result <= offset) {
   1.775 +                return oldresult;
   1.776 +            }
   1.777 +            oldresult = result;
   1.778 +        }
   1.779 +        int32_t result = next();
   1.780 +        if (result <= offset) {
   1.781 +            return next();
   1.782 +        }
   1.783 +        return result;
   1.784 +    }
   1.785 +    // otherwise, we have to sync up first.  Use handlePrevious() to back
   1.786 +    // up to a known break position before the specified position (if
   1.787 +    // we can determine that the specified position is a break position,
   1.788 +    // we don't back up at all).  This may or may not be the last break
   1.789 +    // position at or before our starting position.  Advance forward
   1.790 +    // from here until we've passed the starting position.  The position
   1.791 +    // we stop on will be the first break position after the specified one.
   1.792 +    // old rule syntax
   1.793 +
   1.794 +    utext_setNativeIndex(fText, offset);
   1.795 +    if (offset==0 || 
   1.796 +        (offset==1  && utext_getNativeIndex(fText)==0)) {
   1.797 +        return next();
   1.798 +    }
   1.799 +    result = previous();
   1.800 +
   1.801 +    while (result != BreakIterator::DONE && result <= offset) {
   1.802 +        result = next();
   1.803 +    }
   1.804 +
   1.805 +    return result;
   1.806 +}
   1.807 +
   1.808 +/**
   1.809 + * Sets the iterator to refer to the last boundary position before the
   1.810 + * specified position.  With no text, or an offset past the end of the text, returns last(); a negative offset returns first().
   1.811 + * @param offset The position to begin searching for a break from.
   1.812 + * @return The position of the last boundary before the starting position.
   1.813 + */
   1.814 +int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
   1.815 +    // If we have cached break positions and offset lies in the range
   1.816 +    // (first cached position, last cached position], answer from the cache with a linear scan.
   1.817 +    if (fCachedBreakPositions != NULL) {
   1.818 +        // TODO: binary search?
   1.819 +        // TODO: What if offset is outside range, but break is not?
   1.820 +        if (offset > fCachedBreakPositions[0]
   1.821 +                && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
   1.822 +            fPositionInCache = 0;
   1.823 +            while (fPositionInCache < fNumCachedBreakPositions
   1.824 +                   && offset > fCachedBreakPositions[fPositionInCache])
   1.825 +                ++fPositionInCache;
   1.826 +            --fPositionInCache;   // step back to the last cached position strictly before offset
   1.827 +            // If we landed on the first cached position, flag the rule status as
   1.828 +            // not valid so that it is re-evaluated when it is next needed.
   1.829 +            if (fPositionInCache <= 0) {
   1.830 +                fLastStatusIndexValid = FALSE;
   1.831 +            }
   1.832 +            utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
   1.833 +            return fCachedBreakPositions[fPositionInCache];
   1.834 +        }
   1.835 +        else {
   1.836 +            reset();   // offset is outside the cached range: discard the cache and fall through
   1.837 +        }
   1.838 +    }
   1.839 +
   1.840 +    // If there is no text, or the offset passed in is past the end of the text,
   1.841 +    // return last() (not DONE); if the offset is before the beginning,
   1.842 +    // return first().
   1.843 +    if (fText == NULL || offset > utext_nativeLength(fText)) {
   1.844 +        // return BreakIterator::DONE;
   1.845 +        return last();
   1.846 +    }
   1.847 +    else if (offset < 0) {
   1.848 +        return first();
   1.849 +    }
   1.850 +
   1.851 +    // if we start by updating the current iteration position to the
   1.852 +    // position specified by the caller, we can just use previous()
   1.853 +    // to carry out this operation
   1.854 +
   1.855 +    if (fData->fSafeFwdTable != NULL) {
   1.856 +        // new rule syntax (a safe forward table is available)
   1.857 +        utext_setNativeIndex(fText, offset);
   1.858 +        int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1.859 +        if (newOffset != offset) {
   1.860 +            // Will come here if specified offset was not a code point boundary AND
   1.861 +            //   the underlying implementation is using UText, which snaps any non-code-point-boundary
   1.862 +            //   indices to the containing code point.
   1.863 +            // For BreakIterator::preceding only, these non-code-point indices need to be moved
   1.864 +            //   up to refer to the following codepoint.
   1.865 +            (void)UTEXT_NEXT32(fText);
   1.866 +            offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1.867 +        }
   1.868 +
   1.869 +        // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
   1.870 +        //        rather than adjusting the position unconditionally?
   1.871 +        //        (Change would interact with safe rules.)
   1.872 +        // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
   1.873 +        //        affects only preceding(), seems cleaner, but is slightly different.
   1.874 +        (void)UTEXT_PREVIOUS32(fText);
   1.875 +        handleNext(fData->fSafeFwdTable);   // run the safe forward table from one code point back
   1.876 +        int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
   1.877 +        while (result >= offset) {          // walk back until strictly before offset
   1.878 +            result = previous();
   1.879 +        }
   1.880 +        return result;
   1.881 +    }
   1.882 +    if (fData->fSafeRevTable != NULL) {
   1.883 +        // backup plan if forward safe table is not available
   1.884 +        //  TODO:  check whether this path can be discarded
   1.885 +        //         It's probably OK to say that rules must supply both safe tables
   1.886 +        //            if they use safe tables at all.  We have certainly never described
   1.887 +        //            to anyone how to work with just one safe table.
   1.888 +        utext_setNativeIndex(fText, offset);
   1.889 +        (void)UTEXT_NEXT32(fText);
   1.890 +        
   1.891 +        // handle previous will give result <= offset
   1.892 +        handlePrevious(fData->fSafeRevTable);
   1.893 +
   1.894 +        // next will give result 0 or 1 boundary away from offset,
   1.895 +        // most of the time, so
   1.896 +        // we have to keep calling next() until a boundary at or past offset is reached
   1.897 +        int32_t oldresult = next();
   1.898 +        while (oldresult < offset) {
   1.899 +            int32_t result = next();
   1.900 +            if (result >= offset) {
   1.901 +                return oldresult;           // oldresult is the last boundary before offset
   1.902 +            }
   1.903 +            oldresult = result;
   1.904 +        }
   1.905 +        int32_t result = previous();        // the first next() already reached or passed offset: step back
   1.906 +        if (result >= offset) {
   1.907 +            return previous();
   1.908 +        }
   1.909 +        return result;
   1.910 +    }
   1.911 +
   1.912 +    // old rule syntax (no safe tables): position at offset and step back once
   1.913 +    utext_setNativeIndex(fText, offset);
   1.914 +    return previous();
   1.915 +}
   1.916 +
   1.917 +/**
   1.918 + * Returns true if the specified position is a boundary position.  As a side
   1.919 + * effect, leaves the iterator pointing to the first boundary position at
   1.920 + * or after "offset" (out-of-range offsets leave it at first() or last() and yield FALSE).
   1.921 + * @param offset the offset to check.
   1.922 + * @return True if "offset" is a boundary position.
   1.923 + */
   1.924 +UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
   1.925 +    // the beginning index of the iterator is always a boundary position by definition
   1.926 +    if (offset == 0) {
   1.927 +        first();       // For side effects on current position, tag values.
   1.928 +        return TRUE;
   1.929 +    }
   1.930 +
   1.931 +    if (offset == (int32_t)utext_nativeLength(fText)) {   // the end of the text is likewise treated as a boundary
   1.932 +        last();       // For side effects on current position, tag values.
   1.933 +        return TRUE;
   1.934 +    }
   1.935 +
   1.936 +    // out-of-range indexes are never boundary positions
   1.937 +    if (offset < 0) {
   1.938 +        first();       // For side effects on current position, tag values.
   1.939 +        return FALSE;
   1.940 +    }
   1.941 +
   1.942 +    if (offset > utext_nativeLength(fText)) {
   1.943 +        last();        // For side effects on current position, tag values.
   1.944 +        return FALSE;
   1.945 +    }
   1.946 +
   1.947 +    // otherwise, we can use following() on the position before the specified
   1.948 +    // one and return true if the position we get back is the one the user
   1.949 +    // specified
   1.950 +    utext_previous32From(fText, offset);
   1.951 +    int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);   // text index after stepping back from offset
   1.952 +    UBool    result  = following(backOne) == offset;
   1.953 +    return result;
   1.954 +}
   1.955 +
   1.956 +/**
   1.957 + * Returns the current iteration position, read from the native index of fText.
   1.958 + * @return The current iteration position.
   1.959 + */
   1.960 +int32_t RuleBasedBreakIterator::current(void) const {
   1.961 +    int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);   // native index narrowed to int32_t
   1.962 +    return pos;
   1.963 +}
   1.964 + 
   1.965 +//=======================================================================
   1.966 +// implementation
   1.967 +//=======================================================================
   1.968 +
   1.969 +//
   1.970 +// RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
   1.971 +//                 of user text.  A variable with this enum type keeps track of where we
   1.972 +//                 are.  The state machine only fetches user input while in the RUN mode.
   1.973 +//                 (handleNext()/handlePrevious() start in RBBI_START only when the table's RBBI_BOF_REQUIRED flag is set.)
   1.974 +enum RBBIRunMode {
   1.975 +    RBBI_START,     // state machine processing is before first char of input
   1.976 +    RBBI_RUN,       // state machine processing is in the user text
   1.977 +    RBBI_END        // state machine processing is after end of user text.
   1.978 +};
   1.979 +
   1.980 +
   1.981 +//-----------------------------------------------------------------------------------
   1.982 +//
   1.983 +//  handleNext(stateTable)
   1.984 +//     This method is the actual implementation of the rbbi next() method.
   1.985 +//     This method initializes the state machine to state 1
   1.986 +//     and advances through the text character by character until we reach the end
   1.987 +//     of the text or the state machine transitions to state 0.  We update our return
   1.988 +//     value every time the state machine passes through an accepting state.
   1.989 +//     Returns the break position found (fText is left there), or BreakIterator::DONE if fData is NULL or no character can be read.
   1.990 +//-----------------------------------------------------------------------------------
   1.991 +int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
   1.992 +    int32_t             state;
   1.993 +    uint16_t            category        = 0;
   1.994 +    RBBIRunMode         mode;
   1.995 +    
   1.996 +    RBBIStateTableRow  *row;
   1.997 +    UChar32             c;
   1.998 +    int32_t             lookaheadStatus = 0;   // fLookAhead value of the pending look-ahead match; 0 when none is pending
   1.999 +    int32_t             lookaheadTagIdx = 0;   // tag index saved together with the pending look-ahead position
  1.1000 +    int32_t             result          = 0;   // best break position found so far
  1.1001 +    int32_t             initialPosition = 0;   // text position on entry
  1.1002 +    int32_t             lookaheadResult = 0;   // text position saved when a look-ahead row was entered
  1.1003 +    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
  1.1004 +    const char         *tableData       = statetable->fTableData;
  1.1005 +    uint32_t            tableRowLen     = statetable->fRowLen;
  1.1006 +
  1.1007 +    #ifdef RBBI_DEBUG
  1.1008 +        if (fTrace) {
  1.1009 +            RBBIDebugPuts("Handle Next   pos   char  state category");
  1.1010 +        }
  1.1011 +    #endif
  1.1012 +
  1.1013 +    // No matter what, handleNext always correctly sets the break tag value.
  1.1014 +    fLastStatusIndexValid = TRUE;
  1.1015 +    fLastRuleStatusIndex = 0;
  1.1016 +
  1.1017 +    // If there is no rule data, or we're already at the end of the text (no character can be read), return DONE.
  1.1018 +    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 
  1.1019 +    result          = initialPosition;
  1.1020 +    c               = UTEXT_NEXT32(fText);
  1.1021 +    if (fData == NULL || c==U_SENTINEL) {
  1.1022 +        return BreakIterator::DONE;
  1.1023 +    }
  1.1024 +
  1.1025 +    //  Set the initial state for the state machine
  1.1026 +    state = START_STATE;
  1.1027 +    row = (RBBIStateTableRow *)
  1.1028 +            //(statetable->fTableData + (statetable->fRowLen * state));
  1.1029 +            (tableData + tableRowLen * state);
  1.1030 +            
  1.1031 +    
  1.1032 +    mode     = RBBI_RUN;
  1.1033 +    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
  1.1034 +        category = 2;            // category fed to the table for the beginning-of-input iteration
  1.1035 +        mode     = RBBI_START;
  1.1036 +    }
  1.1037 +
  1.1038 +
  1.1039 +    // loop until we reach the end of the text or transition to state 0
  1.1040 +    //
  1.1041 +    for (;;) {
  1.1042 +        if (c == U_SENTINEL) {
  1.1043 +            // Reached end of input string.
  1.1044 +            if (mode == RBBI_END) {
  1.1045 +                // We have already run the loop one last time with the
  1.1046 +                //   character set to the pseudo {eof} value.  Now it is time
  1.1047 +                //   to unconditionally bail out.
  1.1048 +                if (lookaheadResult > result) {
  1.1049 +                    // We ran off the end of the string with a pending look-ahead match.
  1.1050 +                    // Treat this as if the look-ahead condition had been met, and return
  1.1051 +                    //  the match at the / position from the look-ahead rule.
  1.1052 +                    result               = lookaheadResult;
  1.1053 +                    fLastRuleStatusIndex = lookaheadTagIdx;
  1.1054 +                    lookaheadStatus = 0;
  1.1055 +                } 
  1.1056 +                break;
  1.1057 +            }
  1.1058 +            // Run the loop one last time with the fake end-of-input character category.
  1.1059 +            mode = RBBI_END;
  1.1060 +            category = 1;        // category fed to the table for the end-of-input iteration
  1.1061 +        }
  1.1062 +
  1.1063 +        //
  1.1064 +        // Get the char category.  An incoming category of 1 or 2 means that
  1.1065 +        //      we are preset for doing the beginning or end of input, and
  1.1066 +        //      that we shouldn't get a category from an actual text input character.
  1.1067 +        //
  1.1068 +        if (mode == RBBI_RUN) {
  1.1069 +            // look up the current character's character category, which tells us
  1.1070 +            // which column in the state table to look at.
  1.1071 +            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
  1.1072 +            //        not the size of the character going in, which is a UChar32.
  1.1073 +            //
  1.1074 +            UTRIE_GET16(&fData->fTrie, c, category);
  1.1075 +
  1.1076 +            // Check the dictionary bit in the character's category.
  1.1077 +            //    Counter is only used by dictionary based iterators (subclasses).
  1.1078 +            //    Chars that need to be handled by a dictionary have a flag bit set
  1.1079 +            //    in their category values.
  1.1080 +            //
  1.1081 +            if ((category & 0x4000) != 0)  {
  1.1082 +                fDictionaryCharCount++;
  1.1083 +                //  And off the dictionary flag bit.
  1.1084 +                category &= ~0x4000;
  1.1085 +            }
  1.1086 +        }
  1.1087 +
  1.1088 +       #ifdef RBBI_DEBUG
  1.1089 +            if (fTrace) {   // the index is cast to int32_t so the argument matches %d, as in handlePrevious()
  1.1090 +                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
  1.1091 +                if (0x20<=c && c<0x7f) {
  1.1092 +                    RBBIDebugPrintf("\"%c\"  ", c);
  1.1093 +                } else {
  1.1094 +                    RBBIDebugPrintf("%5x  ", c);
  1.1095 +                }
  1.1096 +                RBBIDebugPrintf("%3d  %3d\n", state, category);
  1.1097 +            }
  1.1098 +        #endif
  1.1099 +
  1.1100 +        // State Transition - move machine to its next state
  1.1101 +        //
  1.1102 +
  1.1103 +        // Note: fNextState is defined as uint16_t[2], but we are casting
  1.1104 +        // a generated RBBI table to RBBIStateTableRow and some tables
  1.1105 +        // actually have more than 2 categories.
  1.1106 +        U_ASSERT(category<fData->fHeader->fCatCount);
  1.1107 +        state = row->fNextState[category];  /*Not accessing beyond memory*/
  1.1108 +        row = (RBBIStateTableRow *)
  1.1109 +            // (statetable->fTableData + (statetable->fRowLen * state));
  1.1110 +            (tableData + tableRowLen * state);
  1.1111 +
  1.1112 +
  1.1113 +        if (row->fAccepting == -1) {
  1.1114 +            // Match found, common case.
  1.1115 +            if (mode != RBBI_START) {
  1.1116 +                result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1.1117 +            }
  1.1118 +            fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
  1.1119 +        }
  1.1120 +
  1.1121 +        if (row->fLookAhead != 0) {
  1.1122 +            if (lookaheadStatus != 0
  1.1123 +                && row->fAccepting == lookaheadStatus) {
  1.1124 +                // Lookahead match is completed.
  1.1125 +                result               = lookaheadResult;
  1.1126 +                fLastRuleStatusIndex = lookaheadTagIdx;
  1.1127 +                lookaheadStatus      = 0;
  1.1128 +                // TODO:  make a standalone hard break in a rule work.
  1.1129 +                if (lookAheadHardBreak) {
  1.1130 +                    UTEXT_SETNATIVEINDEX(fText, result);
  1.1131 +                    return result;
  1.1132 +                }
  1.1133 +                // Look-ahead completed, but other rules may match further.  Continue on
  1.1134 +                //  TODO:  junk this feature?  I don't think it's used anywhere.
  1.1135 +                goto continueOn;
  1.1136 +            }
  1.1137 +
  1.1138 +            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);   // start tracking a new pending look-ahead at this position
  1.1139 +            lookaheadResult = r;
  1.1140 +            lookaheadStatus = row->fLookAhead;
  1.1141 +            lookaheadTagIdx = row->fTagIdx;
  1.1142 +            goto continueOn;
  1.1143 +        }
  1.1144 +
  1.1145 +
  1.1146 +        if (row->fAccepting != 0) {
  1.1147 +            // Because this is an accepting state, any in-progress look-ahead match
  1.1148 +            //   is no longer relevant.  Clear out the pending lookahead status.
  1.1149 +            lookaheadStatus = 0;           // clear out any pending look-ahead match.
  1.1150 +        }
  1.1151 +
  1.1152 +continueOn:
  1.1153 +        if (state == STOP_STATE) {
  1.1154 +            // This is the normal exit from the lookup state machine.
  1.1155 +            // We have advanced through the string until it is certain that no
  1.1156 +            //   longer match is possible, no matter what characters follow.
  1.1157 +            break;
  1.1158 +        }
  1.1159 +        
  1.1160 +        // Advance to the next character.
  1.1161 +        // If this is a beginning-of-input loop iteration, don't advance
  1.1162 +        //    the input position.  The next iteration will be processing the
  1.1163 +        //    first real input character.
  1.1164 +        if (mode == RBBI_RUN) {
  1.1165 +            c = UTEXT_NEXT32(fText);
  1.1166 +        } else {
  1.1167 +            if (mode == RBBI_START) {
  1.1168 +                mode = RBBI_RUN;
  1.1169 +            }
  1.1170 +        }
  1.1171 +
  1.1172 +
  1.1173 +    }
  1.1174 +
  1.1175 +    // The state machine is done.  Check whether it found a match...
  1.1176 +
  1.1177 +    // If the iterator failed to advance in the match engine, force it ahead by one.
  1.1178 +    //   (This really indicates a defect in the break rules.  They should always match
  1.1179 +    //    at least one character.)
  1.1180 +    if (result == initialPosition) {
  1.1181 +        UTEXT_SETNATIVEINDEX(fText, initialPosition);
  1.1182 +        UTEXT_NEXT32(fText);
  1.1183 +        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1.1184 +    }
  1.1185 +
  1.1186 +    // Leave the iterator at our result position.
  1.1187 +    UTEXT_SETNATIVEINDEX(fText, result);
  1.1188 +    #ifdef RBBI_DEBUG
  1.1189 +        if (fTrace) {
  1.1190 +            RBBIDebugPrintf("result = %d\n\n", result);
  1.1191 +        }
  1.1192 +    #endif
  1.1193 +    return result;
  1.1194 +}
  1.1195 +
  1.1196 +
  1.1197 +
  1.1198 +//-----------------------------------------------------------------------------------
  1.1199 +//
  1.1200 +//  handlePrevious()
  1.1201 +//
  1.1202 +//      Iterate backwards, according to the logic of the reverse rules.
  1.1203 +//      This version handles the exact style backwards rules.
  1.1204 +//
  1.1205 +//      The logic of this function is very similar to handleNext(), above.
  1.1206 +//      Returns the position found (fText is left there), or BreakIterator::DONE if there is no text, no rule data, or the position is already 0.
  1.1207 +//-----------------------------------------------------------------------------------
  1.1208 +int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
  1.1209 +    int32_t             state;
  1.1210 +    uint16_t            category        = 0;
  1.1211 +    RBBIRunMode         mode;
  1.1212 +    RBBIStateTableRow  *row;
  1.1213 +    UChar32             c;
  1.1214 +    int32_t             lookaheadStatus = 0;   // fLookAhead value of the pending look-ahead match; 0 when none is pending
  1.1215 +    int32_t             result          = 0;   // best break position found so far
  1.1216 +    int32_t             initialPosition = 0;   // text position on entry
  1.1217 +    int32_t             lookaheadResult = 0;   // text position saved when a look-ahead row was entered
  1.1218 +    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
  1.1219 +
  1.1220 +    #ifdef RBBI_DEBUG
  1.1221 +        if (fTrace) {
  1.1222 +            RBBIDebugPuts("Handle Previous   pos   char  state category");
  1.1223 +        }
  1.1224 +    #endif
  1.1225 +
  1.1226 +    // handlePrevious() never gets the rule status.
  1.1227 +    // Flag the status as invalid; if the user ever asks for status, we will need
  1.1228 +    // to back up, then re-find the break position using handleNext(), which does
  1.1229 +    // get the status value.
  1.1230 +    fLastStatusIndexValid = FALSE;
  1.1231 +    fLastRuleStatusIndex = 0;
  1.1232 +
  1.1233 +    // if there is no text or rule data, or we're already at the start of the text, return DONE.
  1.1234 +    if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
  1.1235 +        return BreakIterator::DONE;
  1.1236 +    }
  1.1237 +
  1.1238 +    //  Set up the starting char.
  1.1239 +    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1.1240 +    result          = initialPosition;
  1.1241 +    c               = UTEXT_PREVIOUS32(fText);
  1.1242 +
  1.1243 +    //  Set the initial state for the state machine
  1.1244 +    state = START_STATE;
  1.1245 +    row = (RBBIStateTableRow *)
  1.1246 +            (statetable->fTableData + (statetable->fRowLen * state));
  1.1247 +    category = 3;   // NOTE(review): every mode below assigns category again before it is read - confirm whether this initial 3 is still needed
  1.1248 +    mode     = RBBI_RUN;
  1.1249 +    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
  1.1250 +        category = 2;
  1.1251 +        mode     = RBBI_START;
  1.1252 +    }
  1.1253 +
  1.1254 +
  1.1255 +    // loop until we reach the start of the text or transition to state 0
  1.1256 +    //
  1.1257 +    for (;;) {
  1.1258 +        if (c == U_SENTINEL) {
  1.1259 +            // Ran off the start of the text (UTEXT_PREVIOUS32 produced U_SENTINEL).
  1.1260 +            if (mode == RBBI_END) {
  1.1261 +                // We have already run the loop one last time with the
  1.1262 +                //   character set to the pseudo {eof} value.  Now it is time
  1.1263 +                //   to unconditionally bail out.
  1.1264 +                if (lookaheadResult < result) {
  1.1265 +                    // We ran off the start of the string with a pending look-ahead match.
  1.1266 +                    // Treat this as if the look-ahead condition had been met, and return
  1.1267 +                    //  the match at the / position from the look-ahead rule.
  1.1268 +                    result               = lookaheadResult;
  1.1269 +                    lookaheadStatus = 0;
  1.1270 +                } else if (result == initialPosition) {
  1.1271 +                    // Ran off start, no match found.
  1.1272 +                    // move one index one (towards the start, since we are doing a previous())
  1.1273 +                    UTEXT_SETNATIVEINDEX(fText, initialPosition);
  1.1274 +                    (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
  1.1275 +                }
  1.1276 +                break;
  1.1277 +            }
  1.1278 +            // Run the loop one last time with the fake end-of-input character category.
  1.1279 +            mode = RBBI_END;
  1.1280 +            category = 1;
  1.1281 +        }
  1.1282 +
  1.1283 +        //
  1.1284 +        // Get the char category.  An incoming category of 1 or 2 means that
  1.1285 +        //      we are preset for doing the beginning or end of input, and
  1.1286 +        //      that we shouldn't get a category from an actual text input character.
  1.1287 +        //
  1.1288 +        if (mode == RBBI_RUN) {
  1.1289 +            // look up the current character's character category, which tells us
  1.1290 +            // which column in the state table to look at.
  1.1291 +            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
  1.1292 +            //        not the size of the character going in, which is a UChar32.
  1.1293 +            //
  1.1294 +            UTRIE_GET16(&fData->fTrie, c, category);
  1.1295 +
  1.1296 +            // Check the dictionary bit in the character's category.
  1.1297 +            //    Counter is only used by dictionary based iterators (subclasses).
  1.1298 +            //    Chars that need to be handled by a dictionary have a flag bit set
  1.1299 +            //    in their category values.
  1.1300 +            //
  1.1301 +            if ((category & 0x4000) != 0)  {
  1.1302 +                fDictionaryCharCount++;
  1.1303 +                //  And off the dictionary flag bit.
  1.1304 +                category &= ~0x4000;
  1.1305 +            }
  1.1306 +        }
  1.1307 +
  1.1308 +        #ifdef RBBI_DEBUG
  1.1309 +            if (fTrace) {
  1.1310 +                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
  1.1311 +                if (0x20<=c && c<0x7f) {
  1.1312 +                    RBBIDebugPrintf("\"%c\"  ", c);
  1.1313 +                } else {
  1.1314 +                    RBBIDebugPrintf("%5x  ", c);
  1.1315 +                }
  1.1316 +                RBBIDebugPrintf("%3d  %3d\n", state, category);
  1.1317 +            }
  1.1318 +        #endif
  1.1319 +
  1.1320 +        // State Transition - move machine to its next state
  1.1321 +        //
  1.1322 +
  1.1323 +        // Note: fNextState is defined as uint16_t[2], but we are casting
  1.1324 +        // a generated RBBI table to RBBIStateTableRow and some tables
  1.1325 +        // actually have more than 2 categories.
  1.1326 +        U_ASSERT(category<fData->fHeader->fCatCount);
  1.1327 +        state = row->fNextState[category];  /*Not accessing beyond memory*/
  1.1328 +        row = (RBBIStateTableRow *)
  1.1329 +            (statetable->fTableData + (statetable->fRowLen * state));
  1.1330 +
  1.1331 +        if (row->fAccepting == -1) {
  1.1332 +            // Match found, common case.
  1.1333 +            result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1.1334 +        }
  1.1335 +
  1.1336 +        if (row->fLookAhead != 0) {
  1.1337 +            if (lookaheadStatus != 0
  1.1338 +                && row->fAccepting == lookaheadStatus) {
  1.1339 +                // Lookahead match is completed.
  1.1340 +                result               = lookaheadResult;
  1.1341 +                lookaheadStatus      = 0;
  1.1342 +                // TODO:  make a standalone hard break in a rule work.
  1.1343 +                if (lookAheadHardBreak) {
  1.1344 +                    UTEXT_SETNATIVEINDEX(fText, result);
  1.1345 +                    return result;
  1.1346 +                }
  1.1347 +                // Look-ahead completed, but other rules may match further.  Continue on
  1.1348 +                //  TODO:  junk this feature?  I don't think it's used anywhere.
  1.1349 +                goto continueOn;
  1.1350 +            }
  1.1351 +
  1.1352 +            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);   // start tracking a new pending look-ahead at this position
  1.1353 +            lookaheadResult = r;
  1.1354 +            lookaheadStatus = row->fLookAhead;
  1.1355 +            goto continueOn;
  1.1356 +        }
  1.1357 +
  1.1358 +
  1.1359 +        if (row->fAccepting != 0) {
  1.1360 +            // Because this is an accepting state, any in-progress look-ahead match
  1.1361 +            //   is no longer relevant.  Clear out the pending lookahead status.
  1.1362 +            lookaheadStatus = 0;    
  1.1363 +        }
  1.1364 +
  1.1365 +continueOn:
  1.1366 +        if (state == STOP_STATE) {
  1.1367 +            // This is the normal exit from the lookup state machine.
  1.1368 +            // We have advanced through the string until it is certain that no
  1.1369 +            //   longer match is possible, no matter what characters follow.
  1.1370 +            break;
  1.1371 +        }
  1.1372 +
  1.1373 +        // Move (backwards) to the next character to process.
  1.1374 +        // If this is a beginning-of-input loop iteration, don't advance
  1.1375 +        //    the input position.  The next iteration will be processing the
  1.1376 +        //    first real input character.
  1.1377 +        if (mode == RBBI_RUN) {
  1.1378 +            c = UTEXT_PREVIOUS32(fText);
  1.1379 +        } else {            
  1.1380 +            if (mode == RBBI_START) {
  1.1381 +                mode = RBBI_RUN;
  1.1382 +            }
  1.1383 +        }
  1.1384 +    }
  1.1385 +
  1.1386 +    // The state machine is done.  Check whether it found a match...
  1.1387 +
  1.1388 +    // If the iterator failed to move in the match engine, force it back by one code point.
  1.1389 +    //   (This really indicates a defect in the break rules.  They should always match
  1.1390 +    //    at least one character.)
  1.1391 +    if (result == initialPosition) {
  1.1392 +        UTEXT_SETNATIVEINDEX(fText, initialPosition);
  1.1393 +        UTEXT_PREVIOUS32(fText);
  1.1394 +        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1.1395 +    }
  1.1396 +
  1.1397 +    // Leave the iterator at our result position.
  1.1398 +    UTEXT_SETNATIVEINDEX(fText, result);
  1.1399 +    #ifdef RBBI_DEBUG
  1.1400 +        if (fTrace) {
  1.1401 +            RBBIDebugPrintf("result = %d\n\n", result);
  1.1402 +        }
  1.1403 +    #endif
  1.1404 +    return result;
  1.1405 +}
  1.1406 +
  1.1407 +// reset(): releases the cached break positions (if any) and zeroes the cache bookkeeping and the dictionary character count.
  1.1408 +void
  1.1409 +RuleBasedBreakIterator::reset()
  1.1410 +{
  1.1411 +    if (fCachedBreakPositions) {
  1.1412 +        uprv_free(fCachedBreakPositions);
  1.1413 +    }
  1.1414 +    fCachedBreakPositions = NULL;      // no cache until one is built again
  1.1415 +    fNumCachedBreakPositions = 0;
  1.1416 +    fDictionaryCharCount = 0;          // counter incremented by handleNext()/handlePrevious()
  1.1417 +    fPositionInCache = 0;
  1.1418 +}
  1.1419 +
  1.1420 +
  1.1421 +
  1.1422 +//-------------------------------------------------------------------------------
  1.1423 +//
  1.1424 +//   makeRuleStatusValid()  Make fLastRuleStatusIndex valid for the current
  1.1425 +//                     iterator position (used by getRuleStatus() and
  1.1426 +//                     getRuleStatusVec()).  After iterating forwards the value
  1.1427 +//                     will already have been cached by handleNext().
  1.1428 +//
  1.1429 +//                     If no cached status value is available, the status is
  1.1430 +//                     found by doing a previous() followed by a next(), which
  1.1431 +//                     leaves the iterator where it started, and computes the
  1.1432 +//                     status while doing the next().
  1.1433 +//
  1.1434 +//-------------------------------------------------------------------------------
  1.1435 +void RuleBasedBreakIterator::makeRuleStatusValid() {
  1.1436 +    if (fLastStatusIndexValid == FALSE) {
  1.1437 +        //  No cached status is available.
  1.1438 +        if (fText == NULL || current() == 0) {
  1.1439 +            //  At start of text, or there is no text.  Status is always zero.
  1.1440 +            fLastRuleStatusIndex = 0;
  1.1441 +            fLastStatusIndexValid = TRUE;
  1.1442 +        } else {
  1.1443 +            //  Not at start of text.  Find status the tedious way.
  1.1444 +            int32_t pa = current();   // position before the round trip
  1.1445 +            previous();
  1.1446 +            if (fNumCachedBreakPositions > 0) {
  1.1447 +                reset();                // Discard the dictionary break cache
  1.1448 +            }
  1.1449 +            int32_t pb = next();   // expected to land back on pa (asserted below)
  1.1450 +            if (pa != pb) {
  1.1451 +                // note: the if (pa != pb) test is here only to eliminate warnings for
  1.1452 +                //       unused local variables on gcc.  Logically, it isn't needed.
  1.1453 +                U_ASSERT(pa == pb);
  1.1454 +            }
  1.1455 +        }
  1.1456 +    }
  1.1457 +    U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);   // index must lie inside the status table
  1.1458 +}
  1.1459 +
  1.1460 +
  1.1461 +int32_t  RuleBasedBreakIterator::getRuleStatus() const {   // Return the last status value of the current status record.
  1.1462 +    RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;   // cast away const: makeRuleStatusValid() is non-const
  1.1463 +    nonConstThis->makeRuleStatusValid();
  1.1464 +
  1.1465 +    // fLastRuleStatusIndex is the index of a status record in fRuleStatusTable.
  1.1466 +    //   The record's first entry is the number of status values that follow it,
  1.1467 +    //   so index + count is the last (largest) value, which is what we return.
  1.1468 +    int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
  1.1469 +    int32_t  tagVal = fData->fRuleStatusTable[idx];
  1.1470 +
  1.1471 +    return tagVal;
  1.1472 +}
  1.1473 +
  1.1474 +
  1.1475 +
  1.1476 +
  1.1477 +int32_t RuleBasedBreakIterator::getRuleStatusVec(   // Copy the status values of the current status record into fillInVec.
  1.1478 +             int32_t *fillInVec, int32_t capacity, UErrorCode &status)   // capacity: number of int32_t slots in fillInVec
  1.1479 +{
  1.1480 +    if (U_FAILURE(status)) {
  1.1481 +        return 0;   // incoming error: do nothing
  1.1482 +    }
  1.1483 +
  1.1484 +    RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
  1.1485 +    nonConstThis->makeRuleStatusValid();
  1.1486 +    int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];   // first entry of the record is the value count
  1.1487 +    int32_t  numValsToCopy = numVals;
  1.1488 +    if (numVals > capacity) {
  1.1489 +        status = U_BUFFER_OVERFLOW_ERROR;   // report truncation, but still copy what fits
  1.1490 +        numValsToCopy = capacity;
  1.1491 +    }
  1.1492 +    int i;
  1.1493 +    for (i=0; i<numValsToCopy; i++) {
  1.1494 +        fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];   // values start one past the count
  1.1495 +    }
  1.1496 +    return numVals;   // the full count, even when fewer values were copied
  1.1497 +}
  1.1498 +
  1.1499 +
  1.1500 +
  1.1501 +//-------------------------------------------------------------------------------
  1.1502 +//
  1.1503 +//   getBinaryRules        Access to the compiled form of the rules,
  1.1504 +//                         for use by build system tools that save the data
  1.1505 +//                         for standard iterator types.
  1.1506 +//
  1.1507 +//-------------------------------------------------------------------------------
  1.1508 +const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {   // length is an output parameter
  1.1509 +    const uint8_t  *retPtr = NULL;
  1.1510 +    length = 0;   // result when there is no rule data: NULL pointer, zero length
  1.1511 +
  1.1512 +    if (fData != NULL) {
  1.1513 +        retPtr = (const uint8_t *)fData->fHeader;   // the returned bytes begin at the data header
  1.1514 +        length = fData->fHeader->fLength;
  1.1515 +    }
  1.1516 +    return retPtr;
  1.1517 +}
  1.1518 +
  1.1519 +
  1.1520 +BreakIterator *  RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,   // the buffer argument is unnamed and unused
  1.1521 +                                   int32_t &bufferSize,
  1.1522 +                                   UErrorCode &status)
  1.1523 +{
  1.1524 +    if (U_FAILURE(status)){
  1.1525 +        return NULL;   // incoming error: do nothing
  1.1526 +    }
  1.1527 +
  1.1528 +    if (bufferSize == 0) {
  1.1529 +        bufferSize = 1;  // preflighting for deprecated functionality
  1.1530 +        return NULL;
  1.1531 +    }
  1.1532 +
  1.1533 +    BreakIterator *clonedBI = clone();   // the copy always comes from clone()
  1.1534 +    if (clonedBI == NULL) {
  1.1535 +        status = U_MEMORY_ALLOCATION_ERROR;
  1.1536 +    } else {
  1.1537 +        status = U_SAFECLONE_ALLOCATED_WARNING;   // set on every successful clone
  1.1538 +    }
  1.1539 +    return (RuleBasedBreakIterator *)clonedBI;
  1.1540 +}
  1.1541 +
  1.1542 +
  1.1543 +//-------------------------------------------------------------------------------
  1.1544 +//
  1.1545 +//  isDictionaryChar      Return true if the category lookup for this char
  1.1546 +//                        indicates that it is in the set of dictionary lookup
  1.1547 +//                        chars.
  1.1548 +//
  1.1549 +//                        This function is intended for use by dictionary based
  1.1550 +//                        break iterators.
  1.1551 +//
  1.1552 +//-------------------------------------------------------------------------------
  1.1553 +/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
  1.1554 +    if (fData == NULL) {
  1.1555 +        return FALSE;
  1.1556 +    }
  1.1557 +    uint16_t category;
  1.1558 +    UTRIE_GET16(&fData->fTrie, c, category);
  1.1559 +    return (category & 0x4000) != 0;
  1.1560 +}*/
  1.1561 +
  1.1562 +
  1.1563 +//-------------------------------------------------------------------------------
  1.1564 +//
  1.1565 +//  checkDictionary       Handles the "dictionary" characters in the range
  1.1566 +//                        startPos-endPos.  May build the cache of break
  1.1567 +//                        positions.  Returns the break position to use; when no
  1.1568 +//                        dictionary breaks are found that is the proposed one.
  1.1569 +//
  1.1570 +//-------------------------------------------------------------------------------
  1.1571 +int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
  1.1572 +                            int32_t endPos,
  1.1573 +                            UBool reverse) {
  1.1574 +    // Reset the old break cache first.
  1.1575 +    reset();
  1.1576 +
  1.1577 +    // note: code segment below assumes that dictionary chars are in the
  1.1578 +    // startPos-endPos range
  1.1579 +    // value returned should be next character in sequence
  1.1580 +    if ((endPos - startPos) <= 1) {
  1.1581 +        return (reverse ? startPos : endPos);   // range too short to subdivide: keep the proposed break
  1.1582 +    }
  1.1583 +    
  1.1584 +    // Bug 5532.  The dictionary code will crash if the input text is UTF-8
  1.1585 +    //      because native indexes are different from UTF-16 indexes.
  1.1586 +    //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
  1.1587 +    //      It won't give the right breaks, but it's better than a crash.
  1.1588 +    //
  1.1589 +    //      Check the type of the UText by checking its pFuncs field, which
  1.1590 +    //      is UText's function dispatch table.  It will be the same for all
  1.1591 +    //      UTF-8 UTexts and different for any other UText type.
  1.1592 +    //
  1.1593 +    //      We have no other type of UText available with non-UTF-16 native indexing.
  1.1594 +    //      This whole check will go away once the dictionary code is fixed.
  1.1595 +    static const void *utext_utf8Funcs;   // NOTE(review): lazily set below with no visible synchronization - confirm thread-safety
  1.1596 +    if (utext_utf8Funcs == NULL) {
  1.1597 +        // Cache the UTF-8 UText function pointer value.
  1.1598 +        UErrorCode status = U_ZERO_ERROR;
  1.1599 +        UText tempUText = UTEXT_INITIALIZER; 
  1.1600 +        utext_openUTF8(&tempUText, NULL, 0, &status);
  1.1601 +        utext_utf8Funcs = tempUText.pFuncs;
  1.1602 +        utext_close(&tempUText);
  1.1603 +    }
  1.1604 +    if (fText->pFuncs == utext_utf8Funcs) {
  1.1605 +        return (reverse ? startPos : endPos);   // UTF-8 text: skip the dictionary lookup entirely
  1.1606 +    }
  1.1607 +
  1.1608 +    // Starting from the starting point, scan towards the proposed result,
  1.1609 +    // looking for the first dictionary character (which may be the one
  1.1610 +    // we're on, if we're starting in the middle of a range).
  1.1611 +    utext_setNativeIndex(fText, reverse ? endPos : startPos);
  1.1612 +    if (reverse) {
  1.1613 +        UTEXT_PREVIOUS32(fText);
  1.1614 +    }
  1.1615 +    
  1.1616 +    int32_t rangeStart = startPos;
  1.1617 +    int32_t rangeEnd = endPos;
  1.1618 +
  1.1619 +    uint16_t    category;
  1.1620 +    int32_t     current;
  1.1621 +    UErrorCode  status = U_ZERO_ERROR;
  1.1622 +    UStack      breaks(status);   // receives the break positions reported by findBreaks()
  1.1623 +    int32_t     foundBreakCount = 0;
  1.1624 +    UChar32     c = utext_current32(fText);
  1.1625 +
  1.1626 +    UTRIE_GET16(&fData->fTrie, c, category);
  1.1627 +    
  1.1628 +    // Is the character we're starting on a dictionary character? If so, we
  1.1629 +    // need to back up to include the entire run; otherwise the results of
  1.1630 +    // the break algorithm will differ depending on where we start. Since
  1.1631 +    // the result is cached and there is typically a non-dictionary break
  1.1632 +    // within a small number of words, there should be little performance impact.
  1.1633 +    if (category & 0x4000) {   // 0x4000 in the trie category flags a dictionary character
  1.1634 +        if (reverse) {
  1.1635 +            do {
  1.1636 +                utext_next32(fText);          // TODO:  recast to work directly with postincrement.
  1.1637 +                c = utext_current32(fText);
  1.1638 +                UTRIE_GET16(&fData->fTrie, c, category);
  1.1639 +            } while (c != U_SENTINEL && (category & 0x4000));
  1.1640 +            // Back up to the last dictionary character
  1.1641 +            rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
  1.1642 +            if (c == U_SENTINEL) {
  1.1643 +                // c = fText->last32();
  1.1644 +                //   TODO:  why was this if needed?
  1.1645 +                c = UTEXT_PREVIOUS32(fText);
  1.1646 +            }
  1.1647 +            else {   // performs the same step as the branch above
  1.1648 +                c = UTEXT_PREVIOUS32(fText);
  1.1649 +            }
  1.1650 +        }
  1.1651 +        else {
  1.1652 +            do {
  1.1653 +                c = UTEXT_PREVIOUS32(fText);
  1.1654 +                UTRIE_GET16(&fData->fTrie, c, category);
  1.1655 +            }
  1.1656 +            while (c != U_SENTINEL && (category & 0x4000));
  1.1657 +            // Back up to the last dictionary character
  1.1658 +            if (c == U_SENTINEL) {
  1.1659 +                // c = fText->first32();
  1.1660 +                c = utext_current32(fText);
  1.1661 +            }
  1.1662 +            else {
  1.1663 +                utext_next32(fText);
  1.1664 +                c = utext_current32(fText);
  1.1665 +            }
  1.1666 +            rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
  1.1667 +        }
  1.1668 +        UTRIE_GET16(&fData->fTrie, c, category);
  1.1669 +    }
  1.1670 +    
  1.1671 +    // Loop through the text, looking for ranges of dictionary characters.
  1.1672 +    // For each span, find the appropriate break engine, and ask it to find
  1.1673 +    // any breaks within the span.
  1.1674 +    // Note: we always do this in the forward direction, so that the break
  1.1675 +    // cache is built in the right order.
  1.1676 +    if (reverse) {
  1.1677 +        utext_setNativeIndex(fText, rangeStart);
  1.1678 +        c = utext_current32(fText);
  1.1679 +        UTRIE_GET16(&fData->fTrie, c, category);
  1.1680 +    }
  1.1681 +    while(U_SUCCESS(status)) {   // status is written only by the UStack constructor above
  1.1682 +        while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
  1.1683 +            utext_next32(fText);           // TODO:  tweak for post-increment operation
  1.1684 +            c = utext_current32(fText);
  1.1685 +            UTRIE_GET16(&fData->fTrie, c, category);
  1.1686 +        }
  1.1687 +        if (current >= rangeEnd) {
  1.1688 +            break;   // scanned the whole range
  1.1689 +        }
  1.1690 +        
  1.1691 +        // We now have a dictionary character. Get the appropriate language object
  1.1692 +        // to deal with it.
  1.1693 +        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
  1.1694 +        
  1.1695 +        // Ask the language object if there are any breaks. It will leave the text
  1.1696 +        // pointer on the other side of its range, ready to search for the next one.
  1.1697 +        if (lbe != NULL) {   // NOTE(review): when lbe is NULL nothing below moves fText - confirm this loop cannot spin
  1.1698 +            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
  1.1699 +        }
  1.1700 +        
  1.1701 +        // Reload the loop variables for the next go-round
  1.1702 +        c = utext_current32(fText);
  1.1703 +        UTRIE_GET16(&fData->fTrie, c, category);
  1.1704 +    }
  1.1705 +    
  1.1706 +    // If we found breaks, build a new break cache. The first and last entries must
  1.1707 +    // be the original starting and ending position.
  1.1708 +    if (foundBreakCount > 0) {
  1.1709 +        int32_t totalBreaks = foundBreakCount;
  1.1710 +        if (startPos < breaks.elementAti(0)) {
  1.1711 +            totalBreaks += 1;   // room for startPos in front of the found breaks
  1.1712 +        }
  1.1713 +        if (endPos > breaks.peeki()) {
  1.1714 +            totalBreaks += 1;   // room for endPos after the found breaks
  1.1715 +        }
  1.1716 +        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
  1.1717 +        if (fCachedBreakPositions != NULL) {
  1.1718 +            int32_t out = 0;
  1.1719 +            fNumCachedBreakPositions = totalBreaks;
  1.1720 +            if (startPos < breaks.elementAti(0)) {
  1.1721 +                fCachedBreakPositions[out++] = startPos;
  1.1722 +            }
  1.1723 +            for (int32_t i = 0; i < foundBreakCount; ++i) {
  1.1724 +                fCachedBreakPositions[out++] = breaks.elementAti(i);
  1.1725 +            }
  1.1726 +            if (endPos > fCachedBreakPositions[out-1]) {
  1.1727 +                fCachedBreakPositions[out] = endPos;   // out is not advanced here; it is not read again
  1.1728 +            }
  1.1729 +            // If there are breaks, then by definition, we are replacing the original
  1.1730 +            // proposed break by one of the breaks we found. Use following() and
  1.1731 +            // preceding() to do the work. They should never recurse in this case.
  1.1732 +            if (reverse) {
  1.1733 +                return preceding(endPos);
  1.1734 +            }
  1.1735 +            else {
  1.1736 +                return following(startPos);
  1.1737 +            }
  1.1738 +        }
  1.1739 +        // If the allocation failed, just fall through to the "no breaks found" case.
  1.1740 +    }
  1.1741 +
  1.1742 +    // If we get here, there were no language-based breaks. Set the text pointer
  1.1743 +    // to the original proposed break.
  1.1744 +    utext_setNativeIndex(fText, reverse ? startPos : endPos);
  1.1745 +    return (reverse ? startPos : endPos);
  1.1746 +}
  1.1747 +
  1.1748 +// defined in ucln_cmn.h
  1.1749 +
  1.1750 +U_NAMESPACE_END
  1.1751 +
  1.1752 +
  1.1753 +static icu::UStack *gLanguageBreakFactories = NULL;   // stack of LanguageBreakFactory objects; created by initLanguageFactories()
  1.1754 +static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;   // passed to umtx_initOnce() to run initLanguageFactories()
  1.1755 +
  1.1756 +/**
  1.1757 + * Delete the static language break factory stack and reset its init-once guard.
  1.1758 + */
  1.1759 +U_CDECL_BEGIN
  1.1760 +static UBool U_CALLCONV breakiterator_cleanup_dict(void) {   // registered via ucln_common_registerCleanup() in initLanguageFactories()
  1.1761 +    if (gLanguageBreakFactories) {
  1.1762 +        delete gLanguageBreakFactories;
  1.1763 +        gLanguageBreakFactories = NULL;
  1.1764 +    }
  1.1765 +    gLanguageBreakFactoriesInitOnce.reset();
  1.1766 +    return TRUE;   // always reports success
  1.1767 +}
  1.1768 +U_CDECL_END
  1.1769 +
  1.1770 +U_CDECL_BEGIN
  1.1771 +static void U_CALLCONV _deleteFactory(void *obj) {   // callback handed to the UStack built in initLanguageFactories()
  1.1772 +    delete (icu::LanguageBreakFactory *) obj;   // obj is treated as a LanguageBreakFactory
  1.1773 +}
  1.1774 +U_CDECL_END
  1.1775 +U_NAMESPACE_BEGIN
  1.1776 +
  1.1777 +static void U_CALLCONV initLanguageFactories() {   // Builds the global factory stack; invoked through umtx_initOnce().
  1.1778 +    UErrorCode status = U_ZERO_ERROR;
  1.1779 +    U_ASSERT(gLanguageBreakFactories == NULL);
  1.1780 +    gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);   // _deleteFactory is supplied as the element deleter
  1.1781 +    if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
  1.1782 +        ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
  1.1783 +        if (builtIn != NULL) { gLanguageBreakFactories->push(builtIn, status); }   // never store NULL: every stack element is dereferenced on lookup
  1.1784 +#if U_LOCAL_SERVICE_HOOK
  1.1785 +        LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);   // same #if test that guards the localsvc.h include
  1.1786 +        if (extra != NULL) {
  1.1787 +            gLanguageBreakFactories->push(extra, status);
  1.1788 +        }
  1.1789 +#endif
  1.1790 +    }
  1.1791 +    ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);   // registered whether or not setup succeeded
  1.1792 +}
  1.1793 +
  1.1794 +
  1.1795 +static const LanguageBreakEngine*
  1.1796 +getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)   // Ask each global factory for an engine for c; NULL if none offers one.
  1.1797 +{
  1.1798 +    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);   // build the factory stack on first use
  1.1799 +    if (gLanguageBreakFactories == NULL) {
  1.1800 +        return NULL;   // the factory stack could not be created
  1.1801 +    }
  1.1802 +    
  1.1803 +    int32_t i = gLanguageBreakFactories->size();
  1.1804 +    const LanguageBreakEngine *lbe = NULL;
  1.1805 +    while (--i >= 0) {   // walk from the highest index (the last factory pushed) down to 0
  1.1806 +        LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
  1.1807 +        lbe = factory->getEngineFor(c, breakType);
  1.1808 +        if (lbe != NULL) {
  1.1809 +            break;   // first factory that supplies an engine wins
  1.1810 +        }
  1.1811 +    }
  1.1812 +    return lbe;
  1.1813 +}
  1.1814 +
  1.1815 +
  1.1816 +//-------------------------------------------------------------------------------
  1.1817 +//
  1.1818 +//  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for
  1.1819 +//                          the character c.
  1.1820 +//
  1.1821 +//-------------------------------------------------------------------------------
  1.1822 +const LanguageBreakEngine *
  1.1823 +RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
  1.1824 +    const LanguageBreakEngine *lbe = NULL;
  1.1825 +    UErrorCode status = U_ZERO_ERROR;
  1.1826 +    
  1.1827 +    if (fLanguageBreakEngines == NULL) {   // create this iterator's engine stack on first use
  1.1828 +        fLanguageBreakEngines = new UStack(status);
  1.1829 +        if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
  1.1830 +            delete fLanguageBreakEngines;
  1.1831 +            fLanguageBreakEngines = 0;
  1.1832 +            return NULL;   // could not create the stack
  1.1833 +        }
  1.1834 +    }
  1.1835 +    
  1.1836 +    int32_t i = fLanguageBreakEngines->size();
  1.1837 +    while (--i >= 0) {   // try the engines already on the stack, highest index first
  1.1838 +        lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
  1.1839 +        if (lbe->handles(c, fBreakType)) {
  1.1840 +            return lbe;
  1.1841 +        }
  1.1842 +    }
  1.1843 +    
  1.1844 +    // No existing dictionary took the character. See if a factory wants to
  1.1845 +    // give us a new LanguageBreakEngine for this character.
  1.1846 +    lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
  1.1847 +    
  1.1848 +    // If we got one, use it and push it on our stack.
  1.1849 +    if (lbe != NULL) {
  1.1850 +        fLanguageBreakEngines->push((void *)lbe, status);
  1.1851 +        // Even if we can't remember it, we can keep looking it up, so
  1.1852 +        // return it even if the push fails.
  1.1853 +        return lbe;
  1.1854 +    }
  1.1855 +    
  1.1856 +    // No engine is forthcoming for this character. Add it to the
  1.1857 +    // reject set. Create the reject break engine if needed.
  1.1858 +    if (fUnhandledBreakEngine == NULL) {
  1.1859 +        fUnhandledBreakEngine = new UnhandledEngine(status);
  1.1860 +        if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
  1.1861 +            status = U_MEMORY_ALLOCATION_ERROR;   // the allocation itself failed
  1.1862 +        }
  1.1863 +        // Put it last so that scripts for which we have an engine get tried
  1.1864 +        // first.
  1.1865 +        fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);   // index 0 is examined last by the loop above
  1.1866 +        // If we can't insert it, or creation failed, get rid of it
  1.1867 +        if (U_FAILURE(status)) {
  1.1868 +            delete fUnhandledBreakEngine;
  1.1869 +            fUnhandledBreakEngine = 0;
  1.1870 +            return NULL;
  1.1871 +        }
  1.1872 +    }
  1.1873 +    
  1.1874 +    // Tell the reject engine about the character; at its discretion, it may
  1.1875 +    // add more than just the one character.
  1.1876 +    fUnhandledBreakEngine->handleCharacter(c, fBreakType);
  1.1877 +        
  1.1878 +    return fUnhandledBreakEngine;
  1.1879 +}
  1.1880 +
  1.1881 +
  1.1882 +
  1.1883 +/*int32_t RuleBasedBreakIterator::getBreakType() const {
  1.1884 +    return fBreakType;
  1.1885 +}*/
  1.1886 +
  1.1887 +void RuleBasedBreakIterator::setBreakType(int32_t type) {   // Record the break type and drop cached break positions.
  1.1888 +    fBreakType = type;
  1.1889 +    reset();   // clears the cached break positions
  1.1890 +}
  1.1891 +
  1.1892 +U_NAMESPACE_END
  1.1893 +
  1.1894 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

mercurial