1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unicode/rbbi.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,782 @@ 1.4 +/* 1.5 +*************************************************************************** 1.6 +* Copyright (C) 1999-2013 International Business Machines Corporation * 1.7 +* and others. All rights reserved. * 1.8 +*************************************************************************** 1.9 + 1.10 +********************************************************************** 1.11 +* Date Name Description 1.12 +* 10/22/99 alan Creation. 1.13 +* 11/11/99 rgillam Complete port from Java. 1.14 +********************************************************************** 1.15 +*/ 1.16 + 1.17 +#ifndef RBBI_H 1.18 +#define RBBI_H 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 + 1.22 +/** 1.23 + * \file 1.24 + * \brief C++ API: Rule Based Break Iterator 1.25 + */ 1.26 + 1.27 +#if !UCONFIG_NO_BREAK_ITERATION 1.28 + 1.29 +#include "unicode/brkiter.h" 1.30 +#include "unicode/udata.h" 1.31 +#include "unicode/parseerr.h" 1.32 +#include "unicode/schriter.h" 1.33 +#include "unicode/uchriter.h" 1.34 + 1.35 + 1.36 +struct UTrie; 1.37 + 1.38 +U_NAMESPACE_BEGIN 1.39 + 1.40 +/** @internal */ 1.41 +struct RBBIDataHeader; 1.42 +class RuleBasedBreakIteratorTables; 1.43 +class BreakIterator; 1.44 +class RBBIDataWrapper; 1.45 +class UStack; 1.46 +class LanguageBreakEngine; 1.47 +class UnhandledEngine; 1.48 +struct RBBIStateTable; 1.49 + 1.50 + 1.51 + 1.52 + 1.53 +/** 1.54 + * 1.55 + * A subclass of BreakIterator whose behavior is specified using a list of rules. 
1.56 + * <p>Instances of this class are most commonly created by the factory methods of 1.57 + * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc., 1.58 + * and then used via the abstract API in class BreakIterator</p> 1.59 + * 1.60 + * <p>See the ICU User Guide for information on Break Iterator Rules.</p> 1.61 + * 1.62 + * <p>This class is not intended to be subclassed. (Class DictionaryBasedBreakIterator 1.63 + * is a subclass, but that relationship is effectively internal to the ICU 1.64 + * implementation. The subclassing interface to RuleBasedBreakIterator is 1.65 + * not part of the ICU API, and may not remain stable.)</p> 1.66 + * 1.67 + */ 1.68 +class U_COMMON_API RuleBasedBreakIterator : public BreakIterator { 1.69 + 1.70 +protected: 1.71 + /** 1.72 + * The UText through which this BreakIterator accesses the text 1.73 + * @internal 1.74 + */ 1.75 + UText *fText; 1.76 + 1.77 + /** 1.78 + * A character iterator that refers to the same text as the UText, above. 1.79 + * Only included for compatibility with old API, which was based on CharacterIterators. 1.80 + * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below. 1.81 + */ 1.82 + CharacterIterator *fCharIter; 1.83 + 1.84 + /** 1.85 + * When the input text is provided by a UnicodeString, this will point to 1.86 + * a characterIterator that wraps that data. Needed only for the 1.87 + * implementation of getText(), a backwards compatibility issue. 1.88 + */ 1.89 + StringCharacterIterator *fSCharIter; 1.90 + 1.91 + /** 1.92 + * When the input text is provided by a UText, this 1.93 + * dummy CharacterIterator over an empty string will 1.94 + * be returned from getText() 1.95 + */ 1.96 + UCharCharacterIterator *fDCharIter; 1.97 + 1.98 + /** 1.99 + * The rule data for this BreakIterator instance 1.100 + * @internal 1.101 + */ 1.102 + RBBIDataWrapper *fData; 1.103 + 1.104 + /** Index of the Rule {tag} values for the most recent match. 
1.105 + * @internal 1.106 + */ 1.107 + int32_t fLastRuleStatusIndex; 1.108 + 1.109 + /** 1.110 + * Rule tag value valid flag. 1.111 + * Some iterator operations don't intrinsically set the correct tag value. 1.112 + * This flag lets us lazily compute the value if we are ever asked for it. 1.113 + * @internal 1.114 + */ 1.115 + UBool fLastStatusIndexValid; 1.116 + 1.117 + /** 1.118 + * Counter for the number of characters encountered with the "dictionary" 1.119 + * flag set. 1.120 + * @internal 1.121 + */ 1.122 + uint32_t fDictionaryCharCount; 1.123 + 1.124 + /** 1.125 + * When a range of characters is divided up using the dictionary, the break 1.126 + * positions that are discovered are stored here, preventing us from having 1.127 + * to use either the dictionary or the state table again until the iterator 1.128 + * leaves this range of text. Has the most impact for line breaking. 1.129 + * @internal 1.130 + */ 1.131 + int32_t* fCachedBreakPositions; 1.132 + 1.133 + /** 1.134 + * The number of elements in fCachedBreakPositions 1.135 + * @internal 1.136 + */ 1.137 + int32_t fNumCachedBreakPositions; 1.138 + 1.139 + /** 1.140 + * if fCachedBreakPositions is not null, this indicates which item in the 1.141 + * cache the current iteration position refers to 1.142 + * @internal 1.143 + */ 1.144 + int32_t fPositionInCache; 1.145 + 1.146 + /** 1.147 + * 1.148 + * If present, UStack of LanguageBreakEngine objects that might handle 1.149 + * dictionary characters. Searched from top to bottom to find an object to 1.150 + * handle a given character. 1.151 + * @internal 1.152 + */ 1.153 + UStack *fLanguageBreakEngines; 1.154 + 1.155 + /** 1.156 + * 1.157 + * If present, the special LanguageBreakEngine used for handling 1.158 + * characters that are in the dictionary set, but not handled by any 1.159 + * LanguageBreakEngine. 
1.160 + * @internal 1.161 + */ 1.162 + UnhandledEngine *fUnhandledBreakEngine; 1.163 + 1.164 + /** 1.165 + * 1.166 + * The type of the break iterator, or -1 if it has not been set. 1.167 + * @internal 1.168 + */ 1.169 + int32_t fBreakType; 1.170 + 1.171 +protected: 1.172 + //======================================================================= 1.173 + // constructors 1.174 + //======================================================================= 1.175 + 1.176 +#ifndef U_HIDE_INTERNAL_API 1.177 + /** 1.178 + * Constant to be used in the constructor 1.179 + * RuleBasedBreakIterator(RBBIDataHeader*, EDontAdopt, UErrorCode &); 1.180 + * which does not adopt the memory indicated by the RBBIDataHeader* 1.181 + * parameter. 1.182 + * 1.183 + * @internal 1.184 + */ 1.185 + enum EDontAdopt { 1.186 + kDontAdopt 1.187 + }; 1.188 + 1.189 + /** 1.190 + * Constructor from a flattened set of RBBI data in malloced memory. 1.191 + * RuleBasedBreakIterators built from a custom set of rules 1.192 + * are created via this constructor; the rules are compiled 1.193 + * into memory, then the break iterator is constructed here. 1.194 + * 1.195 + * The break iterator adopts the memory, and will 1.196 + * free it when done. 1.197 + * @internal 1.198 + */ 1.199 + RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); 1.200 + 1.201 + /** 1.202 + * Constructor from a flattened set of RBBI data in memory which need not 1.203 + * be malloced (e.g. it may be a memory-mapped file, etc.). 1.204 + * 1.205 + * This version does not adopt the memory, and does not 1.206 + * free it when done. 1.207 + * @internal 1.208 + */ 1.209 + RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt dontAdopt, UErrorCode &status); 1.210 +#endif /* U_HIDE_INTERNAL_API */ 1.211 + 1.212 + 1.213 + friend class RBBIRuleBuilder; 1.214 + /** @internal */ 1.215 + friend class BreakIterator; 1.216 + 1.217 + 1.218 + 1.219 +public: 1.220 + 1.221 + /** Default constructor. 
Creates an empty shell of an iterator, with no 1.222 + * rules or text to iterate over. Object can subsequently be assigned to. 1.223 + * @stable ICU 2.2 1.224 + */ 1.225 + RuleBasedBreakIterator(); 1.226 + 1.227 + /** 1.228 + * Copy constructor. Will produce a break iterator with the same behavior, 1.229 + * and which iterates over the same text, as the one passed in. 1.230 + * @param that The RuleBasedBreakIterator passed to be copied 1.231 + * @stable ICU 2.0 1.232 + */ 1.233 + RuleBasedBreakIterator(const RuleBasedBreakIterator& that); 1.234 + 1.235 + /** 1.236 + * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 1.237 + * @param rules The break rules to be used. 1.238 + * @param parseError In the event of a syntax error in the rules, provides the location 1.239 + * within the rules of the problem. 1.240 + * @param status Information on any errors encountered. 1.241 + * @stable ICU 2.2 1.242 + */ 1.243 + RuleBasedBreakIterator( const UnicodeString &rules, 1.244 + UParseError &parseError, 1.245 + UErrorCode &status); 1.246 + 1.247 + /** 1.248 + * Construct a RuleBasedBreakIterator from a set of precompiled binary rules. 1.249 + * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules(). 1.250 + * Construction of a break iterator in this way is substantially faster than 1.251 + * construction from source rules. 1.252 + * 1.253 + * Ownership of the storage containing the compiled rules remains with the 1.254 + * caller of this function. The compiled rules must not be modified or 1.255 + * deleted during the life of the break iterator. 1.256 + * 1.257 + * The compiled rules are not compatible across different major versions of ICU. 1.258 + * The compiled rules are compatible only between machines with the same 1.259 + * byte ordering (little or big endian) and the same base character set family 1.260 + * (ASCII or EBCDIC). 
1.261 + * 1.262 + * @see #getBinaryRules 1.263 + * @param compiledRules A pointer to the compiled break rules to be used. 1.264 + * @param ruleLength The length of the compiled break rules, in bytes. This 1.265 + * corresponds to the length value produced by getBinaryRules(). 1.266 + * @param status Information on any errors encountered, including invalid 1.267 + * binary rules. 1.268 + * @stable ICU 4.8 1.269 + */ 1.270 + RuleBasedBreakIterator(const uint8_t *compiledRules, 1.271 + uint32_t ruleLength, 1.272 + UErrorCode &status); 1.273 + 1.274 + /** 1.275 + * This constructor uses the udata interface to create a BreakIterator 1.276 + * whose internal tables live in a memory-mapped file. "image" is an 1.277 + * ICU UDataMemory handle for the pre-compiled break iterator tables. 1.278 + * @param image handle to the memory image for the break iterator data. 1.279 + * Ownership of the UDataMemory handle passes to the Break Iterator, 1.280 + * which will be responsible for closing it when it is no longer needed. 1.281 + * @param status Information on any errors encountered. 1.282 + * @see udata_open 1.283 + * @see #getBinaryRules 1.284 + * @stable ICU 2.8 1.285 + */ 1.286 + RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status); 1.287 + 1.288 + /** 1.289 + * Destructor 1.290 + * @stable ICU 2.0 1.291 + */ 1.292 + virtual ~RuleBasedBreakIterator(); 1.293 + 1.294 + /** 1.295 + * Assignment operator. Sets this iterator to have the same behavior, 1.296 + * and iterate over the same text, as the one passed in. 1.297 + * @param that The RuleBasedBreakIterator passed in 1.298 + * @return the newly created RuleBasedBreakIterator 1.299 + * @stable ICU 2.0 1.300 + */ 1.301 + RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); 1.302 + 1.303 + /** 1.304 + * Equality operator. Returns TRUE if both BreakIterators are of the 1.305 + * same class, have the same behavior, and iterate over the same text. 
1.306 + * @param that The BreakIterator to be compared for equality 1.307 + * @return TRUE if both BreakIterators are of the 1.308 + * same class, have the same behavior, and iterate over the same text. 1.309 + * @stable ICU 2.0 1.310 + */ 1.311 + virtual UBool operator==(const BreakIterator& that) const; 1.312 + 1.313 + /** 1.314 + * Not-equal operator. If operator== returns TRUE, this returns FALSE, 1.315 + * and vice versa. 1.316 + * @param that The BreakIterator to be compared for inequality 1.317 + * @return TRUE if both BreakIterators are not the same. 1.318 + * @stable ICU 2.0 1.319 + */ 1.320 + UBool operator!=(const BreakIterator& that) const; 1.321 + 1.322 + /** 1.323 + * Returns a newly-constructed RuleBasedBreakIterator with the same 1.324 + * behavior, and iterating over the same text, as this one. 1.325 + * Differs from the copy constructor in that it is polymorphic, and 1.326 + * will correctly clone (copy) a derived class. 1.327 + * clone() is thread safe. Multiple threads may simultaneously 1.328 + * clone the same source break iterator. 1.329 + * @return a newly-constructed RuleBasedBreakIterator 1.330 + * @stable ICU 2.0 1.331 + */ 1.332 + virtual BreakIterator* clone() const; 1.333 + 1.334 + /** 1.335 + * Compute a hash code for this BreakIterator 1.336 + * @return A hash code 1.337 + * @stable ICU 2.0 1.338 + */ 1.339 + virtual int32_t hashCode(void) const; 1.340 + 1.341 + /** 1.342 + * Returns the description used to create this iterator 1.343 + * @return the description used to create this iterator 1.344 + * @stable ICU 2.0 1.345 + */ 1.346 + virtual const UnicodeString& getRules(void) const; 1.347 + 1.348 + //======================================================================= 1.349 + // BreakIterator overrides 1.350 + //======================================================================= 1.351 + 1.352 + /** 1.353 + * <p> 1.354 + * Return a CharacterIterator over the text being analyzed. 
1.355 + * The returned character iterator is owned by the break iterator, and must 1.356 + * not be deleted by the caller. Repeated calls to this function may 1.357 + * return the same CharacterIterator. 1.358 + * </p> 1.359 + * <p> 1.360 + * The returned character iterator must not be used concurrently with 1.361 + * the break iterator. If concurrent operation is needed, clone the 1.362 + * returned character iterator first and operate on the clone. 1.363 + * </p> 1.364 + * <p> 1.365 + * When the break iterator is operating on text supplied via a UText, 1.366 + * this function will fail. Lacking any way to signal failures, it 1.367 + * returns an CharacterIterator containing no text. 1.368 + * The function getUText() provides similar functionality, 1.369 + * is reliable, and is more efficient. 1.370 + * </p> 1.371 + * 1.372 + * TODO: deprecate this function? 1.373 + * 1.374 + * @return An iterator over the text being analyzed. 1.375 + * @stable ICU 2.0 1.376 + */ 1.377 + virtual CharacterIterator& getText(void) const; 1.378 + 1.379 + 1.380 + /** 1.381 + * Get a UText for the text being analyzed. 1.382 + * The returned UText is a shallow clone of the UText used internally 1.383 + * by the break iterator implementation. It can safely be used to 1.384 + * access the text without impacting any break iterator operations, 1.385 + * but the underlying text itself must not be altered. 1.386 + * 1.387 + * @param fillIn A UText to be filled in. If NULL, a new UText will be 1.388 + * allocated to hold the result. 1.389 + * @param status receives any error codes. 1.390 + * @return The current UText for this break iterator. If an input 1.391 + * UText was provided, it will always be returned. 1.392 + * @stable ICU 3.4 1.393 + */ 1.394 + virtual UText *getUText(UText *fillIn, UErrorCode &status) const; 1.395 + 1.396 + /** 1.397 + * Set the iterator to analyze a new piece of text. This function resets 1.398 + * the current iteration position to the beginning of the text. 
1.399 + * @param newText An iterator over the text to analyze. The BreakIterator 1.400 + * takes ownership of the character iterator. The caller MUST NOT delete it! 1.401 + * @stable ICU 2.0 1.402 + */ 1.403 + virtual void adoptText(CharacterIterator* newText); 1.404 + 1.405 + /** 1.406 + * Set the iterator to analyze a new piece of text. This function resets 1.407 + * the current iteration position to the beginning of the text. 1.408 + * @param newText The text to analyze. 1.409 + * @stable ICU 2.0 1.410 + */ 1.411 + virtual void setText(const UnicodeString& newText); 1.412 + 1.413 + /** 1.414 + * Reset the break iterator to operate over the text represented by 1.415 + * the UText. The iterator position is reset to the start. 1.416 + * 1.417 + * This function makes a shallow clone of the supplied UText. This means 1.418 + * that the caller is free to immediately close or otherwise reuse the 1.419 + * UText that was passed as a parameter, but that the underlying text itself 1.420 + * must not be altered while being referenced by the break iterator. 1.421 + * 1.422 + * @param text The UText used to change the text. 1.423 + * @param status Receives any error codes. 1.424 + * @stable ICU 3.4 1.425 + */ 1.426 + virtual void setText(UText *text, UErrorCode &status); 1.427 + 1.428 + /** 1.429 + * Sets the current iteration position to the beginning of the text. 1.430 + * @return The offset of the beginning of the text. 1.431 + * @stable ICU 2.0 1.432 + */ 1.433 + virtual int32_t first(void); 1.434 + 1.435 + /** 1.436 + * Sets the current iteration position to the end of the text. 1.437 + * @return The text's past-the-end offset. 1.438 + * @stable ICU 2.0 1.439 + */ 1.440 + virtual int32_t last(void); 1.441 + 1.442 + /** 1.443 + * Advances the iterator either forward or backward the specified number of steps. 1.444 + * Negative values move backward, and positive values move forward. This is 1.445 + * equivalent to repeatedly calling next() or previous(). 
1.446 + * @param n The number of steps to move. The sign indicates the direction 1.447 + * (negative is backwards, and positive is forwards). 1.448 + * @return The character offset of the boundary position n boundaries away from 1.449 + * the current one. 1.450 + * @stable ICU 2.0 1.451 + */ 1.452 + virtual int32_t next(int32_t n); 1.453 + 1.454 + /** 1.455 + * Advances the iterator to the next boundary position. 1.456 + * @return The position of the first boundary after this one. 1.457 + * @stable ICU 2.0 1.458 + */ 1.459 + virtual int32_t next(void); 1.460 + 1.461 + /** 1.462 + * Moves the iterator backwards, to the last boundary preceding this one. 1.463 + * @return The position of the last boundary position preceding this one. 1.464 + * @stable ICU 2.0 1.465 + */ 1.466 + virtual int32_t previous(void); 1.467 + 1.468 + /** 1.469 + * Sets the iterator to refer to the first boundary position following 1.470 + * the specified position. 1.471 + * @param offset The position from which to begin searching for a break position. 1.472 + * @return The position of the first break after the current position. 1.473 + * @stable ICU 2.0 1.474 + */ 1.475 + virtual int32_t following(int32_t offset); 1.476 + 1.477 + /** 1.478 + * Sets the iterator to refer to the last boundary position before the 1.479 + * specified position. 1.480 + * @param offset The position to begin searching for a break from. 1.481 + * @return The position of the last boundary before the starting position. 1.482 + * @stable ICU 2.0 1.483 + */ 1.484 + virtual int32_t preceding(int32_t offset); 1.485 + 1.486 + /** 1.487 + * Returns true if the specified position is a boundary position. As a side 1.488 + * effect, leaves the iterator pointing to the first boundary position at 1.489 + * or after "offset". 1.490 + * @param offset the offset to check. 1.491 + * @return True if "offset" is a boundary position. 
1.492 + * @stable ICU 2.0 1.493 + */ 1.494 + virtual UBool isBoundary(int32_t offset); 1.495 + 1.496 + /** 1.497 + * Returns the current iteration position. 1.498 + * @return The current iteration position. 1.499 + * @stable ICU 2.0 1.500 + */ 1.501 + virtual int32_t current(void) const; 1.502 + 1.503 + 1.504 + /** 1.505 + * Return the status tag from the break rule that determined the most recently 1.506 + * returned break position. For break rules that do not specify a 1.507 + * status, a default value of 0 is returned. If more than one break rule 1.508 + * would cause a boundary to be located at some position in the text, 1.509 + * the numerically largest of the applicable status values is returned. 1.510 + * <p> 1.511 + * Of the standard types of ICU break iterators, only word break and 1.512 + * line break provide status values. The values are defined in 1.513 + * the header file ubrk.h. For Word breaks, the status allows distinguishing between words 1.514 + * that contain alphabetic letters, "words" that appear to be numbers, 1.515 + * punctuation and spaces, words containing ideographic characters, and 1.516 + * more. For Line Break, the status distinguishes between hard (mandatory) breaks 1.517 + * and soft (potential) break positions. 1.518 + * <p> 1.519 + * <code>getRuleStatus()</code> can be called after obtaining a boundary 1.520 + * position from <code>next()</code>, <code>previous()</code>, or 1.521 + * any other break iterator function that returns a boundary position. 1.522 + * <p> 1.523 + * When creating custom break rules, one is free to define whatever 1.524 + * status values may be convenient for the application. 1.525 + * <p> 1.526 + * Note: this function is not thread safe. It should not have been 1.527 + * declared const, and the const remains only for compatibility 1.528 + * reasons. (The function is logically const, but not bit-wise const). 
1.529 + * <p> 1.530 + * @return the status from the break rule that determined the most recently 1.531 + * returned break position. 1.532 + * 1.533 + * @see UWordBreak 1.534 + * @stable ICU 2.2 1.535 + */ 1.536 + virtual int32_t getRuleStatus() const; 1.537 + 1.538 + /** 1.539 + * Get the status (tag) values from the break rule(s) that determined the most 1.540 + * recently returned break position. 1.541 + * <p> 1.542 + * The returned status value(s) are stored into an array provided by the caller. 1.543 + * The values are stored in sorted (ascending) order. 1.544 + * If the capacity of the output array is insufficient to hold the data, 1.545 + * the output will be truncated to the available length, and a 1.546 + * U_BUFFER_OVERFLOW_ERROR will be signaled. 1.547 + * 1.548 + * @param fillInVec an array to be filled in with the status values. 1.549 + * @param capacity the length of the supplied vector. A length of zero causes 1.550 + * the function to return the number of status values, in the 1.551 + * normal way, without attempting to store any values. 1.552 + * @param status receives error codes. 1.553 + * @return The number of rule status values from rules that determined 1.554 + * the most recent boundary returned by the break iterator. 1.555 + * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value 1.556 + * is the total number of status values that were available, 1.557 + * not the reduced number that were actually returned. 1.558 + * @see getRuleStatus 1.559 + * @stable ICU 3.0 1.560 + */ 1.561 + virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status); 1.562 + 1.563 + /** 1.564 + * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. 1.565 + * This method is to implement a simple version of RTTI, since not all 1.566 + * C++ compilers support genuine RTTI. Polymorphic operator==() and 1.567 + * clone() methods call this method. 1.568 + * 1.569 + * @return The class ID for this object. 
All objects of a 1.570 + * given class have the same class ID. Objects of 1.571 + * other classes have different class IDs. 1.572 + * @stable ICU 2.0 1.573 + */ 1.574 + virtual UClassID getDynamicClassID(void) const; 1.575 + 1.576 + /** 1.577 + * Returns the class ID for this class. This is useful only for 1.578 + * comparing to a return value from getDynamicClassID(). For example: 1.579 + * 1.580 + * Base* polymorphic_pointer = createPolymorphicObject(); 1.581 + * if (polymorphic_pointer->getDynamicClassID() == 1.582 + * Derived::getStaticClassID()) ... 1.583 + * 1.584 + * @return The class ID for all objects of this class. 1.585 + * @stable ICU 2.0 1.586 + */ 1.587 + static UClassID U_EXPORT2 getStaticClassID(void); 1.588 + 1.589 + /** 1.590 + * Deprecated functionality. Use clone() instead. 1.591 + * 1.592 + * Create a clone (copy) of this break iterator in memory provided 1.593 + * by the caller. The idea is to increase performance by avoiding 1.594 + * a storage allocation. Use of this function is NOT RECOMMENDED. 1.595 + * Performance gains are minimal, and correct buffer management is 1.596 + * tricky. Use clone() instead. 1.597 + * 1.598 + * @param stackBuffer The pointer to the memory into which the cloned object 1.599 + * should be placed. If NULL, allocate heap memory 1.600 + * for the cloned object. 1.601 + * @param BufferSize The size of the buffer. If zero, return the required 1.602 + * buffer size, but do not clone the object. If the 1.603 + * size was too small (but not zero), allocate heap 1.604 + * storage for the cloned object. 1.605 + * 1.606 + * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be 1.607 + * returned if the provided buffer was too small, and 1.608 + * the clone was therefore put on the heap. 1.609 + * 1.610 + * @return Pointer to the clone object. 
This may differ from the stackBuffer 1.611 + * address if the byte alignment of the stack buffer was not suitable 1.612 + * or if the stackBuffer was too small to hold the clone. 1.613 + * @deprecated ICU 52. Use clone() instead. 1.614 + */ 1.615 + virtual BreakIterator * createBufferClone(void *stackBuffer, 1.616 + int32_t &BufferSize, 1.617 + UErrorCode &status); 1.618 + 1.619 + 1.620 + /** 1.621 + * Return the binary form of compiled break rules, 1.622 + * which can then be used to create a new break iterator at some 1.623 + * time in the future. Creating a break iterator from pre-compiled rules 1.624 + * is much faster than building one from the source form of the 1.625 + * break rules. 1.626 + * 1.627 + * The binary data can only be used with the same version of ICU 1.628 + * and on the same platform type (processor endian-ness) 1.629 + * 1.630 + * @param length Returns the length of the binary data. (Out parameter.) 1.631 + * 1.632 + * @return A pointer to the binary (compiled) rule data. The storage 1.633 + * belongs to the RuleBasedBreakIterator object, not the 1.634 + * caller, and must not be modified or deleted. 1.635 + * @stable ICU 4.8 1.636 + */ 1.637 + virtual const uint8_t *getBinaryRules(uint32_t &length); 1.638 + 1.639 + /** 1.640 + * Set the subject text string upon which the break iterator is operating 1.641 + * without changing any other aspect of the matching state. 1.642 + * The new and previous text strings must have the same content. 1.643 + * 1.644 + * This function is intended for use in environments where ICU is operating on 1.645 + * strings that may move around in memory. It provides a mechanism for notifying 1.646 + * ICU that the string has been relocated, and providing a new UText to access the 1.647 + * string in its new position. 
1.648 + * 1.649 + * Note that the break iterator implementation never copies the underlying text 1.650 + * of a string being processed, but always operates directly on the original text 1.651 + * provided by the user. Refreshing simply drops the references to the old text 1.652 + * and replaces them with references to the new. 1.653 + * 1.654 + * Caution: this function is normally used only by very specialized, 1.655 + * system-level code. One example use case is with garbage collection that moves 1.656 + * the text in memory. 1.657 + * 1.658 + * @param input The new (moved) text string. 1.659 + * @param status Receives errors detected by this function. 1.660 + * @return *this 1.661 + * 1.662 + * @stable ICU 49 1.663 + */ 1.664 + virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status); 1.665 + 1.666 + 1.667 +protected: 1.668 + //======================================================================= 1.669 + // implementation 1.670 + //======================================================================= 1.671 + /** 1.672 + * Dumps caches and performs other actions associated with a complete change 1.673 + * in text or iteration position. 1.674 + * @internal 1.675 + */ 1.676 + virtual void reset(void); 1.677 + 1.678 +#if 0 1.679 + /** 1.680 + * Return true if the category lookup for this char 1.681 + * indicates that it is in the set of dictionary lookup chars. 1.682 + * This function is intended for use by dictionary based break iterators. 1.683 + * @return true if the category lookup for this char 1.684 + * indicates that it is in the set of dictionary lookup chars. 1.685 + * @internal 1.686 + */ 1.687 + virtual UBool isDictionaryChar(UChar32); 1.688 + 1.689 + /** 1.690 + * Get the type of the break iterator. 1.691 + * @internal 1.692 + */ 1.693 + virtual int32_t getBreakType() const; 1.694 +#endif 1.695 + 1.696 + /** 1.697 + * Set the type of the break iterator. 
1.698 + * @internal 1.699 + */ 1.700 + virtual void setBreakType(int32_t type); 1.701 + 1.702 +#ifndef U_HIDE_INTERNAL_API 1.703 + /** 1.704 + * Common initialization function, used by constructors and bufferClone. 1.705 + * @internal 1.706 + */ 1.707 + void init(); 1.708 +#endif /* U_HIDE_INTERNAL_API */ 1.709 + 1.710 +private: 1.711 + 1.712 + /** 1.713 + * This method backs the iterator back up to a "safe position" in the text. 1.714 + * This is a position that we know, without any context, must be a break position. 1.715 + * The various calling methods then iterate forward from this safe position to 1.716 + * the appropriate position to return. (For more information, see the description 1.717 + * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) 1.718 + * @param statetable state table used for moving backwards 1.719 + * @internal 1.720 + */ 1.721 + int32_t handlePrevious(const RBBIStateTable *statetable); 1.722 + 1.723 + /** 1.724 + * This method is the actual implementation of the next() method. All iteration 1.725 + * vectors through here. This method initializes the state machine to state 1 1.726 + * and advances through the text character by character until we reach the end 1.727 + * of the text or the state machine transitions to state 0. We update our return 1.728 + * value every time the state machine passes through a possible end state. 1.729 + * @param statetable state table used for moving forwards 1.730 + * @internal 1.731 + */ 1.732 + int32_t handleNext(const RBBIStateTable *statetable); 1.733 + 1.734 +protected: 1.735 + 1.736 +#ifndef U_HIDE_INTERNAL_API 1.737 + /** 1.738 + * This is the function that actually implements dictionary-based 1.739 + * breaking. Covering at least the range from startPos to endPos, 1.740 + * it checks for dictionary characters, and if it finds them determines 1.741 + * the appropriate object to deal with them. It may cache found breaks in 1.742 + * fCachedBreakPositions as it goes. 
It may well also look at text outside 1.743 + * the range startPos to endPos. 1.744 + * If going forward, endPos is the normal Unicode break result, and 1.745 + * if going in reverse, startPos is the normal Unicode break result 1.746 + * @param startPos The start position of a range of text 1.747 + * @param endPos The end position of a range of text 1.748 + * @param reverse The call is for the reverse direction 1.749 + * @internal 1.750 + */ 1.751 + int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse); 1.752 +#endif /* U_HIDE_INTERNAL_API */ 1.753 + 1.754 +private: 1.755 + 1.756 + /** 1.757 + * This function returns the appropriate LanguageBreakEngine for a 1.758 + * given character c. 1.759 + * @param c A character in the dictionary set 1.760 + * @internal 1.761 + */ 1.762 + const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c); 1.763 + 1.764 + /** 1.765 + * @internal 1.766 + */ 1.767 + void makeRuleStatusValid(); 1.768 + 1.769 +}; 1.770 + 1.771 +//------------------------------------------------------------------------------ 1.772 +// 1.773 +// Inline Functions Definitions ... 1.774 +// 1.775 +//------------------------------------------------------------------------------ 1.776 + 1.777 +inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const { 1.778 + return !operator==(that); 1.779 +} 1.780 + 1.781 +U_NAMESPACE_END 1.782 + 1.783 +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1.784 + 1.785 +#endif