intl/icu/source/common/unicode/rbbi.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 ***************************************************************************
michael@0 3 * Copyright (C) 1999-2013 International Business Machines Corporation *
michael@0 4 * and others. All rights reserved. *
michael@0 5 ***************************************************************************
michael@0 6
michael@0 7 **********************************************************************
michael@0 8 * Date Name Description
michael@0 9 * 10/22/99 alan Creation.
michael@0 10 * 11/11/99 rgillam Complete port from Java.
michael@0 11 **********************************************************************
michael@0 12 */
michael@0 13
michael@0 14 #ifndef RBBI_H
michael@0 15 #define RBBI_H
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18
michael@0 19 /**
michael@0 20 * \file
michael@0 21 * \brief C++ API: Rule Based Break Iterator
michael@0 22 */
michael@0 23
michael@0 24 #if !UCONFIG_NO_BREAK_ITERATION
michael@0 25
michael@0 26 #include "unicode/brkiter.h"
michael@0 27 #include "unicode/udata.h"
michael@0 28 #include "unicode/parseerr.h"
michael@0 29 #include "unicode/schriter.h"
michael@0 30 #include "unicode/uchriter.h"
michael@0 31
michael@0 32
michael@0 33 struct UTrie;
michael@0 34
michael@0 35 U_NAMESPACE_BEGIN
michael@0 36
michael@0 37 /** @internal */
michael@0 38 struct RBBIDataHeader;
michael@0 39 class RuleBasedBreakIteratorTables;
michael@0 40 class BreakIterator;
michael@0 41 class RBBIDataWrapper;
michael@0 42 class UStack;
michael@0 43 class LanguageBreakEngine;
michael@0 44 class UnhandledEngine;
michael@0 45 struct RBBIStateTable;
michael@0 46
michael@0 47
michael@0 48
michael@0 49
michael@0 50 /**
michael@0 51 *
michael@0 52 * A subclass of BreakIterator whose behavior is specified using a list of rules.
michael@0 53 * <p>Instances of this class are most commonly created by the factory methods of
michael@0 54 * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
michael@0 55 * and then used via the abstract API in class BreakIterator</p>
michael@0 56 *
michael@0 57 * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
michael@0 58 *
michael@0 59 * <p>This class is not intended to be subclassed. (Class DictionaryBasedBreakIterator
michael@0 60 * is a subclass, but that relationship is effectively internal to the ICU
michael@0 61 * implementation. The subclassing interface to RuleBasedBreakIterator is
michael@0 62 * not part of the ICU API, and may not remain stable.)</p>
michael@0 63 *
michael@0 64 */
michael@0 65 class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
michael@0 66
michael@0 67 protected:
michael@0 68 /**
michael@0 69 * The UText through which this BreakIterator accesses the text
michael@0 70 * @internal
michael@0 71 */
michael@0 72 UText *fText;
michael@0 73
michael@0 74 /**
michael@0 75 * A character iterator that refers to the same text as the UText, above.
michael@0 76 * Only included for compatibility with old API, which was based on CharacterIterators.
michael@0 77 * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.
michael@0 78 */
michael@0 79 CharacterIterator *fCharIter;
michael@0 80
michael@0 81 /**
michael@0 82 * When the input text is provided by a UnicodeString, this will point to
michael@0 83 * a characterIterator that wraps that data. Needed only for the
michael@0 84 * implementation of getText(), a backwards compatibility issue.
michael@0 85 */
michael@0 86 StringCharacterIterator *fSCharIter;
michael@0 87
michael@0 88 /**
michael@0 89 * When the input text is provided by a UText, this
michael@0 90 * dummy CharacterIterator over an empty string will
michael@0 91 * be returned from getText()
michael@0 92 */
michael@0 93 UCharCharacterIterator *fDCharIter;
michael@0 94
michael@0 95 /**
michael@0 96 * The rule data for this BreakIterator instance
michael@0 97 * @internal
michael@0 98 */
michael@0 99 RBBIDataWrapper *fData;
michael@0 100
michael@0 101 /** Index of the Rule {tag} values for the most recent match.
michael@0 102 * @internal
michael@0 103 */
michael@0 104 int32_t fLastRuleStatusIndex;
michael@0 105
michael@0 106 /**
michael@0 107 * Rule tag value valid flag.
michael@0 108 * Some iterator operations don't intrinsically set the correct tag value.
michael@0 109 * This flag lets us lazily compute the value if we are ever asked for it.
michael@0 110 * @internal
michael@0 111 */
michael@0 112 UBool fLastStatusIndexValid;
michael@0 113
michael@0 114 /**
michael@0 115 * Counter for the number of characters encountered with the "dictionary"
michael@0 116 * flag set.
michael@0 117 * @internal
michael@0 118 */
michael@0 119 uint32_t fDictionaryCharCount;
michael@0 120
michael@0 121 /**
michael@0 122 * When a range of characters is divided up using the dictionary, the break
michael@0 123 * positions that are discovered are stored here, preventing us from having
michael@0 124 * to use either the dictionary or the state table again until the iterator
michael@0 125 * leaves this range of text. Has the most impact for line breaking.
michael@0 126 * @internal
michael@0 127 */
michael@0 128 int32_t* fCachedBreakPositions;
michael@0 129
michael@0 130 /**
michael@0 131 * The number of elements in fCachedBreakPositions
michael@0 132 * @internal
michael@0 133 */
michael@0 134 int32_t fNumCachedBreakPositions;
michael@0 135
michael@0 136 /**
michael@0 137 * if fCachedBreakPositions is not null, this indicates which item in the
michael@0 138 * cache the current iteration position refers to
michael@0 139 * @internal
michael@0 140 */
michael@0 141 int32_t fPositionInCache;
michael@0 142
michael@0 143 /**
michael@0 144 *
michael@0 145 * If present, UStack of LanguageBreakEngine objects that might handle
michael@0 146 * dictionary characters. Searched from top to bottom to find an object to
michael@0 147 * handle a given character.
michael@0 148 * @internal
michael@0 149 */
michael@0 150 UStack *fLanguageBreakEngines;
michael@0 151
michael@0 152 /**
michael@0 153 *
michael@0 154 * If present, the special LanguageBreakEngine used for handling
michael@0 155 * characters that are in the dictionary set, but not handled by any
michael@0 156 * LanguageBreakEngine.
michael@0 157 * @internal
michael@0 158 */
michael@0 159 UnhandledEngine *fUnhandledBreakEngine;
michael@0 160
michael@0 161 /**
michael@0 162 *
michael@0 163 * The type of the break iterator, or -1 if it has not been set.
michael@0 164 * @internal
michael@0 165 */
michael@0 166 int32_t fBreakType;
michael@0 167
michael@0 168 protected:
michael@0 169 //=======================================================================
michael@0 170 // constructors
michael@0 171 //=======================================================================
michael@0 172
michael@0 173 #ifndef U_HIDE_INTERNAL_API
michael@0 174 /**
michael@0 175 * Constant to be used in the constructor
michael@0 176 * RuleBasedBreakIterator(RBBIDataHeader*, EDontAdopt, UErrorCode &);
michael@0 177 * which does not adopt the memory indicated by the RBBIDataHeader*
michael@0 178 * parameter.
michael@0 179 *
michael@0 180 * @internal
michael@0 181 */
michael@0 182 enum EDontAdopt {
michael@0 183 kDontAdopt
michael@0 184 };
michael@0 185
michael@0 186 /**
michael@0 187 * Constructor from a flattened set of RBBI data in malloced memory.
michael@0 188 * RuleBasedBreakIterators built from a custom set of rules
michael@0 189 * are created via this constructor; the rules are compiled
michael@0 190 * into memory, then the break iterator is constructed here.
michael@0 191 *
michael@0 192 * The break iterator adopts the memory, and will
michael@0 193 * free it when done.
michael@0 194 * @internal
michael@0 195 */
michael@0 196 RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
michael@0 197
michael@0 198 /**
michael@0 199 * Constructor from a flattened set of RBBI data in memory which need not
michael@0 200 * be malloced (e.g. it may be a memory-mapped file, etc.).
michael@0 201 *
michael@0 202 * This version does not adopt the memory, and does not
michael@0 203 * free it when done.
michael@0 204 * @internal
michael@0 205 */
michael@0 206 RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt dontAdopt, UErrorCode &status);
michael@0 207 #endif /* U_HIDE_INTERNAL_API */
michael@0 208
michael@0 209
michael@0 210 friend class RBBIRuleBuilder;
michael@0 211 /** @internal */
michael@0 212 friend class BreakIterator;
michael@0 213
michael@0 214
michael@0 215
michael@0 216 public:
michael@0 217
michael@0 218 /** Default constructor. Creates an empty shell of an iterator, with no
michael@0 219 * rules or text to iterate over. Object can subsequently be assigned to.
michael@0 220 * @stable ICU 2.2
michael@0 221 */
michael@0 222 RuleBasedBreakIterator();
michael@0 223
michael@0 224 /**
michael@0 225 * Copy constructor. Will produce a break iterator with the same behavior,
michael@0 226 * and which iterates over the same text, as the one passed in.
michael@0 227 * @param that The RuleBasedBreakIterator passed to be copied
michael@0 228 * @stable ICU 2.0
michael@0 229 */
michael@0 230 RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
michael@0 231
michael@0 232 /**
michael@0 233 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
michael@0 234 * @param rules The break rules to be used.
michael@0 235 * @param parseError In the event of a syntax error in the rules, provides the location
michael@0 236 * within the rules of the problem.
michael@0 237 * @param status Information on any errors encountered.
michael@0 238 * @stable ICU 2.2
michael@0 239 */
michael@0 240 RuleBasedBreakIterator( const UnicodeString &rules,
michael@0 241 UParseError &parseError,
michael@0 242 UErrorCode &status);
michael@0 243
michael@0 244 /**
michael@0 245 * Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
michael@0 246 * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules().
michael@0 247 * Construction of a break iterator in this way is substantially faster than
michael@0 248 * construction from source rules.
michael@0 249 *
michael@0 250 * Ownership of the storage containing the compiled rules remains with the
michael@0 251 * caller of this function. The compiled rules must not be modified or
michael@0 252 * deleted during the life of the break iterator.
michael@0 253 *
michael@0 254 * The compiled rules are not compatible across different major versions of ICU.
michael@0 255 * The compiled rules are compatible only between machines with the same
michael@0 256 * byte ordering (little or big endian) and the same base character set family
michael@0 257 * (ASCII or EBCDIC).
michael@0 258 *
michael@0 259 * @see #getBinaryRules
michael@0 260 * @param compiledRules A pointer to the compiled break rules to be used.
michael@0 261 * @param ruleLength The length of the compiled break rules, in bytes. This
michael@0 262 * corresponds to the length value produced by getBinaryRules().
michael@0 263 * @param status Information on any errors encountered, including invalid
michael@0 264 * binary rules.
michael@0 265 * @stable ICU 4.8
michael@0 266 */
michael@0 267 RuleBasedBreakIterator(const uint8_t *compiledRules,
michael@0 268 uint32_t ruleLength,
michael@0 269 UErrorCode &status);
michael@0 270
michael@0 271 /**
michael@0 272 * This constructor uses the udata interface to create a BreakIterator
michael@0 273 * whose internal tables live in a memory-mapped file. "image" is an
michael@0 274 * ICU UDataMemory handle for the pre-compiled break iterator tables.
michael@0 275 * @param image handle to the memory image for the break iterator data.
michael@0 276 * Ownership of the UDataMemory handle passes to the Break Iterator,
michael@0 277 * which will be responsible for closing it when it is no longer needed.
michael@0 278 * @param status Information on any errors encountered.
michael@0 279 * @see udata_open
michael@0 280 * @see #getBinaryRules
michael@0 281 * @stable ICU 2.8
michael@0 282 */
michael@0 283 RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
michael@0 284
michael@0 285 /**
michael@0 286 * Destructor
michael@0 287 * @stable ICU 2.0
michael@0 288 */
michael@0 289 virtual ~RuleBasedBreakIterator();
michael@0 290
michael@0 291 /**
michael@0 292 * Assignment operator. Sets this iterator to have the same behavior,
michael@0 293 * and iterate over the same text, as the one passed in.
michael@0 294 * @param that The RuleBasedBreakIterator passed in
michael@0 295 * @return the newly created RuleBasedBreakIterator
michael@0 296 * @stable ICU 2.0
michael@0 297 */
michael@0 298 RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
michael@0 299
michael@0 300 /**
michael@0 301 * Equality operator. Returns TRUE if both BreakIterators are of the
michael@0 302 * same class, have the same behavior, and iterate over the same text.
michael@0 303 * @param that The BreakIterator to be compared for equality
michael@0 304 * @return TRUE if both BreakIterators are of the
michael@0 305 * same class, have the same behavior, and iterate over the same text.
michael@0 306 * @stable ICU 2.0
michael@0 307 */
michael@0 308 virtual UBool operator==(const BreakIterator& that) const;
michael@0 309
michael@0 310 /**
michael@0 311 * Not-equal operator. If operator== returns TRUE, this returns FALSE,
michael@0 312 * and vice versa.
michael@0 313 * @param that The BreakIterator to be compared for inequality
michael@0 314 * @return TRUE if both BreakIterators are not the same.
michael@0 315 * @stable ICU 2.0
michael@0 316 */
michael@0 317 UBool operator!=(const BreakIterator& that) const;
michael@0 318
michael@0 319 /**
michael@0 320 * Returns a newly-constructed RuleBasedBreakIterator with the same
michael@0 321 * behavior, and iterating over the same text, as this one.
michael@0 322 * Differs from the copy constructor in that it is polymorphic, and
michael@0 323 * will correctly clone (copy) a derived class.
michael@0 324 * clone() is thread safe. Multiple threads may simultaneously
michael@0 325 * clone the same source break iterator.
michael@0 326 * @return a newly-constructed RuleBasedBreakIterator
michael@0 327 * @stable ICU 2.0
michael@0 328 */
michael@0 329 virtual BreakIterator* clone() const;
michael@0 330
michael@0 331 /**
michael@0 332 * Compute a hash code for this BreakIterator
michael@0 333 * @return A hash code
michael@0 334 * @stable ICU 2.0
michael@0 335 */
michael@0 336 virtual int32_t hashCode(void) const;
michael@0 337
michael@0 338 /**
michael@0 339 * Returns the description used to create this iterator
michael@0 340 * @return the description used to create this iterator
michael@0 341 * @stable ICU 2.0
michael@0 342 */
michael@0 343 virtual const UnicodeString& getRules(void) const;
michael@0 344
michael@0 345 //=======================================================================
michael@0 346 // BreakIterator overrides
michael@0 347 //=======================================================================
michael@0 348
michael@0 349 /**
michael@0 350 * <p>
michael@0 351 * Return a CharacterIterator over the text being analyzed.
michael@0 352 * The returned character iterator is owned by the break iterator, and must
michael@0 353 * not be deleted by the caller. Repeated calls to this function may
michael@0 354 * return the same CharacterIterator.
michael@0 355 * </p>
michael@0 356 * <p>
michael@0 357 * The returned character iterator must not be used concurrently with
michael@0 358 * the break iterator. If concurrent operation is needed, clone the
michael@0 359 * returned character iterator first and operate on the clone.
michael@0 360 * </p>
michael@0 361 * <p>
michael@0 362 * When the break iterator is operating on text supplied via a UText,
michael@0 363 * this function will fail. Lacking any way to signal failures, it
michael@0 364 * returns a CharacterIterator containing no text.
michael@0 365 * The function getUText() provides similar functionality,
michael@0 366 * is reliable, and is more efficient.
michael@0 367 * </p>
michael@0 368 *
michael@0 369 * TODO: deprecate this function?
michael@0 370 *
michael@0 371 * @return An iterator over the text being analyzed.
michael@0 372 * @stable ICU 2.0
michael@0 373 */
michael@0 374 virtual CharacterIterator& getText(void) const;
michael@0 375
michael@0 376
michael@0 377 /**
michael@0 378 * Get a UText for the text being analyzed.
michael@0 379 * The returned UText is a shallow clone of the UText used internally
michael@0 380 * by the break iterator implementation. It can safely be used to
michael@0 381 * access the text without impacting any break iterator operations,
michael@0 382 * but the underlying text itself must not be altered.
michael@0 383 *
michael@0 384 * @param fillIn A UText to be filled in. If NULL, a new UText will be
michael@0 385 * allocated to hold the result.
michael@0 386 * @param status receives any error codes.
michael@0 387 * @return The current UText for this break iterator. If an input
michael@0 388 * UText was provided, it will always be returned.
michael@0 389 * @stable ICU 3.4
michael@0 390 */
michael@0 391 virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
michael@0 392
michael@0 393 /**
michael@0 394 * Set the iterator to analyze a new piece of text. This function resets
michael@0 395 * the current iteration position to the beginning of the text.
michael@0 396 * @param newText An iterator over the text to analyze. The BreakIterator
michael@0 397 * takes ownership of the character iterator. The caller MUST NOT delete it!
michael@0 398 * @stable ICU 2.0
michael@0 399 */
michael@0 400 virtual void adoptText(CharacterIterator* newText);
michael@0 401
michael@0 402 /**
michael@0 403 * Set the iterator to analyze a new piece of text. This function resets
michael@0 404 * the current iteration position to the beginning of the text.
michael@0 405 * @param newText The text to analyze.
michael@0 406 * @stable ICU 2.0
michael@0 407 */
michael@0 408 virtual void setText(const UnicodeString& newText);
michael@0 409
michael@0 410 /**
michael@0 411 * Reset the break iterator to operate over the text represented by
michael@0 412 * the UText. The iterator position is reset to the start.
michael@0 413 *
michael@0 414 * This function makes a shallow clone of the supplied UText. This means
michael@0 415 * that the caller is free to immediately close or otherwise reuse the
michael@0 416 * UText that was passed as a parameter, but that the underlying text itself
michael@0 417 * must not be altered while being referenced by the break iterator.
michael@0 418 *
michael@0 419 * @param text The UText used to change the text.
michael@0 420 * @param status Receives any error codes.
michael@0 421 * @stable ICU 3.4
michael@0 422 */
michael@0 423 virtual void setText(UText *text, UErrorCode &status);
michael@0 424
michael@0 425 /**
michael@0 426 * Sets the current iteration position to the beginning of the text.
michael@0 427 * @return The offset of the beginning of the text.
michael@0 428 * @stable ICU 2.0
michael@0 429 */
michael@0 430 virtual int32_t first(void);
michael@0 431
michael@0 432 /**
michael@0 433 * Sets the current iteration position to the end of the text.
michael@0 434 * @return The text's past-the-end offset.
michael@0 435 * @stable ICU 2.0
michael@0 436 */
michael@0 437 virtual int32_t last(void);
michael@0 438
michael@0 439 /**
michael@0 440 * Advances the iterator either forward or backward the specified number of steps.
michael@0 441 * Negative values move backward, and positive values move forward. This is
michael@0 442 * equivalent to repeatedly calling next() or previous().
michael@0 443 * @param n The number of steps to move. The sign indicates the direction
michael@0 444 * (negative is backwards, and positive is forwards).
michael@0 445 * @return The character offset of the boundary position n boundaries away from
michael@0 446 * the current one.
michael@0 447 * @stable ICU 2.0
michael@0 448 */
michael@0 449 virtual int32_t next(int32_t n);
michael@0 450
michael@0 451 /**
michael@0 452 * Advances the iterator to the next boundary position.
michael@0 453 * @return The position of the first boundary after this one.
michael@0 454 * @stable ICU 2.0
michael@0 455 */
michael@0 456 virtual int32_t next(void);
michael@0 457
michael@0 458 /**
michael@0 459 * Moves the iterator backwards, to the last boundary preceding this one.
michael@0 460 * @return The position of the last boundary position preceding this one.
michael@0 461 * @stable ICU 2.0
michael@0 462 */
michael@0 463 virtual int32_t previous(void);
michael@0 464
michael@0 465 /**
michael@0 466 * Sets the iterator to refer to the first boundary position following
michael@0 467 * the specified position.
michael@0 468 * @param offset The position from which to begin searching for a break position.
michael@0 469 * @return The position of the first break after the current position.
michael@0 470 * @stable ICU 2.0
michael@0 471 */
michael@0 472 virtual int32_t following(int32_t offset);
michael@0 473
michael@0 474 /**
michael@0 475 * Sets the iterator to refer to the last boundary position before the
michael@0 476 * specified position.
michael@0 477 * @param offset The position to begin searching for a break from.
michael@0 478 * @return The position of the last boundary before the starting position.
michael@0 479 * @stable ICU 2.0
michael@0 480 */
michael@0 481 virtual int32_t preceding(int32_t offset);
michael@0 482
michael@0 483 /**
michael@0 484 * Returns true if the specified position is a boundary position. As a side
michael@0 485 * effect, leaves the iterator pointing to the first boundary position at
michael@0 486 * or after "offset".
michael@0 487 * @param offset the offset to check.
michael@0 488 * @return True if "offset" is a boundary position.
michael@0 489 * @stable ICU 2.0
michael@0 490 */
michael@0 491 virtual UBool isBoundary(int32_t offset);
michael@0 492
michael@0 493 /**
michael@0 494 * Returns the current iteration position.
michael@0 495 * @return The current iteration position.
michael@0 496 * @stable ICU 2.0
michael@0 497 */
michael@0 498 virtual int32_t current(void) const;
michael@0 499
michael@0 500
michael@0 501 /**
michael@0 502 * Return the status tag from the break rule that determined the most recently
michael@0 503 * returned break position. For break rules that do not specify a
michael@0 504 * status, a default value of 0 is returned. If more than one break rule
michael@0 505 * would cause a boundary to be located at some position in the text,
michael@0 506 * the numerically largest of the applicable status values is returned.
michael@0 507 * <p>
michael@0 508 * Of the standard types of ICU break iterators, only word break and
michael@0 509 * line break provide status values. The values are defined in
michael@0 510 * the header file ubrk.h. For Word breaks, the status allows distinguishing between words
michael@0 511 * that contain alphabetic letters, "words" that appear to be numbers,
michael@0 512 * punctuation and spaces, words containing ideographic characters, and
michael@0 513 * more. For Line Break, the status distinguishes between hard (mandatory) breaks
michael@0 514 * and soft (potential) break positions.
michael@0 515 * <p>
michael@0 516 * <code>getRuleStatus()</code> can be called after obtaining a boundary
michael@0 517 * position from <code>next()</code>, <code>previous()</code>, or
michael@0 518 * any other break iterator function that returns a boundary position.
michael@0 519 * <p>
michael@0 520 * When creating custom break rules, one is free to define whatever
michael@0 521 * status values may be convenient for the application.
michael@0 522 * <p>
michael@0 523 * Note: this function is not thread safe. It should not have been
michael@0 524 * declared const, and the const remains only for compatibility
michael@0 525 * reasons. (The function is logically const, but not bit-wise const).
michael@0 526 * <p>
michael@0 527 * @return the status from the break rule that determined the most recently
michael@0 528 * returned break position.
michael@0 529 *
michael@0 530 * @see UWordBreak
michael@0 531 * @stable ICU 2.2
michael@0 532 */
michael@0 533 virtual int32_t getRuleStatus() const;
michael@0 534
michael@0 535 /**
michael@0 536 * Get the status (tag) values from the break rule(s) that determined the most
michael@0 537 * recently returned break position.
michael@0 538 * <p>
michael@0 539 * The returned status value(s) are stored into an array provided by the caller.
michael@0 540 * The values are stored in sorted (ascending) order.
michael@0 541 * If the capacity of the output array is insufficient to hold the data,
michael@0 542 * the output will be truncated to the available length, and a
michael@0 543 * U_BUFFER_OVERFLOW_ERROR will be signaled.
michael@0 544 *
michael@0 545 * @param fillInVec an array to be filled in with the status values.
michael@0 546 * @param capacity the length of the supplied vector. A length of zero causes
michael@0 547 * the function to return the number of status values, in the
michael@0 548 * normal way, without attempting to store any values.
michael@0 549 * @param status receives error codes.
michael@0 550 * @return The number of rule status values from rules that determined
michael@0 551 * the most recent boundary returned by the break iterator.
michael@0 552 * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
michael@0 553 * is the total number of status values that were available,
michael@0 554 * not the reduced number that were actually returned.
michael@0 555 * @see getRuleStatus
michael@0 556 * @stable ICU 3.0
michael@0 557 */
michael@0 558 virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
michael@0 559
michael@0 560 /**
michael@0 561 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
michael@0 562 * This method is to implement a simple version of RTTI, since not all
michael@0 563 * C++ compilers support genuine RTTI. Polymorphic operator==() and
michael@0 564 * clone() methods call this method.
michael@0 565 *
michael@0 566 * @return The class ID for this object. All objects of a
michael@0 567 * given class have the same class ID. Objects of
michael@0 568 * other classes have different class IDs.
michael@0 569 * @stable ICU 2.0
michael@0 570 */
michael@0 571 virtual UClassID getDynamicClassID(void) const;
michael@0 572
michael@0 573 /**
michael@0 574 * Returns the class ID for this class. This is useful only for
michael@0 575 * comparing to a return value from getDynamicClassID(). For example:
michael@0 576 *
michael@0 577 * Base* polymorphic_pointer = createPolymorphicObject();
michael@0 578 * if (polymorphic_pointer->getDynamicClassID() ==
michael@0 579 * Derived::getStaticClassID()) ...
michael@0 580 *
michael@0 581 * @return The class ID for all objects of this class.
michael@0 582 * @stable ICU 2.0
michael@0 583 */
michael@0 584 static UClassID U_EXPORT2 getStaticClassID(void);
michael@0 585
michael@0 586 /**
michael@0 587 * Deprecated functionality. Use clone() instead.
michael@0 588 *
michael@0 589 * Create a clone (copy) of this break iterator in memory provided
michael@0 590 * by the caller. The idea is to increase performance by avoiding
michael@0 591 * a storage allocation. Use of this function is NOT RECOMMENDED.
michael@0 592 * Performance gains are minimal, and correct buffer management is
michael@0 593 * tricky. Use clone() instead.
michael@0 594 *
michael@0 595 * @param stackBuffer The pointer to the memory into which the cloned object
michael@0 596 * should be placed. If NULL, allocate heap memory
michael@0 597 * for the cloned object.
michael@0 598 * @param BufferSize The size of the buffer. If zero, return the required
michael@0 599 * buffer size, but do not clone the object. If the
michael@0 600 * size was too small (but not zero), allocate heap
michael@0 601 * storage for the cloned object.
michael@0 602 *
michael@0 603 * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
michael@0 604 * returned if the provided buffer was too small, and
michael@0 605 * the clone was therefore put on the heap.
michael@0 606 *
michael@0 607 * @return Pointer to the clone object. This may differ from the stackBuffer
michael@0 608 * address if the byte alignment of the stack buffer was not suitable
michael@0 609 * or if the stackBuffer was too small to hold the clone.
michael@0 610 * @deprecated ICU 52. Use clone() instead.
michael@0 611 */
michael@0 612 virtual BreakIterator * createBufferClone(void *stackBuffer,
michael@0 613 int32_t &BufferSize,
michael@0 614 UErrorCode &status);
michael@0 615
michael@0 616
michael@0 617 /**
michael@0 618 * Return the binary form of compiled break rules,
michael@0 619 * which can then be used to create a new break iterator at some
michael@0 620 * time in the future. Creating a break iterator from pre-compiled rules
michael@0 621 * is much faster than building one from the source form of the
michael@0 622 * break rules.
michael@0 623 *
michael@0 624 * The binary data can only be used with the same version of ICU
michael@0 625 * and on the same platform type (processor endian-ness)
michael@0 626 *
michael@0 627 * @param length Returns the length of the binary data. (Out parameter.)
michael@0 628 *
michael@0 629 * @return A pointer to the binary (compiled) rule data. The storage
michael@0 630 * belongs to the RuleBasedBreakIterator object, not the
michael@0 631 * caller, and must not be modified or deleted.
michael@0 632 * @stable ICU 4.8
michael@0 633 */
michael@0 634 virtual const uint8_t *getBinaryRules(uint32_t &length);
michael@0 635
michael@0 636 /**
michael@0 637 * Set the subject text string upon which the break iterator is operating
michael@0 638 * without changing any other aspect of the matching state.
michael@0 639 * The new and previous text strings must have the same content.
michael@0 640 *
michael@0 641 * This function is intended for use in environments where ICU is operating on
michael@0 642 * strings that may move around in memory. It provides a mechanism for notifying
michael@0 643 * ICU that the string has been relocated, and providing a new UText to access the
michael@0 644 * string in its new position.
michael@0 645 *
michael@0 646 * Note that the break iterator implementation never copies the underlying text
michael@0 647 * of a string being processed, but always operates directly on the original text
michael@0 648 * provided by the user. Refreshing simply drops the references to the old text
michael@0 649 * and replaces them with references to the new.
michael@0 650 *
michael@0 651 * Caution: this function is normally used only by very specialized,
michael@0 652 * system-level code. One example use case is with garbage collection that moves
michael@0 653 * the text in memory.
michael@0 654 *
michael@0 655 * @param input The new (moved) text string.
michael@0 656 * @param status Receives errors detected by this function.
michael@0 657 * @return *this
michael@0 658 *
michael@0 659 * @stable ICU 49
michael@0 660 */
michael@0 661 virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
michael@0 662
michael@0 663
michael@0 664 protected:
michael@0 665 //=======================================================================
michael@0 666 // implementation
michael@0 667 //=======================================================================
michael@0 668 /**
michael@0 669 * Dumps caches and performs other actions associated with a complete change
michael@0 670 * in text or iteration position.
michael@0 671 * @internal
michael@0 672 */
michael@0 673 virtual void reset(void);
michael@0 674
michael@0 675 #if 0
michael@0 676 /**
michael@0 677 * Return true if the category lookup for this char
michael@0 678 * indicates that it is in the set of dictionary lookup chars.
michael@0 679 * This function is intended for use by dictionary based break iterators.
michael@0 680 * @return true if the category lookup for this char
michael@0 681 * indicates that it is in the set of dictionary lookup chars.
michael@0 682 * @internal
michael@0 683 */
michael@0 684 virtual UBool isDictionaryChar(UChar32);
michael@0 685
michael@0 686 /**
michael@0 687 * Get the type of the break iterator.
michael@0 688 * @internal
michael@0 689 */
michael@0 690 virtual int32_t getBreakType() const;
michael@0 691 #endif
michael@0 692
michael@0 693 /**
michael@0 694 * Set the type of the break iterator.
michael@0 695 * @internal
michael@0 696 */
michael@0 697 virtual void setBreakType(int32_t type);
michael@0 698
michael@0 699 #ifndef U_HIDE_INTERNAL_API
michael@0 700 /**
michael@0 701 * Common initialization function, used by constructors and bufferClone.
michael@0 702 * @internal
michael@0 703 */
michael@0 704 void init();
michael@0 705 #endif /* U_HIDE_INTERNAL_API */
michael@0 706
michael@0 707 private:
michael@0 708
michael@0 709 /**
michael@0 710 * This method backs the iterator back up to a "safe position" in the text.
michael@0 711 * This is a position that we know, without any context, must be a break position.
michael@0 712 * The various calling methods then iterate forward from this safe position to
michael@0 713 * the appropriate position to return. (For more information, see the description
michael@0 714 * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
michael@0 715 * @param statetable state table used for moving backwards
michael@0 716 * @internal
michael@0 717 */
michael@0 718 int32_t handlePrevious(const RBBIStateTable *statetable);
michael@0 719
michael@0 720 /**
michael@0 721 * This method is the actual implementation of the next() method. All iteration
michael@0 722 * vectors through here. This method initializes the state machine to state 1
michael@0 723 * and advances through the text character by character until we reach the end
michael@0 724 * of the text or the state machine transitions to state 0. We update our return
michael@0 725 * value every time the state machine passes through a possible end state.
michael@0 726 * @param statetable state table used for moving forwards
michael@0 727 * @internal
michael@0 728 */
michael@0 729 int32_t handleNext(const RBBIStateTable *statetable);
michael@0 730
michael@0 731 protected:
michael@0 732
michael@0 733 #ifndef U_HIDE_INTERNAL_API
michael@0 734 /**
michael@0 735 * This is the function that actually implements dictionary-based
michael@0 736 * breaking. Covering at least the range from startPos to endPos,
michael@0 737 * it checks for dictionary characters, and if it finds them determines
michael@0 738 * the appropriate object to deal with them. It may cache found breaks in
michael@0 739 * fCachedBreakPositions as it goes. It may well also look at text outside
michael@0 740 * the range startPos to endPos.
michael@0 741 * If going forward, endPos is the normal Unicode break result, and
michael@0 742 * if going in reverse, startPos is the normal Unicode break result.
michael@0 743 * @param startPos The start position of a range of text
michael@0 744 * @param endPos The end position of a range of text
michael@0 745 * @param reverse The call is for the reverse direction
michael@0 746 * @internal
michael@0 747 */
michael@0 748 int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
michael@0 749 #endif /* U_HIDE_INTERNAL_API */
michael@0 750
michael@0 751 private:
michael@0 752
michael@0 753 /**
michael@0 754 * This function returns the appropriate LanguageBreakEngine for a
michael@0 755 * given character c.
michael@0 756 * @param c A character in the dictionary set
michael@0 757 * @internal
michael@0 758 */
michael@0 759 const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
michael@0 760
michael@0 761 /**
michael@0 762 * @internal
michael@0 763 */
michael@0 764 void makeRuleStatusValid();
michael@0 765
michael@0 766 };
michael@0 767
michael@0 768 //------------------------------------------------------------------------------
michael@0 769 //
michael@0 770 // Inline Functions Definitions ...
michael@0 771 //
michael@0 772 //------------------------------------------------------------------------------
michael@0 773
michael@0 774 inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
michael@0 775 return !(this->operator==(that)); // inequality is defined as the negation of operator==
michael@0 776 }
michael@0 777
michael@0 778 U_NAMESPACE_END
michael@0 779
michael@0 780 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
michael@0 781
michael@0 782 #endif

mercurial