michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2001-2011 IBM and others. All rights reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 03/22/2000 helena Creation. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #ifndef SEARCH_H michael@0: #define SEARCH_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: /** michael@0: * \file michael@0: * \brief C++ API: SearchIterator object. michael@0: */ michael@0: michael@0: #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION michael@0: michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/chariter.h" michael@0: #include "unicode/brkiter.h" michael@0: #include "unicode/usearch.h" michael@0: michael@0: /** michael@0: * @stable ICU 2.0 michael@0: */ michael@0: struct USearch; michael@0: /** michael@0: * @stable ICU 2.0 michael@0: */ michael@0: typedef struct USearch USearch; michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /** michael@0: * michael@0: * SearchIterator is an abstract base class that provides michael@0: * methods to search for a pattern within a text string. Instances of michael@0: * SearchIterator maintain a current position and scans over the michael@0: * target text, returning the indices the pattern is matched and the length michael@0: * of each match. michael@0: *

michael@0: * SearchIterator defines a protocol for text searching. michael@0: * Subclasses provide concrete implementations of various search algorithms. michael@0: * For example, StringSearch implements language-sensitive pattern michael@0: * matching based on the comparison rules defined in a michael@0: * RuleBasedCollator object. michael@0: *

michael@0: * Other options for searching include using a BreakIterator to restrict michael@0: * the points at which matches are detected. michael@0: *

michael@0: * SearchIterator provides an API that is similar to that of michael@0: * other text iteration classes such as BreakIterator. Using michael@0: * this class, it is easy to scan through text looking for all occurrences of michael@0: * a given pattern. The following example uses a StringSearch michael@0: * object to find all instances of "fox" in the target string. Any other michael@0: * subclass of SearchIterator can be used in an identical michael@0: * manner. michael@0: *


michael@0:  * UnicodeString target("The quick brown fox jumped over the lazy fox");
michael@0:  * UnicodeString pattern("fox");
michael@0:  *
michael@0:  * SearchIterator *iter  = new StringSearch(pattern, target);
michael@0:  * UErrorCode      error = U_ZERO_ERROR;
michael@0:  * for (int pos = iter->first(error); pos != USEARCH_DONE; 
michael@0:  *                               pos = iter->next(error)) {
michael@0:  *     printf("Found match at %d pos, length is %d\n", pos, 
michael@0:  *                                             iter->getMatchedLength());
michael@0:  * }
michael@0:  * 
michael@0: * michael@0: * @see StringSearch michael@0: * @see RuleBasedCollator michael@0: */ michael@0: class U_I18N_API SearchIterator : public UObject { michael@0: michael@0: public: michael@0: michael@0: // public constructors and destructors ------------------------------- michael@0: michael@0: /** michael@0: * Copy constructor that creates a SearchIterator instance with the same michael@0: * behavior, and iterating over the same text. michael@0: * @param other the SearchIterator instance to be copied. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: SearchIterator(const SearchIterator &other); michael@0: michael@0: /** michael@0: * Destructor. Cleans up the search iterator data struct. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual ~SearchIterator(); michael@0: michael@0: // public get and set methods ---------------------------------------- michael@0: michael@0: /** michael@0: * Sets the index to point to the given position, and clears any state michael@0: * that's affected. michael@0: *

michael@0: * This method takes the argument index and sets the position in the text michael@0: * string accordingly without checking if the index is pointing to a michael@0: * valid starting point to begin searching. michael@0: * @param position within the text to be set. If position is less michael@0: * than or greater than the text range for searching, michael@0: * an U_INDEX_OUTOFBOUNDS_ERROR will be returned michael@0: * @param status for errors if it occurs michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual void setOffset(int32_t position, UErrorCode &status) = 0; michael@0: michael@0: /** michael@0: * Return the current index in the text being searched. michael@0: * If the iteration has gone past the end of the text michael@0: * (or past the beginning for a backwards search), USEARCH_DONE michael@0: * is returned. michael@0: * @return current index in the text being searched. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual int32_t getOffset(void) const = 0; michael@0: michael@0: /** michael@0: * Sets the text searching attributes located in the enum michael@0: * USearchAttribute with values from the enum USearchAttributeValue. michael@0: * USEARCH_DEFAULT can be used for all attributes for resetting. 
michael@0: * @param attribute text attribute (enum USearchAttribute) to be set michael@0: * @param value text attribute value michael@0: * @param status for errors if it occurs michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setAttribute(USearchAttribute attribute, michael@0: USearchAttributeValue value, michael@0: UErrorCode &status); michael@0: michael@0: /** michael@0: * Gets the text searching attributes michael@0: * @param attribute text attribute (enum USearchAttribute) to be retrieved michael@0: * @return text attribute value michael@0: * @stable ICU 2.0 michael@0: */ michael@0: USearchAttributeValue getAttribute(USearchAttribute attribute) const; michael@0: michael@0: /** michael@0: * Returns the index to the match in the text string that was searched. michael@0: * This call returns a valid result only after a successful call to michael@0: * first, next, previous, or last. michael@0: * Just after construction, or after a searching method returns michael@0: * USEARCH_DONE, this method will return USEARCH_DONE. michael@0: *

michael@0: * Use getMatchedLength to get the matched string length. michael@0: * @return index of a substring within the text string that is being michael@0: * searched. michael@0: * @see #first michael@0: * @see #next michael@0: * @see #previous michael@0: * @see #last michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t getMatchedStart(void) const; michael@0: michael@0: /** michael@0: * Returns the length of text in the string which matches the search michael@0: * pattern. This call returns a valid result only after a successful call michael@0: * to first, next, previous, or last. michael@0: * Just after construction, or after a searching method returns michael@0: * USEARCH_DONE, this method will return 0. michael@0: * @return The length of the match in the target text, or 0 if there michael@0: * is no match currently. michael@0: * @see #first michael@0: * @see #next michael@0: * @see #previous michael@0: * @see #last michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t getMatchedLength(void) const; michael@0: michael@0: /** michael@0: * Returns the text that was matched by the most recent call to michael@0: * first, next, previous, or last. michael@0: * If the iterator is not pointing at a valid match (e.g. just after michael@0: * construction or after USEARCH_DONE has been returned), michael@0: * returns an empty string. michael@0: * @param result stores the matched string or an empty string if a match michael@0: * is not found. michael@0: * @see #first michael@0: * @see #next michael@0: * @see #previous michael@0: * @see #last michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void getMatchedText(UnicodeString &result) const; michael@0: michael@0: /** michael@0: * Set the BreakIterator that will be used to restrict the points michael@0: * at which matches are detected. The user is responsible for deleting michael@0: * the breakiterator. 
michael@0: * @param breakiter A BreakIterator that will be used to restrict the michael@0: * points at which matches are detected. If a match is michael@0: * found, but the match's start or end index is not a michael@0: * boundary as determined by the BreakIterator, michael@0: * the match will be rejected and another will be searched michael@0: * for. If this parameter is NULL, no break michael@0: * detection is attempted. michael@0: * @param status for errors if it occurs michael@0: * @see BreakIterator michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); michael@0: michael@0: /** michael@0: * Returns the BreakIterator that is used to restrict the points at michael@0: * which matches are detected. This will be the same object that was michael@0: * passed to the constructor or to setBreakIterator. michael@0: * Note that NULL is a legal value; it means that break michael@0: * detection should not be attempted. michael@0: * @return BreakIterator used to restrict matchings. michael@0: * @see #setBreakIterator michael@0: * @stable ICU 2.0 michael@0: */ michael@0: const BreakIterator * getBreakIterator(void) const; michael@0: michael@0: /** michael@0: * Set the string text to be searched. Text iteration will hence begin at michael@0: * the start of the text string. This method is useful if you want to michael@0: * re-use an iterator to search for the same pattern within a different michael@0: * body of text. The user is responsible for deleting the text. michael@0: * @param text string to be searched. michael@0: * @param status for errors. If the text length is 0, michael@0: * an U_ILLEGAL_ARGUMENT_ERROR is returned. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual void setText(const UnicodeString &text, UErrorCode &status); michael@0: michael@0: /** michael@0: * Set the string text to be searched. Text iteration will hence begin at michael@0: * the start of the text string. 
This method is useful if you want to michael@0: * re-use an iterator to search for the same pattern within a different michael@0: * body of text. michael@0: *

michael@0: * Note: No parsing of the text within the CharacterIterator michael@0: * will be done during searching for this version. The block of text michael@0: * in CharacterIterator will be used as it is. michael@0: * The user is responsible for deleting the text. michael@0: * @param text string iterator to be searched. michael@0: * @param status for errors if any. If the text length is 0 then an michael@0: * U_ILLEGAL_ARGUMENT_ERROR is returned. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual void setText(CharacterIterator &text, UErrorCode &status); michael@0: michael@0: /** michael@0: * Return the string text to be searched. michael@0: * @return text string to be searched. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: const UnicodeString & getText(void) const; michael@0: michael@0: // operator overloading ---------------------------------------------- michael@0: michael@0: /** michael@0: * Equality operator. michael@0: * @param that SearchIterator instance to be compared. michael@0: * @return TRUE if both SearchIterators are of the same class, have the michael@0: * same behavior, iterate over the same text and have the same michael@0: * attributes. FALSE otherwise. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual UBool operator==(const SearchIterator &that) const; michael@0: michael@0: /** michael@0: * Not-equal operator. michael@0: * @param that SearchIterator instance to be compared. michael@0: * @return FALSE if operator== returns TRUE, and vice versa. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UBool operator!=(const SearchIterator &that) const; michael@0: michael@0: // public methods ---------------------------------------------------- michael@0: michael@0: /** michael@0: * Returns a copy of SearchIterator with the same behavior, and michael@0: * iterating over the same text, as this one. Note that all data will be michael@0: * replicated, except for the text string to be searched. 
michael@0: * @return cloned object michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual SearchIterator* safeClone(void) const = 0; michael@0: michael@0: /** michael@0: * Returns the first index at which the string text matches the search michael@0: * pattern. The iterator is adjusted so that its current index (as michael@0: * returned by getOffset) is the match position if one michael@0: * was found. michael@0: * If a match is not found, USEARCH_DONE will be returned and michael@0: * the iterator will be adjusted to the index USEARCH_DONE michael@0: * @param status for errors if it occurs michael@0: * @return The character index of the first match, or michael@0: * USEARCH_DONE if there are no matches. michael@0: * @see #getOffset michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t first(UErrorCode &status); michael@0: michael@0: /** michael@0: * Returns the first index equal to or greater than position at which the michael@0: * string text matches the search pattern. The iterator is adjusted so michael@0: * that its current index (as returned by getOffset) is the michael@0: * match position if one was found. michael@0: * If a match is not found, USEARCH_DONE will be returned and the michael@0: * iterator will be adjusted to the index USEARCH_DONE. michael@0: * @param position where search is to start from. If position is less michael@0: * than or greater than the text range for searching, michael@0: * an U_INDEX_OUTOFBOUNDS_ERROR will be returned michael@0: * @param status for errors if it occurs michael@0: * @return The character index of the first match following michael@0: * position, or USEARCH_DONE if there are no michael@0: * matches. michael@0: * @see #getOffset michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t following(int32_t position, UErrorCode &status); michael@0: michael@0: /** michael@0: * Returns the last index in the target text at which it matches the michael@0: * search pattern. 
The iterator is adjusted so that its current index michael@0: * (as returned by getOffset) is the match position if one was michael@0: * found. michael@0: * If a match is not found, USEARCH_DONE will be returned and michael@0: * the iterator will be adjusted to the index USEARCH_DONE. michael@0: * @param status for errors if it occurs michael@0: * @return The index of the first match, or USEARCH_DONE if michael@0: * there are no matches. michael@0: * @see #getOffset michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t last(UErrorCode &status); michael@0: michael@0: /** michael@0: * Returns the first index less than position at which the string michael@0: * text matches the search pattern. The iterator is adjusted so that its michael@0: * current index (as returned by getOffset) is the match michael@0: * position if one was found. If a match is not found, michael@0: * USEARCH_DONE will be returned and the iterator will be michael@0: * adjusted to the index USEARCH_DONE michael@0: *

michael@0: * When USEARCH_OVERLAP option is off, the last index of the michael@0: * result match is always less than position. michael@0: * When USEARCH_OVERLAP is on, the result match may span across michael@0: * position. michael@0: * michael@0: * @param position where search is to start from. If position is less michael@0: * than or greater than the text range for searching, michael@0: * an U_INDEX_OUTOFBOUNDS_ERROR will be returned michael@0: * @param status for errors if it occurs michael@0: * @return The character index of the first match preceding michael@0: * position, or USEARCH_DONE if there are michael@0: * no matches. michael@0: * @see #getOffset michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t preceding(int32_t position, UErrorCode &status); michael@0: michael@0: /** michael@0: * Returns the index of the next point at which the text matches the michael@0: * search pattern, starting from the current position. michael@0: * The iterator is adjusted so that its current index (as returned by michael@0: * getOffset) is the match position if one was found. michael@0: * If a match is not found, USEARCH_DONE will be returned and michael@0: * the iterator will be adjusted to a position after the end of the text michael@0: * string. michael@0: * @param status for errors if it occurs michael@0: * @return The index of the next match after the current position, michael@0: * or USEARCH_DONE if there are no more matches. michael@0: * @see #getOffset michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t next(UErrorCode &status); michael@0: michael@0: /** michael@0: * Returns the index of the previous point at which the string text michael@0: * matches the search pattern, starting at the current position. michael@0: * The iterator is adjusted so that its current index (as returned by michael@0: * getOffset) is the match position if one was found. 
michael@0: * If a match is not found, USEARCH_DONE will be returned and michael@0: * the iterator will be adjusted to the index USEARCH_DONE michael@0: * @param status for errors if it occurs michael@0: * @return The index of the previous match before the current position, michael@0: * or USEARCH_DONE if there are no more matches. michael@0: * @see #getOffset michael@0: * @stable ICU 2.0 michael@0: */ michael@0: int32_t previous(UErrorCode &status); michael@0: michael@0: /** michael@0: * Resets the iteration. michael@0: * Search will begin at the start of the text string if a forward michael@0: * iteration is initiated before a backwards iteration. Otherwise if a michael@0: * backwards iteration is initiated before a forwards iteration, the michael@0: * search will begin at the end of the text string. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual void reset(); michael@0: michael@0: protected: michael@0: // protected data members --------------------------------------------- michael@0: michael@0: /** michael@0: * C search data struct michael@0: * @stable ICU 2.0 michael@0: */ michael@0: USearch *m_search_; michael@0: michael@0: /** michael@0: * Break iterator. michael@0: * Currently the C++ breakiterator does not have getRules etc to reproduce michael@0: * another in C. Hence we keep the original around and do the verification michael@0: * at the end of the match. The user is responsible for deleting this michael@0: * break iterator. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: BreakIterator *m_breakiterator_; michael@0: michael@0: /** michael@0: * Unicode string version of the search text michael@0: * @stable ICU 2.0 michael@0: */ michael@0: UnicodeString m_text_; michael@0: michael@0: // protected constructors and destructors ----------------------------- michael@0: michael@0: /** michael@0: * Default constructor. michael@0: * Initializes data to the default values. 
michael@0: * @stable ICU 2.0 michael@0: */ michael@0: SearchIterator(); michael@0: michael@0: /** michael@0: * Constructor for use by subclasses. michael@0: * @param text The target text to be searched. michael@0: * @param breakiter A {@link BreakIterator} that is used to restrict the michael@0: * points at which matches are detected. If michael@0: * handleNext or handlePrev finds a michael@0: * match, but the match's start or end index is not a michael@0: * boundary as determined by the BreakIterator, michael@0: * the match is rejected and handleNext or michael@0: * handlePrev is called again. If this parameter michael@0: * is NULL, no break detection is attempted. michael@0: * @see #handleNext michael@0: * @see #handlePrev michael@0: * @stable ICU 2.0 michael@0: */ michael@0: SearchIterator(const UnicodeString &text, michael@0: BreakIterator *breakiter = NULL); michael@0: michael@0: /** michael@0: * Constructor for use by subclasses. michael@0: *

michael@0: * Note: No parsing of the text within the CharacterIterator michael@0: * will be done during searching for this version. The block of text michael@0: * in CharacterIterator will be used as it is. michael@0: * @param text The target text to be searched. michael@0: * @param breakiter A {@link BreakIterator} that is used to restrict the michael@0: * points at which matches are detected. If michael@0: * handleNext or handlePrev finds a michael@0: * match, but the match's start or end index is not a michael@0: * boundary as determined by the BreakIterator, michael@0: * the match is rejected and handleNext or michael@0: * handlePrev is called again. If this parameter michael@0: * is NULL, no break detection is attempted. michael@0: * @see #handleNext michael@0: * @see #handlePrev michael@0: * @stable ICU 2.0 michael@0: */ michael@0: SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); michael@0: michael@0: // protected methods -------------------------------------------------- michael@0: michael@0: /** michael@0: * Assignment operator. Sets this iterator to have the same behavior, michael@0: * and iterate over the same text, as the one passed in. michael@0: * @param that instance to be copied. michael@0: * @stable ICU 2.0 michael@0: */ michael@0: SearchIterator & operator=(const SearchIterator &that); michael@0: michael@0: /** michael@0: * Abstract method which subclasses override to provide the mechanism michael@0: * for finding the next match in the target text. This allows different michael@0: * subclasses to provide different search algorithms. michael@0: *

michael@0: * If a match is found, the implementation should return the index at michael@0: * which the match starts and should call michael@0: * setMatchLength with the number of characters michael@0: * in the target text that make up the match. If no match is found, the michael@0: * method should return USEARCH_DONE. michael@0: *

michael@0: * @param position The index in the target text at which the search michael@0: * should start. michael@0: * @param status for error codes if it occurs. michael@0: * @return index at which the match starts, else if match is not found michael@0: * USEARCH_DONE is returned michael@0: * @see #setMatchLength michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual int32_t handleNext(int32_t position, UErrorCode &status) michael@0: = 0; michael@0: michael@0: /** michael@0: * Abstract method which subclasses override to provide the mechanism for michael@0: * finding the previous match in the target text. This allows different michael@0: * subclasses to provide different search algorithms. michael@0: *

michael@0: * If a match is found, the implementation should return the index at michael@0: * which the match starts and should call michael@0: * setMatchLength with the number of characters michael@0: * in the target text that make up the match. If no match is found, the michael@0: * method should return USEARCH_DONE. michael@0: *

michael@0: * @param position The index in the target text at which the search michael@0: * should start. michael@0: * @param status for error codes if it occurs. michael@0: * @return index at which the match starts, else if match is not found michael@0: * USEARCH_DONE is returned michael@0: * @see #setMatchLength michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual int32_t handlePrev(int32_t position, UErrorCode &status) michael@0: = 0; michael@0: michael@0: /** michael@0: * Sets the length of the currently matched string in the text string to michael@0: * be searched. michael@0: * Subclasses' handleNext and handlePrev michael@0: * methods should call this when they find a match in the target text. michael@0: * @param length length of the matched text. michael@0: * @see #handleNext michael@0: * @see #handlePrev michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual void setMatchLength(int32_t length); michael@0: michael@0: /** michael@0: * Sets the offset of the currently matched string in the text string to michael@0: * be searched. michael@0: * Subclasses' handleNext and handlePrev michael@0: * methods should call this when they find a match in the target text. michael@0: * @param position start offset of the matched text. michael@0: * @see #handleNext michael@0: * @see #handlePrev michael@0: * @stable ICU 2.0 michael@0: */ michael@0: virtual void setMatchStart(int32_t position); michael@0: michael@0: /** michael@0: * sets match not found michael@0: * @stable ICU 2.0 michael@0: */ michael@0: void setMatchNotFound(); michael@0: }; michael@0: michael@0: inline UBool SearchIterator::operator!=(const SearchIterator &that) const michael@0: { michael@0: return !operator==(that); michael@0: } michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_COLLATION */ michael@0: michael@0: #endif michael@0: