intl/icu/source/i18n/unicode/search.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 **********************************************************************
     3 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
     4 **********************************************************************
     5 *   Date        Name        Description
     6 *  03/22/2000   helena      Creation.
     7 **********************************************************************
     8 */
    10 #ifndef SEARCH_H
    11 #define SEARCH_H
    13 #include "unicode/utypes.h"
    15 /**
    16  * \file 
    17  * \brief C++ API: SearchIterator object.
    18  */
    20 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
    22 #include "unicode/uobject.h"
    23 #include "unicode/unistr.h"
    24 #include "unicode/chariter.h"
    25 #include "unicode/brkiter.h"
    26 #include "unicode/usearch.h"
    28 /** Forward declaration of the C search data struct.
    29 * @stable ICU 2.0
    30 */
    31 struct USearch;
    32 /** Typedef that lets the struct be named without the struct keyword.
    33 * @stable ICU 2.0
    34 */
    35 typedef struct USearch USearch;
    37 U_NAMESPACE_BEGIN
    39 /**
    40  *
    41  * <tt>SearchIterator</tt> is an abstract base class that provides 
    42  * methods to search for a pattern within a text string. Instances of
    43  * <tt>SearchIterator</tt> maintain a current position and scan over the 
    44  * target text, returning the indices at which the pattern is matched and 
    45  * the length of each match.
    46  * <p>
    47  * <tt>SearchIterator</tt> defines a protocol for text searching. 
    48  * Subclasses provide concrete implementations of various search algorithms. 
    49  * For example, <tt>StringSearch</tt> implements language-sensitive pattern 
    50  * matching based on the comparison rules defined in a 
    51  * <tt>RuleBasedCollator</tt> object. 
    52  * <p> 
    53  * Other options for searching include using a BreakIterator to restrict 
    54  * the points at which matches are detected.
    55  * <p>
    56  * <tt>SearchIterator</tt> provides an API that is similar to that of
    57  * other text iteration classes such as <tt>BreakIterator</tt>. Using 
    58  * this class, it is easy to scan through text looking for all occurrences of 
    59  * a given pattern. The following example uses a <tt>StringSearch</tt> 
    60  * object to find all instances of "fox" in the target string. Any other 
    61  * subclass of <tt>SearchIterator</tt> can be used in an identical 
    62  * manner.
    63  * <pre><code>
    64  * UnicodeString target("The quick brown fox jumped over the lazy fox");
    65  * UnicodeString pattern("fox");
    66  *
    67  * SearchIterator *iter  = new StringSearch(pattern, target);
    68  * UErrorCode      error = U_ZERO_ERROR;
    69  * for (int pos = iter->first(error); pos != USEARCH_DONE; 
    70  *                               pos = iter->next(error)) {
    71  *     printf("Found match at %d pos, length is %d\n", pos, 
    72  *                                             iter->getMatchedLength());
    73  * }
    74  * </code></pre>
    75  *
    76  * @see StringSearch
    77  * @see RuleBasedCollator
    78  */
    79 class U_I18N_API SearchIterator : public UObject {
    81 public:
    83     // public constructors and destructors -------------------------------
    85     /** 
    86     * Copy constructor that creates a SearchIterator instance with the same 
    87     * behavior, and iterating over the same text. 
    88     * @param other the SearchIterator instance to be copied.
    89     * @stable ICU 2.0
    90     */
    91     SearchIterator(const SearchIterator &other);
    93     /**
    94      * Destructor. Cleans up the search iterator data struct.
    95      * @stable ICU 2.0
    96      */
    97     virtual ~SearchIterator();
    99     // public get and set methods ----------------------------------------
   101     /**
   102      * Sets the index to point to the given position, and clears any state 
   103      * that's affected.
   104      * <p>
   105      * This method takes the argument index and sets the position in the text 
   106      * string accordingly without checking if the index is pointing to a 
   107      * valid starting point to begin searching. 
   108      * @param position within the text to be set. If position is less
   109      *             than or greater than the text range for searching, 
   110      *             a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
   111      * @param status for errors if they occur
   112      * @stable ICU 2.0
   113      */
   114     virtual void setOffset(int32_t position, UErrorCode &status) = 0;
   116     /**
   117      * Return the current index in the text being searched.
   118      * If the iteration has gone past the end of the text
   119      * (or past the beginning for a backwards search), USEARCH_DONE
   120      * is returned.
   121      * @return current index in the text being searched.
   122      * @stable ICU 2.0
   123      */
   124     virtual int32_t getOffset(void) const = 0;
   126     /**
   127     * Sets the text searching attributes located in the enum 
   128     * USearchAttribute with values from the enum USearchAttributeValue.
   129     * USEARCH_DEFAULT can be used for all attributes for resetting.
   130     * @param attribute text attribute (enum USearchAttribute) to be set
   131     * @param value text attribute value
   132     * @param status for errors if they occur
   133     * @stable ICU 2.0
   134     */
   135     void setAttribute(USearchAttribute       attribute,
   136                       USearchAttributeValue  value,
   137                       UErrorCode            &status);
   139     /**    
   140     * Gets the text searching attributes.
   141     * @param attribute text attribute (enum USearchAttribute) to be retrieved
   142     * @return text attribute value
   143     * @stable ICU 2.0
   144     */
   145     USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
   147     /**
   148     * Returns the index to the match in the text string that was searched.
   149     * This call returns a valid result only after a successful call to 
   150     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
   151     * Just after construction, or after a searching method returns 
   152     * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
   153     * <p>
   154     * Use getMatchedLength to get the matched string length.
   155     * @return index of a substring within the text string that is being 
   156     *         searched.
   157     * @see #first
   158     * @see #next
   159     * @see #previous
   160     * @see #last
   161     * @stable ICU 2.0
   162     */
   163     int32_t getMatchedStart(void) const;
   165     /**
   166      * Returns the length of text in the string which matches the search 
   167      * pattern. This call returns a valid result only after a successful call 
   168      * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
   169      * Just after construction, or after a searching method returns 
   170      * <tt>USEARCH_DONE</tt>, this method will return 0.
   171      * @return The length of the match in the target text, or 0 if there
   172      *         is no match currently.
   173      * @see #first
   174      * @see #next
   175      * @see #previous
   176      * @see #last
   177      * @stable ICU 2.0
   178      */
   179     int32_t getMatchedLength(void) const;
   181     /**
   182      * Returns the text that was matched by the most recent call to 
   183      * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
   184      * If the iterator is not pointing at a valid match (e.g. just after 
   185      * construction or after <tt>USEARCH_DONE</tt> has been returned), 
   186      * returns an empty string. 
   187      * @param result stores the matched string or an empty string if a match
   188      *        is not found.
   189      * @see #first
   190      * @see #next
   191      * @see #previous
   192      * @see #last
   193      * @stable ICU 2.0
   194      */
   195     void getMatchedText(UnicodeString &result) const;
   197     /**
   198      * Set the BreakIterator that will be used to restrict the points
   199      * at which matches are detected. The user is responsible for deleting 
   200      * the breakiterator.
   201      * @param breakiter A BreakIterator that will be used to restrict the 
   202      *                points at which matches are detected. If a match is 
   203      *                found, but the match's start or end index is not a 
   204      *                boundary as determined by the <tt>BreakIterator</tt>, 
   205      *                the match will be rejected and another will be searched 
   206      *                for. If this parameter is <tt>NULL</tt>, no break
   207      *                detection is attempted.
   208      * @param status for errors if they occur
   209      * @see BreakIterator
   210      * @stable ICU 2.0
   211      */
   212     void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
   214     /**
   215      * Returns the BreakIterator that is used to restrict the points at 
   216      * which matches are detected.  This will be the same object that was 
   217      * passed to the constructor or to <tt>setBreakIterator</tt>.
   218      * Note that <tt>NULL</tt> is a legal value; it means that break
   219      * detection should not be attempted.
   220      * @return BreakIterator used to restrict matchings.
   221      * @see #setBreakIterator
   222      * @stable ICU 2.0
   223      */
   224     const BreakIterator * getBreakIterator(void) const;
   226     /**
   227      * Set the string text to be searched. Text iteration will hence begin at 
   228      * the start of the text string. This method is useful if you want to 
   229      * re-use an iterator to search for the same pattern within a different 
   230      * body of text. The user is responsible for deleting the text.
   231      * @param text string to be searched.
   232      * @param status for errors. If the text length is 0, 
   233      *        a U_ILLEGAL_ARGUMENT_ERROR is returned.
   234      * @stable ICU 2.0
   235      */
   236     virtual void setText(const UnicodeString &text, UErrorCode &status);    
   238     /**
   239      * Set the string text to be searched. Text iteration will hence begin at 
   240      * the start of the text string. This method is useful if you want to 
   241      * re-use an iterator to search for the same pattern within a different 
   242      * body of text.
   243      * <p>
   244      * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
   245      * will be done during searching for this version. The block of text 
   246      * in <tt>CharacterIterator</tt> will be used as it is.
   247      * The user is responsible for deleting the text.
   248      * @param text string iterator to be searched.
   249      * @param status for errors if any. If the text length is 0 then a 
   250      *        U_ILLEGAL_ARGUMENT_ERROR is returned.
   251      * @stable ICU 2.0
   252      */
   253     virtual void setText(CharacterIterator &text, UErrorCode &status);
   255     /**
   256      * Return the string text to be searched.
   257      * @return text string to be searched.
   258      * @stable ICU 2.0
   259      */
   260     const UnicodeString & getText(void) const;
   262     // operator overloading ----------------------------------------------
   264     /**
   265      * Equality operator. 
   266      * @param that SearchIterator instance to be compared.
   267      * @return TRUE if both SearchIterators are of the same class, have the 
   268      *         same behavior, iterate over the same text and have the same
   269      *         attributes. FALSE otherwise.
   270      * @stable ICU 2.0
   271      */
   272     virtual UBool operator==(const SearchIterator &that) const;
   274     /**
   275      * Not-equal operator. 
   276      * @param that SearchIterator instance to be compared.
   277      * @return FALSE if operator== returns TRUE, and vice versa.
   278      * @stable ICU 2.0
   279      */
   280     UBool operator!=(const SearchIterator &that) const;
   282     // public methods ----------------------------------------------------
   284     /**
   285      * Returns a copy of SearchIterator with the same behavior, and 
   286      * iterating over the same text, as this one. Note that all data will be
   287      * replicated, except for the text string to be searched.
   288      * @return cloned object
   289      * @stable ICU 2.0
   290      */
   291     virtual SearchIterator* safeClone(void) const = 0;
   293     /**
   294      * Returns the first index at which the string text matches the search 
   295      * pattern. The iterator is adjusted so that its current index (as 
   296      * returned by <tt>getOffset</tt>) is the match position if one 
   297      * was found.
   298      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
   299      * the iterator will be adjusted to the index USEARCH_DONE.
   300      * @param  status for errors if they occur
   301      * @return The character index of the first match, or 
   302      *         <tt>USEARCH_DONE</tt> if there are no matches.
   303      * @see #getOffset
   304      * @stable ICU 2.0
   305      */
   306     int32_t first(UErrorCode &status);
   308     /**
   309      * Returns the first index equal or greater than <tt>position</tt> at which the 
   310      * string text matches the search pattern. The iterator is adjusted so 
   311      * that its current index (as returned by <tt>getOffset</tt>) is the 
   312      * match position if one was found.
   313      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
   314      * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
   315      * @param  position where search is to start from. If position is less
   316      *             than or greater than the text range for searching, 
   317      *             a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
   318      * @param  status for errors if they occur
   319      * @return The character index of the first match following 
   320      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 
   321      *         matches.
   322      * @see #getOffset
   323      * @stable ICU 2.0
   324      */
   325     int32_t following(int32_t position, UErrorCode &status);
   327     /**
   328      * Returns the last index in the target text at which it matches the 
   329      * search pattern. The iterator is adjusted so that its current index 
   330      * (as returned by <tt>getOffset</tt>) is the match position if one was 
   331      * found.
   332      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
   333      * the iterator will be adjusted to the index USEARCH_DONE.
   334      * @param  status for errors if they occur
   335      * @return The index of the last match, or <tt>USEARCH_DONE</tt> if 
   336      *         there are no matches.
   337      * @see #getOffset
   338      * @stable ICU 2.0
   339      */
   340     int32_t last(UErrorCode &status);
   342     /**
   343      * Returns the first index less than <tt>position</tt> at which the string 
   344      * text matches the search pattern. The iterator is adjusted so that its 
   345      * current index (as returned by <tt>getOffset</tt>) is the match 
   346      * position if one was found. If a match is not found, 
   347      * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 
   348      * adjusted to the index USEARCH_DONE.
   349      * <p>
   350      * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
   351      * result match is always less than <tt>position</tt>.
   352      * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
   353      * <tt>position</tt>.
   354      *
   355      * @param  position where search is to start from. If position is less
   356      *             than or greater than the text range for searching, 
   357      *             a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
   358      * @param  status for errors if they occur
   359      * @return The character index of the first match preceding 
   360      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 
   361      *         no matches.
   362      * @see #getOffset
   363      * @stable ICU 2.0
   364      */
   365     int32_t preceding(int32_t position, UErrorCode &status);
   367     /**
   368      * Returns the index of the next point at which the text matches the
   369      * search pattern, starting from the current position.
   370      * The iterator is adjusted so that its current index (as returned by 
   371      * <tt>getOffset</tt>) is the match position if one was found.
   372      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
   373      * the iterator will be adjusted to a position after the end of the text 
   374      * string.
   375      * @param  status for errors if they occur
   376      * @return The index of the next match after the current position,
   377      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
   378      * @see #getOffset
   379      * @stable ICU 2.0
   380      */
   381      int32_t next(UErrorCode &status);
   383     /**
   384      * Returns the index of the previous point at which the string text 
   385      * matches the search pattern, starting at the current position.
   386      * The iterator is adjusted so that its current index (as returned by 
   387      * <tt>getOffset</tt>) is the match position if one was found.
   388      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
   389      * the iterator will be adjusted to the index USEARCH_DONE.
   390      * @param  status for errors if they occur
   391      * @return The index of the previous match before the current position,
   392      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
   393      * @see #getOffset
   394      * @stable ICU 2.0
   395      */
   396     int32_t previous(UErrorCode &status);
   398     /** 
   399     * Resets the iteration.
   400     * Search will begin at the start of the text string if a forward 
   401     * iteration is initiated before a backwards iteration. Otherwise if a 
   402     * backwards iteration is initiated before a forwards iteration, the 
   403     * search will begin at the end of the text string.    
   404     * @stable ICU 2.0
   405     */
   406     virtual void reset();
   408 protected:
   409     // protected data members ---------------------------------------------
   411     /**
   412     * C search data struct
   413     * @stable ICU 2.0
   414     */
   415     USearch *m_search_;
   417     /**
   418     * Break iterator.
   419     * Currently the C++ breakiterator does not have getRules etc to reproduce
   420     * another in C. Hence we keep the original around and do the verification
   421     * at the end of the match. The user is responsible for deleting this
   422     * break iterator.
   423     * @stable ICU 2.0
   424     */
   425     BreakIterator *m_breakiterator_;
   427     /**
   428     * Unicode string version of the search text
   429     * @stable ICU 2.0
   430     */
   431     UnicodeString  m_text_;
   433     // protected constructors and destructors -----------------------------
   435     /**
   436     * Default constructor.
   437     * Initializes data to the default values.
   438     * @stable ICU 2.0
   439     */
   440     SearchIterator();
   442     /**
   443      * Constructor for use by subclasses.
   444      * @param text The target text to be searched.
   445      * @param breakiter A {@link BreakIterator} that is used to restrict the 
   446      *                points at which matches are detected. If 
   447      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
   448      *                match, but the match's start or end index is not a 
   449      *                boundary as determined by the <tt>BreakIterator</tt>, 
   450      *                the match is rejected and <tt>handleNext</tt> or 
   451      *                <tt>handlePrev</tt> is called again. If this parameter 
   452      *                is <tt>NULL</tt>, no break detection is attempted.  
   453      * @see #handleNext
   454      * @see #handlePrev
   455      * @stable ICU 2.0
   456      */
   457     SearchIterator(const UnicodeString &text, 
   458                          BreakIterator *breakiter = NULL);
   460     /**
   461      * Constructor for use by subclasses.
   462      * <p>
   463      * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
   464      * will be done during searching for this version. The block of text 
   465      * in <tt>CharacterIterator</tt> will be used as it is.
   466      * @param text The target text to be searched.
   467      * @param breakiter A {@link BreakIterator} that is used to restrict the 
   468      *                points at which matches are detected. If 
   469      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
   470      *                match, but the match's start or end index is not a 
   471      *                boundary as determined by the <tt>BreakIterator</tt>, 
   472      *                the match is rejected and <tt>handleNext</tt> or 
   473      *                <tt>handlePrev</tt> is called again. If this parameter 
   474      *                is <tt>NULL</tt>, no break detection is attempted.
   475      * @see #handleNext
   476      * @see #handlePrev
   477      * @stable ICU 2.0
   478      */
   479     SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
   481     // protected methods --------------------------------------------------
   483     /**
   484      * Assignment operator. Sets this iterator to have the same behavior,
   485      * and iterate over the same text, as the one passed in.
   486      * @param that instance to be copied.
   487      * @stable ICU 2.0
   488      */
   489     SearchIterator & operator=(const SearchIterator &that);
   491     /**
   492      * Abstract method which subclasses override to provide the mechanism
   493      * for finding the next match in the target text. This allows different
   494      * subclasses to provide different search algorithms.
   495      * <p>
   496      * If a match is found, the implementation should return the index at
   497      * which the match starts and should call 
   498      * <tt>setMatchLength</tt> with the number of characters 
   499      * in the target text that make up the match. If no match is found, the 
   500      * method should return USEARCH_DONE.
   501      * <p>
   502      * @param position The index in the target text at which the search 
   503      *                 should start.
   504      * @param status for error codes if any occur.
   505      * @return index at which the match starts, else if match is not found 
   506      *         USEARCH_DONE is returned
   507      * @see #setMatchLength
   508      * @stable ICU 2.0
   509      */
   510     virtual int32_t handleNext(int32_t position, UErrorCode &status) 
   511                                                                          = 0;
   513     /**
   514      * Abstract method which subclasses override to provide the mechanism for
   515      * finding the previous match in the target text. This allows different
   516      * subclasses to provide different search algorithms.
   517      * <p>
   518      * If a match is found, the implementation should return the index at
   519      * which the match starts and should call 
   520      * <tt>setMatchLength</tt> with the number of characters 
   521      * in the target text that make up the match. If no match is found, the 
   522      * method should return USEARCH_DONE.
   523      * <p>
   524      * @param position The index in the target text at which the search 
   525      *                 should start.
   526      * @param status for error codes if any occur.
   527      * @return index at which the match starts, else if match is not found 
   528      *         USEARCH_DONE is returned
   529      * @see #setMatchLength
   530      * @stable ICU 2.0
   531      */
   532      virtual int32_t handlePrev(int32_t position, UErrorCode &status) 
   533                                                                          = 0;
   535     /**
   536      * Sets the length of the currently matched string in the text string to
   537      * be searched.
   538      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
   539      * methods should call this when they find a match in the target text.
   540      * @param length length of the matched text.
   541      * @see #handleNext
   542      * @see #handlePrev
   543      * @stable ICU 2.0
   544      */
   545     virtual void setMatchLength(int32_t length);
   547     /**
   548      * Sets the offset of the currently matched string in the text string to
   549      * be searched.
   550      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
   551      * methods should call this when they find a match in the target text.
   552      * @param position start offset of the matched text.
   553      * @see #handleNext
   554      * @see #handlePrev
   555      * @stable ICU 2.0
   556      */
   557     virtual void setMatchStart(int32_t position);
   559     /**
   560     * Sets match not found.
   561     * @stable ICU 2.0
   562     */
   563     void setMatchNotFound();
   564 };
   565 /** Not-equal operator: the logical negation of operator== for the same argument. */
   566 inline UBool
   567 SearchIterator::operator!=(const SearchIterator &that) const {
   568     return !(this->operator==(that));
   569 }
   570 U_NAMESPACE_END
   572 #endif /* #if !UCONFIG_NO_COLLATION */
   574 #endif

mercurial