intl/icu/source/common/dictbe.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /**
     2  *******************************************************************************
     3  * Copyright (C) 2006,2012-2013, International Business Machines Corporation   *
     4  * and others. All Rights Reserved.                                            *
     5  *******************************************************************************
     6  */
     8 #ifndef DICTBE_H
     9 #define DICTBE_H
    11 #include "unicode/utypes.h"
    12 #include "unicode/uniset.h"
    13 #include "unicode/utext.h"
    15 #include "brkeng.h"
    17 U_NAMESPACE_BEGIN
    19 class DictionaryMatcher;
    21 /*******************************************************************
    22  * DictionaryBreakEngine
    23  */
    25 /**
    26  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
    27  * dictionary to determine language-specific breaks.</p>
    28  *
    29  * <p>After it is constructed a DictionaryBreakEngine may be shared between
    30  * threads without synchronization.</p>
    31  */
    32 class DictionaryBreakEngine : public LanguageBreakEngine {
    33  private:
    34     /**
    35      * The set of characters handled by this engine
    36      * @internal
    37      */
    39   UnicodeSet    fSet;
    41     /**
    42      * Bitmap of the break types handled by this engine
    43      * @internal
    44      */
    46   uint32_t      fTypes;
    48   /**
    49    * <p>Default constructor. Declared in the private section of the class.</p>
    50    *
    51    */
    52   DictionaryBreakEngine();
    54  public:
    56   /**
    57    * <p>Constructor setting the break types handled.</p>
    58    *
    59    * @param breakTypes A bitmap of types handled by the engine.
    60    */
    61   DictionaryBreakEngine( uint32_t breakTypes );
    63   /**
    64    * <p>Virtual destructor.</p>
    65    */
    66   virtual ~DictionaryBreakEngine();
    68   /**
    69    * <p>Indicate whether this engine handles a particular character for
    70    * a particular kind of break.</p>
    71    *
    72    * @param c A character which begins a run that the engine might handle
    73    * @param breakType The type of text break which the caller wants to determine
    74    * @return TRUE if this engine handles the particular character and break
    75    * type.
    76    */
    77   virtual UBool handles( UChar32 c, int32_t breakType ) const;
    79   /**
    80    * <p>Find any breaks within a run in the supplied text.</p>
    81    *
    82    * @param text A UText representing the text. The iterator is left at
    83    * the end of the run of characters which the engine is capable of handling
    84    * that starts from the first (or last) character in the range.
    85    * @param startPos The start of the run within the supplied text.
    86    * @param endPos The end of the run within the supplied text.
    87    * @param reverse Whether the caller is looking for breaks in a reverse
    88    * direction.
    89    * @param breakType The type of break desired, or -1.
    90    * @param foundBreaks A UStack (passed by reference) for the breaks found, if any
    91    * @return The number of breaks found.
    92    */
    93   virtual int32_t findBreaks( UText *text,
    94                               int32_t startPos,
    95                               int32_t endPos,
    96                               UBool reverse,
    97                               int32_t breakType,
    98                               UStack &foundBreaks ) const;
   100  protected:
   102  /**
   103   * <p>Set the character set handled by this engine.</p>
   104   *
   105   * @param set A UnicodeSet of the set of characters handled by the engine
   106   */
   107   virtual void setCharacters( const UnicodeSet &set );
   109  /**
   110   * <p>Set the break types handled by this engine. (The declaration below is commented out.)</p>
   111   *
   112   * @param breakTypes A bitmap of types handled by the engine.
   113   */
   114 //  virtual void setBreakTypes( uint32_t breakTypes );
   116  /**
   117   * <p>Divide up a range of known dictionary characters handled by this break engine. Pure virtual here.</p>
   118   *
   119   * @param text A UText representing the text
   120   * @param rangeStart The start of the range of dictionary characters
   121   * @param rangeEnd The end of the range of dictionary characters
   122   * @param foundBreaks Output UStack (passed by reference) of break positions
   123   * @return The number of breaks found
   124   */
   125   virtual int32_t divideUpDictionaryRange( UText *text,
   126                                            int32_t rangeStart,
   127                                            int32_t rangeEnd,
   128                                            UStack &foundBreaks ) const = 0;
   130 };
   132 /*******************************************************************
   133  * ThaiBreakEngine
   134  */
   136 /**
   137  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
   138  * dictionary and heuristics to determine Thai-specific breaks.</p>
   139  *
   140  * <p>After it is constructed a ThaiBreakEngine may be shared between
   141  * threads without synchronization.</p>
   142  */
   143 class ThaiBreakEngine : public DictionaryBreakEngine {
   144  private:
   145     /**
   146      * The character sets and the dictionary matcher pointer held by this engine
   147      * @internal
   148      */
   150   UnicodeSet                fThaiWordSet;
   151   UnicodeSet                fEndWordSet;
   152   UnicodeSet                fBeginWordSet;
   153   UnicodeSet                fSuffixSet;
   154   UnicodeSet                fMarkSet;
   155   DictionaryMatcher  *fDictionary;
   157  public:
   159   /**
   160    * <p>Constructor taking the dictionary to adopt and a UErrorCode reference.</p>
   161    *
   162    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
   163    * engine is deleted.
   164    */
   165   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
   167   /**
   168    * <p>Virtual destructor.</p>
   169    */
   170   virtual ~ThaiBreakEngine();
   172  protected:
   173  /**
   174   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
   175   *
   176   * @param text A UText representing the text
   177   * @param rangeStart The start of the range of dictionary characters
   178   * @param rangeEnd The end of the range of dictionary characters
   179   * @param foundBreaks Output UStack (passed by reference) of break positions
   180   * @return The number of breaks found
   181   */
   182   virtual int32_t divideUpDictionaryRange( UText *text,
   183                                            int32_t rangeStart,
   184                                            int32_t rangeEnd,
   185                                            UStack &foundBreaks ) const;
   187 };
   189 /*******************************************************************
   190  * LaoBreakEngine
   191  */
   193 /**
   194  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
   195  * dictionary and heuristics to determine Lao-specific breaks.</p>
   196  *
   197  * <p>After it is constructed a LaoBreakEngine may be shared between
   198  * threads without synchronization.</p>
   199  */
   200 class LaoBreakEngine : public DictionaryBreakEngine {
   201  private:
   202     /**
   203      * The character sets and the dictionary matcher pointer held by this engine
   204      * @internal
   205      */
   207   UnicodeSet                fLaoWordSet;
   208   UnicodeSet                fEndWordSet;
   209   UnicodeSet                fBeginWordSet;
   210   UnicodeSet                fMarkSet;
   211   DictionaryMatcher  *fDictionary;
   213  public:
   215   /**
   216    * <p>Constructor taking the dictionary to adopt and a UErrorCode reference.</p>
   217    *
   218    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
   219    * engine is deleted.
   220    */
   221   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
   223   /**
   224    * <p>Virtual destructor.</p>
   225    */
   226   virtual ~LaoBreakEngine();
   228  protected:
   229  /**
   230   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
   231   *
   232   * @param text A UText representing the text
   233   * @param rangeStart The start of the range of dictionary characters
   234   * @param rangeEnd The end of the range of dictionary characters
   235   * @param foundBreaks Output UStack (passed by reference) of break positions
   236   * @return The number of breaks found
   237   */
   238   virtual int32_t divideUpDictionaryRange( UText *text,
   239                                            int32_t rangeStart,
   240                                            int32_t rangeEnd,
   241                                            UStack &foundBreaks ) const;
   243 };
   245 /******************************************************************* 
   246  * KhmerBreakEngine 
   247  */ 
   249 /** 
   250  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 
   251  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 
   252  * 
   253  * <p>After it is constructed a KhmerBreakEngine may be shared between 
   254  * threads without synchronization.</p> 
   255  */ 
   256 class KhmerBreakEngine : public DictionaryBreakEngine { 
   257  private: 
   258     /** 
   259      * The character sets and the dictionary matcher pointer held by this engine 
   260      * @internal 
   261      */ 
   263   UnicodeSet                fKhmerWordSet; 
   264   UnicodeSet                fEndWordSet; 
   265   UnicodeSet                fBeginWordSet; 
   266   UnicodeSet                fMarkSet; 
   267   DictionaryMatcher  *fDictionary; 
   269  public: 
   271   /** 
   272    * <p>Constructor taking the dictionary to adopt and a UErrorCode reference.</p> 
   273    * 
   274    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 
   275    * engine is deleted. 
   276    */ 
   277   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 
   279   /** 
   280    * <p>Virtual destructor.</p> 
   281    */ 
   282   virtual ~KhmerBreakEngine(); 
   284  protected: 
   285  /** 
   286   * <p>Divide up a range of known dictionary characters.</p> 
   287   * 
   288   * @param text A UText representing the text 
   289   * @param rangeStart The start of the range of dictionary characters 
   290   * @param rangeEnd The end of the range of dictionary characters 
   291   * @param foundBreaks Output UStack (passed by reference) of break positions 
   292   * @return The number of breaks found 
   293   */ 
   294   virtual int32_t divideUpDictionaryRange( UText *text, 
   295                                            int32_t rangeStart, 
   296                                            int32_t rangeEnd, 
   297                                            UStack &foundBreaks ) const; 
   299 }; 
   301 #if !UCONFIG_NO_NORMALIZATION
   303 /*******************************************************************
   304  * CjkBreakEngine
   305  */
   307 // Indicates the language/script that the CjkBreakEngine will handle; this is the type of the CjkBreakEngine constructor's `type` parameter.
   308 enum LanguageType {
   309     kKorean,
   310     kChineseJapanese
   311 };
   313 /**
   314  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
   315  * dictionary with costs associated with each word and
   316  * Viterbi decoding to determine CJK-specific breaks.</p>
   317  */
   318 class CjkBreakEngine : public DictionaryBreakEngine {
   319  protected:
   320     /**
   321      * The character sets and the dictionary matcher pointer held by this engine
   322      * @internal
   323      */
   324   UnicodeSet                fHangulWordSet;
   325   UnicodeSet                fHanWordSet;
   326   UnicodeSet                fKatakanaWordSet;
   327   UnicodeSet                fHiraganaWordSet;
   329   DictionaryMatcher  *fDictionary;
   331  public:
   333     /**
   334      * <p>Constructor taking the dictionary to adopt, a LanguageType and a UErrorCode reference.</p>
   335      *
   336      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
   337      * engine is deleted. The DictionaryMatcher must contain costs for each word
   338      * in order for the dictionary to work properly.
   339      */
   340   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
   342     /**
   343      * <p>Virtual destructor.</p>
   344      */
   345   virtual ~CjkBreakEngine();
   347  protected:
   348     /**
   349      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
   350      *
   351      * @param text A UText representing the text
   352      * @param rangeStart The start of the range of dictionary characters
   353      * @param rangeEnd The end of the range of dictionary characters
   354      * @param foundBreaks Output UStack (passed by reference) of break positions
   355      * @return The number of breaks found
   356      */
   357   virtual int32_t divideUpDictionaryRange( UText *text,
   358           int32_t rangeStart,
   359           int32_t rangeEnd,
   360           UStack &foundBreaks ) const;
   362 };
   364 #endif
   366 U_NAMESPACE_END
   368     /* DICTBE_H */
   369 #endif

mercurial