1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/dictbe.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,369 @@ 1.4 +/** 1.5 + ******************************************************************************* 1.6 + * Copyright (C) 2006,2012-2013, International Business Machines Corporation * 1.7 + * and others. All Rights Reserved. * 1.8 + ******************************************************************************* 1.9 + */ 1.10 + 1.11 +#ifndef DICTBE_H 1.12 +#define DICTBE_H 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 +#include "unicode/uniset.h" 1.16 +#include "unicode/utext.h" 1.17 + 1.18 +#include "brkeng.h" 1.19 + 1.20 +U_NAMESPACE_BEGIN 1.21 + 1.22 +class DictionaryMatcher; 1.23 + 1.24 +/******************************************************************* 1.25 + * DictionaryBreakEngine 1.26 + */ 1.27 + 1.28 +/** 1.29 + * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a 1.30 + * dictionary to determine language-specific breaks.</p> 1.31 + * 1.32 + * <p>After it is constructed a DictionaryBreakEngine may be shared between 1.33 + * threads without synchronization.</p> 1.34 + */ 1.35 +class DictionaryBreakEngine : public LanguageBreakEngine { 1.36 + private: 1.37 + /** 1.38 + * The set of characters handled by this engine 1.39 + * @internal 1.40 + */ 1.41 + 1.42 + UnicodeSet fSet; 1.43 + 1.44 + /** 1.45 + * The set of break types handled by this engine 1.46 + * @internal 1.47 + */ 1.48 + 1.49 + uint32_t fTypes; 1.50 + 1.51 + /** 1.52 + * <p>Default constructor. Declared in the private section of the class.</p> 1.53 + * 1.54 + */ 1.55 + DictionaryBreakEngine(); 1.56 + 1.57 + public: 1.58 + 1.59 + /** 1.60 + * <p>Constructor setting the break types handled.</p> 1.61 + * 1.62 + * @param breakTypes A bitmap of types handled by the engine. 
1.63 + */ 1.64 + DictionaryBreakEngine( uint32_t breakTypes ); 1.65 + 1.66 + /** 1.67 + * <p>Virtual destructor.</p> 1.68 + */ 1.69 + virtual ~DictionaryBreakEngine(); 1.70 + 1.71 + /** 1.72 + * <p>Indicate whether this engine handles a particular character for 1.73 + * a particular kind of break.</p> 1.74 + * 1.75 + * @param c A character which begins a run that the engine might handle 1.76 + * @param breakType The type of text break which the caller wants to determine 1.77 + * @return TRUE if this engine handles the particular character and break 1.78 + * type. 1.79 + */ 1.80 + virtual UBool handles( UChar32 c, int32_t breakType ) const; 1.81 + 1.82 + /** 1.83 + * <p>Find any breaks within a run in the supplied text.</p> 1.84 + * 1.85 + * @param text A UText representing the text. The iterator is left at 1.86 + * the end of the run of characters which the engine is capable of handling 1.87 + * that starts from the first (or last) character in the range. 1.88 + * @param startPos The start of the run within the supplied text. 1.89 + * @param endPos The end of the run within the supplied text. 1.90 + * @param reverse Whether the caller is looking for breaks in a reverse 1.91 + * direction. 1.92 + * @param breakType The type of break desired, or -1. 1.93 + * @param foundBreaks A UStack of the breaks found, if any 1.94 + * @return The number of breaks found. 
1.95 + */ 1.96 + virtual int32_t findBreaks( UText *text, 1.97 + int32_t startPos, 1.98 + int32_t endPos, 1.99 + UBool reverse, 1.100 + int32_t breakType, 1.101 + UStack &foundBreaks ) const; 1.102 + 1.103 + protected: 1.104 + 1.105 + /** 1.106 + * <p>Set the character set handled by this engine.</p> 1.107 + * 1.108 + * @param set A UnicodeSet of the set of characters handled by the engine 1.109 + */ 1.110 + virtual void setCharacters( const UnicodeSet &set ); 1.111 + 1.112 + /** 1.113 + * <p>Set the break types handled by this engine.</p> 1.114 + * 1.115 + * @param breakTypes A bitmap of types handled by the engine. NOTE: the setBreakTypes() declaration that follows this comment is commented out. 1.116 + */ 1.117 +// virtual void setBreakTypes( uint32_t breakTypes ); 1.118 + 1.119 + /** 1.120 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 1.121 + * 1.122 + * @param text A UText representing the text 1.123 + * @param rangeStart The start of the range of dictionary characters 1.124 + * @param rangeEnd The end of the range of dictionary characters 1.125 + * @param foundBreaks Output UStack of the break positions found 1.126 + * @return The number of breaks found 1.127 + */ 1.128 + virtual int32_t divideUpDictionaryRange( UText *text, 1.129 + int32_t rangeStart, 1.130 + int32_t rangeEnd, 1.131 + UStack &foundBreaks ) const = 0; 1.132 + 1.133 +}; 1.134 + 1.135 +/******************************************************************* 1.136 + * ThaiBreakEngine 1.137 + */ 1.138 + 1.139 +/** 1.140 + * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a 1.141 + * dictionary and heuristics to determine Thai-specific breaks.</p> 1.142 + * 1.143 + * <p>After it is constructed a ThaiBreakEngine may be shared between 1.144 + * threads without synchronization.</p> 1.145 + */ 1.146 +class ThaiBreakEngine : public DictionaryBreakEngine { 1.147 + private: 1.148 + /** 1.149 + * The set of characters handled by this engine 1.150 + * @internal 1.151 + */ 1.152 + 1.153 + UnicodeSet fThaiWordSet; 1.154 + 
UnicodeSet fEndWordSet; 1.155 + UnicodeSet fBeginWordSet; 1.156 + UnicodeSet fSuffixSet; 1.157 + UnicodeSet fMarkSet; 1.158 + DictionaryMatcher *fDictionary; 1.159 + 1.160 + public: 1.161 + 1.162 + /** 1.163 + * <p>Constructor.</p> 1.164 + * 1.165 + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 1.166 + * engine is deleted. @param status A UErrorCode passed by reference; presumably set on failure (TODO confirm in the implementation). 1.167 + */ 1.168 + ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 1.169 + 1.170 + /** 1.171 + * <p>Virtual destructor.</p> 1.172 + */ 1.173 + virtual ~ThaiBreakEngine(); 1.174 + 1.175 + protected: 1.176 + /** 1.177 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 1.178 + * 1.179 + * @param text A UText representing the text 1.180 + * @param rangeStart The start of the range of dictionary characters 1.181 + * @param rangeEnd The end of the range of dictionary characters 1.182 + * @param foundBreaks Output UStack of the break positions found 1.183 + * @return The number of breaks found 1.184 + */ 1.185 + virtual int32_t divideUpDictionaryRange( UText *text, 1.186 + int32_t rangeStart, 1.187 + int32_t rangeEnd, 1.188 + UStack &foundBreaks ) const; 1.189 + 1.190 +}; 1.191 + 1.192 +/******************************************************************* 1.193 + * LaoBreakEngine 1.194 + */ 1.195 + 1.196 +/** 1.197 + * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a 1.198 + * dictionary and heuristics to determine Lao-specific breaks.</p> 1.199 + * 1.200 + * <p>After it is constructed a LaoBreakEngine may be shared between 1.201 + * threads without synchronization.</p> 1.202 + */ 1.203 +class LaoBreakEngine : public DictionaryBreakEngine { 1.204 + private: 1.205 + /** 1.206 + * The set of characters handled by this engine 1.207 + * @internal 1.208 + */ 1.209 + 1.210 + UnicodeSet fLaoWordSet; 1.211 + UnicodeSet fEndWordSet; 1.212 + UnicodeSet fBeginWordSet; 1.213 + UnicodeSet fMarkSet; 1.214 + DictionaryMatcher *fDictionary; 1.215 + 
1.216 + public: 1.217 + 1.218 + /** 1.219 + * <p>Constructor.</p> 1.220 + * 1.221 + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 1.222 + * engine is deleted. @param status A UErrorCode passed by reference; presumably set on failure (TODO confirm in the implementation). 1.223 + */ 1.224 + LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 1.225 + 1.226 + /** 1.227 + * <p>Virtual destructor.</p> 1.228 + */ 1.229 + virtual ~LaoBreakEngine(); 1.230 + 1.231 + protected: 1.232 + /** 1.233 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 1.234 + * 1.235 + * @param text A UText representing the text 1.236 + * @param rangeStart The start of the range of dictionary characters 1.237 + * @param rangeEnd The end of the range of dictionary characters 1.238 + * @param foundBreaks Output UStack of the break positions found 1.239 + * @return The number of breaks found 1.240 + */ 1.241 + virtual int32_t divideUpDictionaryRange( UText *text, 1.242 + int32_t rangeStart, 1.243 + int32_t rangeEnd, 1.244 + UStack &foundBreaks ) const; 1.245 + 1.246 +}; 1.247 + 1.248 +/******************************************************************* 1.249 + * KhmerBreakEngine 1.250 + */ 1.251 + 1.252 +/** 1.253 + * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a 1.254 + * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> 1.255 + * 1.256 + * <p>After it is constructed a KhmerBreakEngine may be shared between 1.257 + * threads without synchronization.</p> 1.258 + */ 1.259 +class KhmerBreakEngine : public DictionaryBreakEngine { 1.260 + private: 1.261 + /** 1.262 + * The set of characters handled by this engine 1.263 + * @internal 1.264 + */ 1.265 + 1.266 + UnicodeSet fKhmerWordSet; 1.267 + UnicodeSet fEndWordSet; 1.268 + UnicodeSet fBeginWordSet; 1.269 + UnicodeSet fMarkSet; 1.270 + DictionaryMatcher *fDictionary; 1.271 + 1.272 + public: 1.273 + 1.274 + /** 1.275 + * <p>Constructor.</p> 1.276 + * 1.277 + * @param adoptDictionary A DictionaryMatcher to adopt. 
Deleted when the 1.278 + * engine is deleted. @param status A UErrorCode passed by reference; presumably set on failure (TODO confirm in the implementation). 1.279 + */ 1.280 + KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); 1.281 + 1.282 + /** 1.283 + * <p>Virtual destructor.</p> 1.284 + */ 1.285 + virtual ~KhmerBreakEngine(); 1.286 + 1.287 + protected: 1.288 + /** 1.289 + * <p>Divide up a range of known dictionary characters.</p> 1.290 + * 1.291 + * @param text A UText representing the text 1.292 + * @param rangeStart The start of the range of dictionary characters 1.293 + * @param rangeEnd The end of the range of dictionary characters 1.294 + * @param foundBreaks Output UStack of the break positions found 1.295 + * @return The number of breaks found 1.296 + */ 1.297 + virtual int32_t divideUpDictionaryRange( UText *text, 1.298 + int32_t rangeStart, 1.299 + int32_t rangeEnd, 1.300 + UStack &foundBreaks ) const; 1.301 + 1.302 +}; 1.303 + 1.304 +#if !UCONFIG_NO_NORMALIZATION 1.305 + 1.306 +/******************************************************************* 1.307 + * CjkBreakEngine 1.308 + */ 1.309 + 1.310 +//indicates language/script that the CjkBreakEngine will handle 1.311 +enum LanguageType { 1.312 + kKorean, 1.313 + kChineseJapanese 1.314 +}; 1.315 + 1.316 +/** 1.317 + * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 1.318 + * dictionary with costs associated with each word and 1.319 + * Viterbi decoding to determine CJK-specific breaks.</p> 1.320 + */ 1.321 +class CjkBreakEngine : public DictionaryBreakEngine { 1.322 + protected: 1.323 + /** 1.324 + * The set of characters handled by this engine 1.325 + * @internal 1.326 + */ 1.327 + UnicodeSet fHangulWordSet; 1.328 + UnicodeSet fHanWordSet; 1.329 + UnicodeSet fKatakanaWordSet; 1.330 + UnicodeSet fHiraganaWordSet; 1.331 + 1.332 + DictionaryMatcher *fDictionary; 1.333 + 1.334 + public: 1.335 + 1.336 + /** 1.337 + * <p>Constructor.</p> 1.338 + * 1.339 + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the 1.340 + * engine is deleted. 
The DictionaryMatcher must contain costs for each word 1.341 + * in order for the dictionary to work properly. @param type The LanguageType (kKorean or kChineseJapanese) that this engine will handle. @param status A UErrorCode passed by reference; presumably set on failure (TODO confirm in the implementation). 1.342 + */ 1.343 + CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); 1.344 + 1.345 + /** 1.346 + * <p>Virtual destructor.</p> 1.347 + */ 1.348 + virtual ~CjkBreakEngine(); 1.349 + 1.350 + protected: 1.351 + /** 1.352 + * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 1.353 + * 1.354 + * @param text A UText representing the text 1.355 + * @param rangeStart The start of the range of dictionary characters 1.356 + * @param rangeEnd The end of the range of dictionary characters 1.357 + * @param foundBreaks Output UStack of the break positions found 1.358 + * @return The number of breaks found 1.359 + */ 1.360 + virtual int32_t divideUpDictionaryRange( UText *text, 1.361 + int32_t rangeStart, 1.362 + int32_t rangeEnd, 1.363 + UStack &foundBreaks ) const; 1.364 + 1.365 +}; 1.366 + 1.367 +#endif 1.368 + 1.369 +U_NAMESPACE_END 1.370 + 1.371 + /* DICTBE_H */ 1.372 +#endif