intl/icu/source/common/dictbe.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /**
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2006,2012-2013, International Business Machines Corporation *
michael@0 4 * and others. All Rights Reserved. *
michael@0 5 *******************************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #ifndef DICTBE_H
michael@0 9 #define DICTBE_H
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12 #include "unicode/uniset.h"
michael@0 13 #include "unicode/utext.h"
michael@0 14
michael@0 15 #include "brkeng.h"
michael@0 16
michael@0 17 U_NAMESPACE_BEGIN
michael@0 18
michael@0 19 class DictionaryMatcher;
michael@0 20
michael@0 21 /*******************************************************************
michael@0 22 * DictionaryBreakEngine
michael@0 23 */
michael@0 24
michael@0 25 /**
michael@0 26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
michael@0 27 * dictionary to determine language-specific breaks.</p>
michael@0 28 *
michael@0 29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
michael@0 30 * threads without synchronization.</p>
michael@0 31 */
michael@0 32 class DictionaryBreakEngine : public LanguageBreakEngine {
michael@0 33 private:
michael@0 34 /**
michael@0 35 * The set of characters handled by this engine
michael@0 36 * @internal
michael@0 37 */
michael@0 38
michael@0 39 UnicodeSet fSet;
michael@0 40
michael@0 41 /**
michael@0 42 * The set of break types handled by this engine
michael@0 43 * @internal
michael@0 44 */
michael@0 45
michael@0 46 uint32_t fTypes;
michael@0 47
michael@0 48 /**
michael@0 49 * <p>Default constructor.</p>
michael@0 50 * <p>Declared in the private section of the class.</p>
michael@0 51 */
michael@0 52 DictionaryBreakEngine();
michael@0 53
michael@0 54 public:
michael@0 55
michael@0 56 /**
michael@0 57 * <p>Constructor setting the break types handled.</p>
michael@0 58 *
michael@0 59 * @param breakTypes A bitmap of types handled by the engine.
michael@0 60 */
michael@0 61 DictionaryBreakEngine( uint32_t breakTypes );
michael@0 62
michael@0 63 /**
michael@0 64 * <p>Virtual destructor.</p>
michael@0 65 */
michael@0 66 virtual ~DictionaryBreakEngine();
michael@0 67
michael@0 68 /**
michael@0 69 * <p>Indicate whether this engine handles a particular character for
michael@0 70 * a particular kind of break.</p>
michael@0 71 *
michael@0 72 * @param c A character which begins a run that the engine might handle
michael@0 73 * @param breakType The type of text break which the caller wants to determine
michael@0 74 * @return TRUE if this engine handles the particular character and break
michael@0 75 * type.
michael@0 76 */
michael@0 77 virtual UBool handles( UChar32 c, int32_t breakType ) const;
michael@0 78
michael@0 79 /**
michael@0 80 * <p>Find any breaks within a run in the supplied text.</p>
michael@0 81 *
michael@0 82 * @param text A UText representing the text. The iterator is left at
michael@0 83 * the end of the run of characters which the engine is capable of handling
michael@0 84 * that starts from the first (or last) character in the range.
michael@0 85 * @param startPos The start of the run within the supplied text.
michael@0 86 * @param endPos The end of the run within the supplied text.
michael@0 87 * @param reverse Whether the caller is looking for breaks in a reverse
michael@0 88 * direction.
michael@0 89 * @param breakType The type of break desired, or -1.
michael@0 90 * @param foundBreaks A UStack (passed by reference) for the breaks found, if any
michael@0 91 * @return The number of breaks found.
michael@0 92 */
michael@0 93 virtual int32_t findBreaks( UText *text,
michael@0 94 int32_t startPos,
michael@0 95 int32_t endPos,
michael@0 96 UBool reverse,
michael@0 97 int32_t breakType,
michael@0 98 UStack &foundBreaks ) const;
michael@0 99
michael@0 100 protected:
michael@0 101
michael@0 102 /**
michael@0 103 * <p>Set the character set handled by this engine.</p>
michael@0 104 *
michael@0 105 * @param set A UnicodeSet of the set of characters handled by the engine
michael@0 106 */
michael@0 107 virtual void setCharacters( const UnicodeSet &set );
michael@0 108
michael@0 109 /**
michael@0 110 * <p>Set the break types handled by this engine.</p>
michael@0 111 * <p>NOTE: the declaration below is commented out, so this method is not declared.</p>
michael@0 112 * @param breakTypes A bitmap of types handled by the engine.
michael@0 113 */
michael@0 114 // virtual void setBreakTypes( uint32_t breakTypes );
michael@0 115
michael@0 116 /**
michael@0 117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
michael@0 118 * <p>Pure virtual (= 0): every concrete subclass must provide the implementation.</p>
michael@0 119 * @param text A UText representing the text
michael@0 120 * @param rangeStart The start of the range of dictionary characters
michael@0 121 * @param rangeEnd The end of the range of dictionary characters
michael@0 122 * @param foundBreaks Output UStack (passed by reference) of the break positions found
michael@0 123 * @return The number of breaks found
michael@0 124 */
michael@0 125 virtual int32_t divideUpDictionaryRange( UText *text,
michael@0 126 int32_t rangeStart,
michael@0 127 int32_t rangeEnd,
michael@0 128 UStack &foundBreaks ) const = 0;
michael@0 129
michael@0 130 };
michael@0 131
michael@0 132 /*******************************************************************
michael@0 133 * ThaiBreakEngine
michael@0 134 */
michael@0 135
michael@0 136 /**
michael@0 137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
michael@0 138 * dictionary and heuristics to determine Thai-specific breaks.</p>
michael@0 139 *
michael@0 140 * <p>After it is constructed a ThaiBreakEngine may be shared between
michael@0 141 * threads without synchronization.</p>
michael@0 142 */
michael@0 143 class ThaiBreakEngine : public DictionaryBreakEngine {
michael@0 144 private:
michael@0 145 /**
michael@0 146 * The character sets and the dictionary matcher held by this engine
michael@0 147 * @internal
michael@0 148 */
michael@0 149
michael@0 150 UnicodeSet fThaiWordSet;
michael@0 151 UnicodeSet fEndWordSet;
michael@0 152 UnicodeSet fBeginWordSet;
michael@0 153 UnicodeSet fSuffixSet;
michael@0 154 UnicodeSet fMarkSet;
michael@0 155 DictionaryMatcher *fDictionary;
michael@0 156
michael@0 157 public:
michael@0 158
michael@0 159 /**
michael@0 160 * <p>Constructor.</p>
michael@0 161 *
michael@0 162 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
michael@0 163 * @param status Error code reference (non-const, so presumably written on failure; TODO confirm)
michael@0 164 */
michael@0 165 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
michael@0 166
michael@0 167 /**
michael@0 168 * <p>Virtual destructor.</p>
michael@0 169 */
michael@0 170 virtual ~ThaiBreakEngine();
michael@0 171
michael@0 172 protected:
michael@0 173 /**
michael@0 174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
michael@0 175 *
michael@0 176 * @param text A UText representing the text
michael@0 177 * @param rangeStart The start of the range of dictionary characters
michael@0 178 * @param rangeEnd The end of the range of dictionary characters
michael@0 179 * @param foundBreaks Output UStack (passed by reference) of the break positions found
michael@0 180 * @return The number of breaks found
michael@0 181 */
michael@0 182 virtual int32_t divideUpDictionaryRange( UText *text,
michael@0 183 int32_t rangeStart,
michael@0 184 int32_t rangeEnd,
michael@0 185 UStack &foundBreaks ) const;
michael@0 186
michael@0 187 };
michael@0 188
michael@0 189 /*******************************************************************
michael@0 190 * LaoBreakEngine
michael@0 191 */
michael@0 192
michael@0 193 /**
michael@0 194 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
michael@0 195 * dictionary and heuristics to determine Lao-specific breaks.</p>
michael@0 196 *
michael@0 197 * <p>After it is constructed a LaoBreakEngine may be shared between
michael@0 198 * threads without synchronization.</p>
michael@0 199 */
michael@0 200 class LaoBreakEngine : public DictionaryBreakEngine {
michael@0 201 private:
michael@0 202 /**
michael@0 203 * The character sets and the dictionary matcher held by this engine
michael@0 204 * @internal
michael@0 205 */
michael@0 206
michael@0 207 UnicodeSet fLaoWordSet;
michael@0 208 UnicodeSet fEndWordSet;
michael@0 209 UnicodeSet fBeginWordSet;
michael@0 210 UnicodeSet fMarkSet;
michael@0 211 DictionaryMatcher *fDictionary;
michael@0 212
michael@0 213 public:
michael@0 214
michael@0 215 /**
michael@0 216 * <p>Constructor.</p>
michael@0 217 *
michael@0 218 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
michael@0 219 * @param status Error code reference (non-const, so presumably written on failure; TODO confirm)
michael@0 220 */
michael@0 221 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
michael@0 222
michael@0 223 /**
michael@0 224 * <p>Virtual destructor.</p>
michael@0 225 */
michael@0 226 virtual ~LaoBreakEngine();
michael@0 227
michael@0 228 protected:
michael@0 229 /**
michael@0 230 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
michael@0 231 *
michael@0 232 * @param text A UText representing the text
michael@0 233 * @param rangeStart The start of the range of dictionary characters
michael@0 234 * @param rangeEnd The end of the range of dictionary characters
michael@0 235 * @param foundBreaks Output UStack (passed by reference) of the break positions found
michael@0 236 * @return The number of breaks found
michael@0 237 */
michael@0 238 virtual int32_t divideUpDictionaryRange( UText *text,
michael@0 239 int32_t rangeStart,
michael@0 240 int32_t rangeEnd,
michael@0 241 UStack &foundBreaks ) const;
michael@0 242
michael@0 243 };
michael@0 244
michael@0 245 /*******************************************************************
michael@0 246 * KhmerBreakEngine
michael@0 247 */
michael@0 248
michael@0 249 /**
michael@0 250 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
michael@0 251 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
michael@0 252 *
michael@0 253 * <p>After it is constructed a KhmerBreakEngine may be shared between
michael@0 254 * threads without synchronization.</p>
michael@0 255 */
michael@0 256 class KhmerBreakEngine : public DictionaryBreakEngine {
michael@0 257 private:
michael@0 258 /**
michael@0 259 * The character sets and the dictionary matcher held by this engine
michael@0 260 * @internal
michael@0 261 */
michael@0 262
michael@0 263 UnicodeSet fKhmerWordSet;
michael@0 264 UnicodeSet fEndWordSet;
michael@0 265 UnicodeSet fBeginWordSet;
michael@0 266 UnicodeSet fMarkSet;
michael@0 267 DictionaryMatcher *fDictionary;
michael@0 268
michael@0 269 public:
michael@0 270
michael@0 271 /**
michael@0 272 * <p>Constructor.</p>
michael@0 273 *
michael@0 274 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
michael@0 275 * @param status Error code reference (non-const, so presumably written on failure; TODO confirm)
michael@0 276 */
michael@0 277 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
michael@0 278
michael@0 279 /**
michael@0 280 * <p>Virtual destructor.</p>
michael@0 281 */
michael@0 282 virtual ~KhmerBreakEngine();
michael@0 283
michael@0 284 protected:
michael@0 285 /**
michael@0 286 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
michael@0 287 *
michael@0 288 * @param text A UText representing the text
michael@0 289 * @param rangeStart The start of the range of dictionary characters
michael@0 290 * @param rangeEnd The end of the range of dictionary characters
michael@0 291 * @param foundBreaks Output UStack (passed by reference) of the break positions found
michael@0 292 * @return The number of breaks found
michael@0 293 */
michael@0 294 virtual int32_t divideUpDictionaryRange( UText *text,
michael@0 295 int32_t rangeStart,
michael@0 296 int32_t rangeEnd,
michael@0 297 UStack &foundBreaks ) const;
michael@0 298
michael@0 299 };
michael@0 300
michael@0 301 #if !UCONFIG_NO_NORMALIZATION
michael@0 302
michael@0 303 /*******************************************************************
michael@0 304 * CjkBreakEngine
michael@0 305 */
michael@0 306
michael@0 307 // Indicates the language/script that a CjkBreakEngine will handle.
michael@0 308 enum LanguageType {
michael@0 309 kKorean,
michael@0 310 kChineseJapanese
michael@0 311 };
michael@0 312
michael@0 313 /**
michael@0 314 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
michael@0 315 * dictionary with costs associated with each word and
michael@0 316 * Viterbi decoding to determine CJK-specific breaks.</p>
michael@0 317 */
michael@0 318 class CjkBreakEngine : public DictionaryBreakEngine {
michael@0 319 protected:
michael@0 320 /**
michael@0 321 * The character sets and the dictionary matcher held by this engine
michael@0 322 * @internal
michael@0 323 */
michael@0 324 UnicodeSet fHangulWordSet;
michael@0 325 UnicodeSet fHanWordSet;
michael@0 326 UnicodeSet fKatakanaWordSet;
michael@0 327 UnicodeSet fHiraganaWordSet;
michael@0 328
michael@0 329 DictionaryMatcher *fDictionary;
michael@0 330
michael@0 331 public:
michael@0 332
michael@0 333 /**
michael@0 334 * <p>Constructor.</p>
michael@0 335 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
michael@0 336 * The DictionaryMatcher must contain costs for each word in order for the dictionary to work properly.
michael@0 337 * @param type The language/script this engine will handle (kKorean or kChineseJapanese)
michael@0 338 * @param status Error code reference (non-const, so presumably written on failure; TODO confirm)
michael@0 339 */
michael@0 340 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
michael@0 341
michael@0 342 /**
michael@0 343 * <p>Virtual destructor.</p>
michael@0 344 */
michael@0 345 virtual ~CjkBreakEngine();
michael@0 346
michael@0 347 protected:
michael@0 348 /**
michael@0 349 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
michael@0 350 *
michael@0 351 * @param text A UText representing the text
michael@0 352 * @param rangeStart The start of the range of dictionary characters
michael@0 353 * @param rangeEnd The end of the range of dictionary characters
michael@0 354 * @param foundBreaks Output UStack (passed by reference) of the break positions found
michael@0 355 * @return The number of breaks found
michael@0 356 */
michael@0 357 virtual int32_t divideUpDictionaryRange( UText *text,
michael@0 358 int32_t rangeStart,
michael@0 359 int32_t rangeEnd,
michael@0 360 UStack &foundBreaks ) const;
michael@0 361
michael@0 362 };
michael@0 363
michael@0 364 #endif
michael@0 365
michael@0 366 U_NAMESPACE_END
michael@0 367
michael@0 368 /* DICTBE_H */
michael@0 369 #endif

mercurial