|
/**
 *******************************************************************************
 * Copyright (C) 2006,2012-2013, International Business Machines Corporation   *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */
|
7 |
|
8 #ifndef DICTBE_H |
|
9 #define DICTBE_H |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 #include "unicode/uniset.h" |
|
13 #include "unicode/utext.h" |
|
14 |
|
15 #include "brkeng.h" |
|
16 |
|
17 U_NAMESPACE_BEGIN |
|
18 |
|
19 class DictionaryMatcher; |
|
20 |
|
21 /******************************************************************* |
|
22 * DictionaryBreakEngine |
|
23 */ |
|
24 |
|
25 /** |
|
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a |
|
27 * dictionary to determine language-specific breaks.</p> |
|
28 * |
|
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between |
|
30 * threads without synchronization.</p> |
|
31 */ |
|
32 class DictionaryBreakEngine : public LanguageBreakEngine { |
|
33 private: |
|
34 /** |
|
35 * The set of characters handled by this engine |
|
36 * @internal |
|
37 */ |
|
38 |
|
39 UnicodeSet fSet; |
|
40 |
|
41 /** |
|
42 * The set of break types handled by this engine |
|
43 * @internal |
|
44 */ |
|
45 |
|
46 uint32_t fTypes; |
|
47 |
|
48 /** |
|
49 * <p>Default constructor.</p> |
|
50 * |
|
51 */ |
|
52 DictionaryBreakEngine(); |
|
53 |
|
54 public: |
|
55 |
|
56 /** |
|
57 * <p>Constructor setting the break types handled.</p> |
|
58 * |
|
59 * @param breakTypes A bitmap of types handled by the engine. |
|
60 */ |
|
61 DictionaryBreakEngine( uint32_t breakTypes ); |
|
62 |
|
63 /** |
|
64 * <p>Virtual destructor.</p> |
|
65 */ |
|
66 virtual ~DictionaryBreakEngine(); |
|
67 |
|
68 /** |
|
69 * <p>Indicate whether this engine handles a particular character for |
|
70 * a particular kind of break.</p> |
|
71 * |
|
72 * @param c A character which begins a run that the engine might handle |
|
73 * @param breakType The type of text break which the caller wants to determine |
|
74 * @return TRUE if this engine handles the particular character and break |
|
75 * type. |
|
76 */ |
|
77 virtual UBool handles( UChar32 c, int32_t breakType ) const; |
|
78 |
|
79 /** |
|
80 * <p>Find any breaks within a run in the supplied text.</p> |
|
81 * |
|
82 * @param text A UText representing the text. The iterator is left at |
|
83 * the end of the run of characters which the engine is capable of handling |
|
84 * that starts from the first (or last) character in the range. |
|
85 * @param startPos The start of the run within the supplied text. |
|
86 * @param endPos The end of the run within the supplied text. |
|
87 * @param reverse Whether the caller is looking for breaks in a reverse |
|
88 * direction. |
|
89 * @param breakType The type of break desired, or -1. |
|
90 * @param foundBreaks An allocated C array of the breaks found, if any |
|
91 * @return The number of breaks found. |
|
92 */ |
|
93 virtual int32_t findBreaks( UText *text, |
|
94 int32_t startPos, |
|
95 int32_t endPos, |
|
96 UBool reverse, |
|
97 int32_t breakType, |
|
98 UStack &foundBreaks ) const; |
|
99 |
|
100 protected: |
|
101 |
|
102 /** |
|
103 * <p>Set the character set handled by this engine.</p> |
|
104 * |
|
105 * @param set A UnicodeSet of the set of characters handled by the engine |
|
106 */ |
|
107 virtual void setCharacters( const UnicodeSet &set ); |
|
108 |
|
109 /** |
|
110 * <p>Set the break types handled by this engine.</p> |
|
111 * |
|
112 * @param breakTypes A bitmap of types handled by the engine. |
|
113 */ |
|
114 // virtual void setBreakTypes( uint32_t breakTypes ); |
|
115 |
|
116 /** |
|
117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
|
118 * |
|
119 * @param text A UText representing the text |
|
120 * @param rangeStart The start of the range of dictionary characters |
|
121 * @param rangeEnd The end of the range of dictionary characters |
|
122 * @param foundBreaks Output of C array of int32_t break positions, or 0 |
|
123 * @return The number of breaks found |
|
124 */ |
|
125 virtual int32_t divideUpDictionaryRange( UText *text, |
|
126 int32_t rangeStart, |
|
127 int32_t rangeEnd, |
|
128 UStack &foundBreaks ) const = 0; |
|
129 |
|
130 }; |
|
131 |
|
132 /******************************************************************* |
|
133 * ThaiBreakEngine |
|
134 */ |
|
135 |
|
136 /** |
|
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a |
|
138 * dictionary and heuristics to determine Thai-specific breaks.</p> |
|
139 * |
|
140 * <p>After it is constructed a ThaiBreakEngine may be shared between |
|
141 * threads without synchronization.</p> |
|
142 */ |
|
143 class ThaiBreakEngine : public DictionaryBreakEngine { |
|
144 private: |
|
145 /** |
|
146 * The set of characters handled by this engine |
|
147 * @internal |
|
148 */ |
|
149 |
|
150 UnicodeSet fThaiWordSet; |
|
151 UnicodeSet fEndWordSet; |
|
152 UnicodeSet fBeginWordSet; |
|
153 UnicodeSet fSuffixSet; |
|
154 UnicodeSet fMarkSet; |
|
155 DictionaryMatcher *fDictionary; |
|
156 |
|
157 public: |
|
158 |
|
159 /** |
|
160 * <p>Default constructor.</p> |
|
161 * |
|
162 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
|
163 * engine is deleted. |
|
164 */ |
|
165 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
|
166 |
|
167 /** |
|
168 * <p>Virtual destructor.</p> |
|
169 */ |
|
170 virtual ~ThaiBreakEngine(); |
|
171 |
|
172 protected: |
|
173 /** |
|
174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
|
175 * |
|
176 * @param text A UText representing the text |
|
177 * @param rangeStart The start of the range of dictionary characters |
|
178 * @param rangeEnd The end of the range of dictionary characters |
|
179 * @param foundBreaks Output of C array of int32_t break positions, or 0 |
|
180 * @return The number of breaks found |
|
181 */ |
|
182 virtual int32_t divideUpDictionaryRange( UText *text, |
|
183 int32_t rangeStart, |
|
184 int32_t rangeEnd, |
|
185 UStack &foundBreaks ) const; |
|
186 |
|
187 }; |
|
188 |
|
189 /******************************************************************* |
|
190 * LaoBreakEngine |
|
191 */ |
|
192 |
|
193 /** |
|
194 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a |
|
195 * dictionary and heuristics to determine Lao-specific breaks.</p> |
|
196 * |
|
197 * <p>After it is constructed a LaoBreakEngine may be shared between |
|
198 * threads without synchronization.</p> |
|
199 */ |
|
200 class LaoBreakEngine : public DictionaryBreakEngine { |
|
201 private: |
|
202 /** |
|
203 * The set of characters handled by this engine |
|
204 * @internal |
|
205 */ |
|
206 |
|
207 UnicodeSet fLaoWordSet; |
|
208 UnicodeSet fEndWordSet; |
|
209 UnicodeSet fBeginWordSet; |
|
210 UnicodeSet fMarkSet; |
|
211 DictionaryMatcher *fDictionary; |
|
212 |
|
213 public: |
|
214 |
|
215 /** |
|
216 * <p>Default constructor.</p> |
|
217 * |
|
218 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
|
219 * engine is deleted. |
|
220 */ |
|
221 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
|
222 |
|
223 /** |
|
224 * <p>Virtual destructor.</p> |
|
225 */ |
|
226 virtual ~LaoBreakEngine(); |
|
227 |
|
228 protected: |
|
229 /** |
|
230 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
|
231 * |
|
232 * @param text A UText representing the text |
|
233 * @param rangeStart The start of the range of dictionary characters |
|
234 * @param rangeEnd The end of the range of dictionary characters |
|
235 * @param foundBreaks Output of C array of int32_t break positions, or 0 |
|
236 * @return The number of breaks found |
|
237 */ |
|
238 virtual int32_t divideUpDictionaryRange( UText *text, |
|
239 int32_t rangeStart, |
|
240 int32_t rangeEnd, |
|
241 UStack &foundBreaks ) const; |
|
242 |
|
243 }; |
|
244 |
|
245 /******************************************************************* |
|
246 * KhmerBreakEngine |
|
247 */ |
|
248 |
|
249 /** |
|
250 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a |
|
251 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> |
|
252 * |
|
253 * <p>After it is constructed a KhmerBreakEngine may be shared between |
|
254 * threads without synchronization.</p> |
|
255 */ |
|
256 class KhmerBreakEngine : public DictionaryBreakEngine { |
|
257 private: |
|
258 /** |
|
259 * The set of characters handled by this engine |
|
260 * @internal |
|
261 */ |
|
262 |
|
263 UnicodeSet fKhmerWordSet; |
|
264 UnicodeSet fEndWordSet; |
|
265 UnicodeSet fBeginWordSet; |
|
266 UnicodeSet fMarkSet; |
|
267 DictionaryMatcher *fDictionary; |
|
268 |
|
269 public: |
|
270 |
|
271 /** |
|
272 * <p>Default constructor.</p> |
|
273 * |
|
274 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
|
275 * engine is deleted. |
|
276 */ |
|
277 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
|
278 |
|
279 /** |
|
280 * <p>Virtual destructor.</p> |
|
281 */ |
|
282 virtual ~KhmerBreakEngine(); |
|
283 |
|
284 protected: |
|
285 /** |
|
286 * <p>Divide up a range of known dictionary characters.</p> |
|
287 * |
|
288 * @param text A UText representing the text |
|
289 * @param rangeStart The start of the range of dictionary characters |
|
290 * @param rangeEnd The end of the range of dictionary characters |
|
291 * @param foundBreaks Output of C array of int32_t break positions, or 0 |
|
292 * @return The number of breaks found |
|
293 */ |
|
294 virtual int32_t divideUpDictionaryRange( UText *text, |
|
295 int32_t rangeStart, |
|
296 int32_t rangeEnd, |
|
297 UStack &foundBreaks ) const; |
|
298 |
|
299 }; |
|
300 |
|
301 #if !UCONFIG_NO_NORMALIZATION |
|
302 |
|
303 /******************************************************************* |
|
304 * CjkBreakEngine |
|
305 */ |
|
306 |
|
/**
 * Indicates the language/script that the CjkBreakEngine will handle.
 * Passed to the CjkBreakEngine constructor.
 */
enum LanguageType {
    kKorean,
    kChineseJapanese
};
|
312 |
|
313 /** |
|
314 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a |
|
315 * dictionary with costs associated with each word and |
|
316 * Viterbi decoding to determine CJK-specific breaks.</p> |
|
317 */ |
|
318 class CjkBreakEngine : public DictionaryBreakEngine { |
|
319 protected: |
|
320 /** |
|
321 * The set of characters handled by this engine |
|
322 * @internal |
|
323 */ |
|
324 UnicodeSet fHangulWordSet; |
|
325 UnicodeSet fHanWordSet; |
|
326 UnicodeSet fKatakanaWordSet; |
|
327 UnicodeSet fHiraganaWordSet; |
|
328 |
|
329 DictionaryMatcher *fDictionary; |
|
330 |
|
331 public: |
|
332 |
|
333 /** |
|
334 * <p>Default constructor.</p> |
|
335 * |
|
336 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the |
|
337 * engine is deleted. The DictionaryMatcher must contain costs for each word |
|
338 * in order for the dictionary to work properly. |
|
339 */ |
|
340 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); |
|
341 |
|
342 /** |
|
343 * <p>Virtual destructor.</p> |
|
344 */ |
|
345 virtual ~CjkBreakEngine(); |
|
346 |
|
347 protected: |
|
348 /** |
|
349 * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
|
350 * |
|
351 * @param text A UText representing the text |
|
352 * @param rangeStart The start of the range of dictionary characters |
|
353 * @param rangeEnd The end of the range of dictionary characters |
|
354 * @param foundBreaks Output of C array of int32_t break positions, or 0 |
|
355 * @return The number of breaks found |
|
356 */ |
|
357 virtual int32_t divideUpDictionaryRange( UText *text, |
|
358 int32_t rangeStart, |
|
359 int32_t rangeEnd, |
|
360 UStack &foundBreaks ) const; |
|
361 |
|
362 }; |
|
363 |
|
364 #endif |
|
365 |
|
366 U_NAMESPACE_END |
|
367 |
|
368 /* DICTBE_H */ |
|
369 #endif |