/**
 ************************************************************************************
 * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                             *
 ************************************************************************************
 */
|
7 |
|
8 #ifndef BRKENG_H |
|
9 #define BRKENG_H |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 #include "unicode/uobject.h" |
|
13 #include "unicode/utext.h" |
|
14 #include "unicode/uscript.h" |
|
15 |
|
16 U_NAMESPACE_BEGIN |
|
17 |
|
// Forward declarations: these types appear below only as pointers or
// references, so their full definitions are not needed in this header.
class DictionaryMatcher;
class UnicodeSet;
class UStack;
|
21 |
|
22 /******************************************************************* |
|
23 * LanguageBreakEngine |
|
24 */ |
|
25 |
|
26 /** |
|
27 * <p>LanguageBreakEngines implement language-specific knowledge for |
|
28 * finding text boundaries within a run of characters belonging to a |
|
29 * specific set. The boundaries will be of a specific kind, e.g. word, |
|
30 * line, etc.</p> |
|
31 * |
|
32 * <p>LanguageBreakEngines should normally be implemented so as to |
|
33 * be shared between threads without locking.</p> |
|
34 */ |
|
35 class LanguageBreakEngine : public UMemory { |
|
36 public: |
|
37 |
|
38 /** |
|
39 * <p>Default constructor.</p> |
|
40 * |
|
41 */ |
|
42 LanguageBreakEngine(); |
|
43 |
|
44 /** |
|
45 * <p>Virtual destructor.</p> |
|
46 */ |
|
47 virtual ~LanguageBreakEngine(); |
|
48 |
|
49 /** |
|
50 * <p>Indicate whether this engine handles a particular character for |
|
51 * a particular kind of break.</p> |
|
52 * |
|
53 * @param c A character which begins a run that the engine might handle |
|
54 * @param breakType The type of text break which the caller wants to determine |
|
55 * @return TRUE if this engine handles the particular character and break |
|
56 * type. |
|
57 */ |
|
58 virtual UBool handles(UChar32 c, int32_t breakType) const = 0; |
|
59 |
|
60 /** |
|
61 * <p>Find any breaks within a run in the supplied text.</p> |
|
62 * |
|
63 * @param text A UText representing the text. The |
|
64 * iterator is left at the end of the run of characters which the engine |
|
65 * is capable of handling. |
|
66 * @param startPos The start of the run within the supplied text. |
|
67 * @param endPos The end of the run within the supplied text. |
|
68 * @param reverse Whether the caller is looking for breaks in a reverse |
|
69 * direction. |
|
70 * @param breakType The type of break desired, or -1. |
|
71 * @param foundBreaks An allocated C array of the breaks found, if any |
|
72 * @return The number of breaks found. |
|
73 */ |
|
74 virtual int32_t findBreaks( UText *text, |
|
75 int32_t startPos, |
|
76 int32_t endPos, |
|
77 UBool reverse, |
|
78 int32_t breakType, |
|
79 UStack &foundBreaks ) const = 0; |
|
80 |
|
81 }; |
|
82 |
|
83 /******************************************************************* |
|
84 * LanguageBreakFactory |
|
85 */ |
|
86 |
|
87 /** |
|
88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine |
|
89 * that can determine breaks for characters in a specific set, if |
|
90 * such an object can be found.</p> |
|
91 * |
|
92 * <p>If a LanguageBreakFactory is to be shared between threads, |
|
93 * appropriate synchronization must be used; there is none internal |
|
94 * to the factory.</p> |
|
95 * |
|
96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can |
|
97 * normally be shared between threads without synchronization, unless |
|
98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> |
|
99 * |
|
100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine |
|
101 * it returns when it itself is deleted, unless the specific subclass of |
|
102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should |
|
103 * not be deleted until the LanguageBreakEngines it has returned are no |
|
104 * longer needed.</p> |
|
105 */ |
|
106 class LanguageBreakFactory : public UMemory { |
|
107 public: |
|
108 |
|
109 /** |
|
110 * <p>Default constructor.</p> |
|
111 * |
|
112 */ |
|
113 LanguageBreakFactory(); |
|
114 |
|
115 /** |
|
116 * <p>Virtual destructor.</p> |
|
117 */ |
|
118 virtual ~LanguageBreakFactory(); |
|
119 |
|
120 /** |
|
121 * <p>Find and return a LanguageBreakEngine that can find the desired |
|
122 * kind of break for the set of characters to which the supplied |
|
123 * character belongs. It is up to the set of available engines to |
|
124 * determine what the sets of characters are.</p> |
|
125 * |
|
126 * @param c A character that begins a run for which a LanguageBreakEngine is |
|
127 * sought. |
|
128 * @param breakType The kind of text break for which a LanguageBreakEngine is |
|
129 * sought. |
|
130 * @return A LanguageBreakEngine with the desired characteristics, or 0. |
|
131 */ |
|
132 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; |
|
133 |
|
134 }; |
|
135 |
|
136 /******************************************************************* |
|
137 * UnhandledEngine |
|
138 */ |
|
139 |
|
140 /** |
|
141 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that |
|
142 * handles characters that no other LanguageBreakEngine is available to |
|
143 * handle. It is told the character and the type of break; at its |
|
144 * discretion it may handle more than the specified character (e.g., |
|
145 * the entire script to which that character belongs.</p> |
|
146 * |
|
147 * <p>UnhandledEngines may not be shared between threads without |
|
148 * external synchronization.</p> |
|
149 */ |
|
150 |
|
151 class UnhandledEngine : public LanguageBreakEngine { |
|
152 private: |
|
153 |
|
154 /** |
|
155 * The sets of characters handled, for each break type |
|
156 * @internal |
|
157 */ |
|
158 |
|
159 UnicodeSet *fHandled[4]; |
|
160 |
|
161 public: |
|
162 |
|
163 /** |
|
164 * <p>Default constructor.</p> |
|
165 * |
|
166 */ |
|
167 UnhandledEngine(UErrorCode &status); |
|
168 |
|
169 /** |
|
170 * <p>Virtual destructor.</p> |
|
171 */ |
|
172 virtual ~UnhandledEngine(); |
|
173 |
|
174 /** |
|
175 * <p>Indicate whether this engine handles a particular character for |
|
176 * a particular kind of break.</p> |
|
177 * |
|
178 * @param c A character which begins a run that the engine might handle |
|
179 * @param breakType The type of text break which the caller wants to determine |
|
180 * @return TRUE if this engine handles the particular character and break |
|
181 * type. |
|
182 */ |
|
183 virtual UBool handles(UChar32 c, int32_t breakType) const; |
|
184 |
|
185 /** |
|
186 * <p>Find any breaks within a run in the supplied text.</p> |
|
187 * |
|
188 * @param text A UText representing the text (TODO: UText). The |
|
189 * iterator is left at the end of the run of characters which the engine |
|
190 * is capable of handling. |
|
191 * @param startPos The start of the run within the supplied text. |
|
192 * @param endPos The end of the run within the supplied text. |
|
193 * @param reverse Whether the caller is looking for breaks in a reverse |
|
194 * direction. |
|
195 * @param breakType The type of break desired, or -1. |
|
196 * @param foundBreaks An allocated C array of the breaks found, if any |
|
197 * @return The number of breaks found. |
|
198 */ |
|
199 virtual int32_t findBreaks( UText *text, |
|
200 int32_t startPos, |
|
201 int32_t endPos, |
|
202 UBool reverse, |
|
203 int32_t breakType, |
|
204 UStack &foundBreaks ) const; |
|
205 |
|
206 /** |
|
207 * <p>Tell the engine to handle a particular character and break type.</p> |
|
208 * |
|
209 * @param c A character which the engine should handle |
|
210 * @param breakType The type of text break for which the engine should handle c |
|
211 */ |
|
212 virtual void handleCharacter(UChar32 c, int32_t breakType); |
|
213 |
|
214 }; |
|
215 |
|
216 /******************************************************************* |
|
217 * ICULanguageBreakFactory |
|
218 */ |
|
219 |
|
220 /** |
|
221 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for |
|
222 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary |
|
223 * data in the ICU data file.</p> |
|
224 */ |
|
225 class ICULanguageBreakFactory : public LanguageBreakFactory { |
|
226 private: |
|
227 |
|
228 /** |
|
229 * The stack of break engines created by this factory |
|
230 * @internal |
|
231 */ |
|
232 |
|
233 UStack *fEngines; |
|
234 |
|
235 public: |
|
236 |
|
237 /** |
|
238 * <p>Standard constructor.</p> |
|
239 * |
|
240 */ |
|
241 ICULanguageBreakFactory(UErrorCode &status); |
|
242 |
|
243 /** |
|
244 * <p>Virtual destructor.</p> |
|
245 */ |
|
246 virtual ~ICULanguageBreakFactory(); |
|
247 |
|
248 /** |
|
249 * <p>Find and return a LanguageBreakEngine that can find the desired |
|
250 * kind of break for the set of characters to which the supplied |
|
251 * character belongs. It is up to the set of available engines to |
|
252 * determine what the sets of characters are.</p> |
|
253 * |
|
254 * @param c A character that begins a run for which a LanguageBreakEngine is |
|
255 * sought. |
|
256 * @param breakType The kind of text break for which a LanguageBreakEngine is |
|
257 * sought. |
|
258 * @return A LanguageBreakEngine with the desired characteristics, or 0. |
|
259 */ |
|
260 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); |
|
261 |
|
262 protected: |
|
263 /** |
|
264 * <p>Create a LanguageBreakEngine for the set of characters to which |
|
265 * the supplied character belongs, for the specified break type.</p> |
|
266 * |
|
267 * @param c A character that begins a run for which a LanguageBreakEngine is |
|
268 * sought. |
|
269 * @param breakType The kind of text break for which a LanguageBreakEngine is |
|
270 * sought. |
|
271 * @return A LanguageBreakEngine with the desired characteristics, or 0. |
|
272 */ |
|
273 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); |
|
274 |
|
275 /** |
|
276 * <p>Create a DictionaryMatcher for the specified script and break type.</p> |
|
277 * @param script An ISO 15924 script code that identifies the dictionary to be |
|
278 * created. |
|
279 * @param breakType The kind of text break for which a dictionary is |
|
280 * sought. |
|
281 * @return A DictionaryMatcher with the desired characteristics, or NULL. |
|
282 */ |
|
283 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); |
|
284 }; |
|
285 |
|
286 U_NAMESPACE_END |
|
287 |
|
288 /* BRKENG_H */ |
|
289 #endif |