| |
1 /** |
| |
2 ************************************************************************************ |
| |
3 * Copyright (C) 2006-2012, International Business Machines Corporation and others. * |
| |
4 * All Rights Reserved. * |
| |
5 ************************************************************************************ |
| |
6 */ |
| |
7 |
| |
8 #ifndef BRKENG_H |
| |
9 #define BRKENG_H |
| |
10 |
| |
11 #include "unicode/utypes.h" |
| |
12 #include "unicode/uobject.h" |
| |
13 #include "unicode/utext.h" |
| |
14 #include "unicode/uscript.h" |
| |
15 |
| |
16 U_NAMESPACE_BEGIN |
| |
17 |
| |
18 class UnicodeSet; |
| |
19 class UStack; |
| |
20 class DictionaryMatcher; |
| |
21 |
| |
22 /******************************************************************* |
| |
23 * LanguageBreakEngine |
| |
24 */ |
| |
25 |
| |
26 /** |
| |
27 * <p>LanguageBreakEngines implement language-specific knowledge for |
| |
28 * finding text boundaries within a run of characters belonging to a |
| |
29 * specific set. The boundaries will be of a specific kind, e.g. word, |
| |
30 * line, etc.</p> |
| |
31 * |
| |
32 * <p>LanguageBreakEngines should normally be implemented so as to |
| |
33 * be shared between threads without locking.</p> |
| |
34 */ |
| |
35 class LanguageBreakEngine : public UMemory { |
| |
36 public: |
| |
37 |
| |
38 /** |
| |
39 * <p>Default constructor.</p> |
| |
40 * |
| |
41 */ |
| |
42 LanguageBreakEngine(); |
| |
43 |
| |
44 /** |
| |
45 * <p>Virtual destructor.</p> |
| |
46 */ |
| |
47 virtual ~LanguageBreakEngine(); |
| |
48 |
| |
49 /** |
| |
50 * <p>Indicate whether this engine handles a particular character for |
| |
51 * a particular kind of break.</p> |
| |
52 * |
| |
53 * @param c A character which begins a run that the engine might handle |
| |
54 * @param breakType The type of text break which the caller wants to determine |
| |
55 * @return TRUE if this engine handles the particular character and break |
| |
56 * type. |
| |
57 */ |
| |
58 virtual UBool handles(UChar32 c, int32_t breakType) const = 0; |
| |
59 |
| |
60 /** |
| |
61 * <p>Find any breaks within a run in the supplied text.</p> |
| |
62 * |
| |
63 * @param text A UText representing the text. The |
| |
64 * iterator is left at the end of the run of characters which the engine |
| |
65 * is capable of handling. |
| |
66 * @param startPos The start of the run within the supplied text. |
| |
67 * @param endPos The end of the run within the supplied text. |
| |
68 * @param reverse Whether the caller is looking for breaks in a reverse |
| |
69 * direction. |
| |
70 * @param breakType The type of break desired, or -1. |
| |
71 * @param foundBreaks An allocated C array of the breaks found, if any |
| |
72 * @return The number of breaks found. |
| |
73 */ |
| |
74 virtual int32_t findBreaks( UText *text, |
| |
75 int32_t startPos, |
| |
76 int32_t endPos, |
| |
77 UBool reverse, |
| |
78 int32_t breakType, |
| |
79 UStack &foundBreaks ) const = 0; |
| |
80 |
| |
81 }; |
| |
82 |
| |
83 /******************************************************************* |
| |
84 * LanguageBreakFactory |
| |
85 */ |
| |
86 |
| |
87 /** |
| |
88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine |
| |
89 * that can determine breaks for characters in a specific set, if |
| |
90 * such an object can be found.</p> |
| |
91 * |
| |
92 * <p>If a LanguageBreakFactory is to be shared between threads, |
| |
93 * appropriate synchronization must be used; there is none internal |
| |
94 * to the factory.</p> |
| |
95 * |
| |
96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can |
| |
97 * normally be shared between threads without synchronization, unless |
| |
98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p> |
| |
99 * |
| |
100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine |
| |
101 * it returns when it itself is deleted, unless the specific subclass of |
| |
102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should |
| |
103 * not be deleted until the LanguageBreakEngines it has returned are no |
| |
104 * longer needed.</p> |
| |
105 */ |
| |
106 class LanguageBreakFactory : public UMemory { |
| |
107 public: |
| |
108 |
| |
109 /** |
| |
110 * <p>Default constructor.</p> |
| |
111 * |
| |
112 */ |
| |
113 LanguageBreakFactory(); |
| |
114 |
| |
115 /** |
| |
116 * <p>Virtual destructor.</p> |
| |
117 */ |
| |
118 virtual ~LanguageBreakFactory(); |
| |
119 |
| |
120 /** |
| |
121 * <p>Find and return a LanguageBreakEngine that can find the desired |
| |
122 * kind of break for the set of characters to which the supplied |
| |
123 * character belongs. It is up to the set of available engines to |
| |
124 * determine what the sets of characters are.</p> |
| |
125 * |
| |
126 * @param c A character that begins a run for which a LanguageBreakEngine is |
| |
127 * sought. |
| |
128 * @param breakType The kind of text break for which a LanguageBreakEngine is |
| |
129 * sought. |
| |
130 * @return A LanguageBreakEngine with the desired characteristics, or 0. |
| |
131 */ |
| |
132 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; |
| |
133 |
| |
134 }; |
| |
135 |
| |
136 /******************************************************************* |
| |
137 * UnhandledEngine |
| |
138 */ |
| |
139 |
| |
140 /** |
| |
141 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that |
| |
142 * handles characters that no other LanguageBreakEngine is available to |
| |
143 * handle. It is told the character and the type of break; at its |
| |
144 * discretion it may handle more than the specified character (e.g., |
| |
145 * the entire script to which that character belongs).</p> |
| |
146 * |
| |
147 * <p>UnhandledEngines may not be shared between threads without |
| |
148 * external synchronization.</p> |
| |
149 */ |
| |
150 |
| |
151 class UnhandledEngine : public LanguageBreakEngine { |
| |
152 private: |
| |
153 |
| |
154 /** |
| |
155 * The sets of characters handled, for each break type |
| |
156 * @internal |
| |
157 */ |
| |
158 |
| |
159 UnicodeSet *fHandled[4]; |
| |
160 |
| |
161 public: |
| |
162 |
| |
163 /** |
| |
164 * <p>Default constructor.</p> |
| |
165 * |
| |
166 */ |
| |
167 UnhandledEngine(UErrorCode &status); |
| |
168 |
| |
169 /** |
| |
170 * <p>Virtual destructor.</p> |
| |
171 */ |
| |
172 virtual ~UnhandledEngine(); |
| |
173 |
| |
174 /** |
| |
175 * <p>Indicate whether this engine handles a particular character for |
| |
176 * a particular kind of break.</p> |
| |
177 * |
| |
178 * @param c A character which begins a run that the engine might handle |
| |
179 * @param breakType The type of text break which the caller wants to determine |
| |
180 * @return TRUE if this engine handles the particular character and break |
| |
181 * type. |
| |
182 */ |
| |
183 virtual UBool handles(UChar32 c, int32_t breakType) const; |
| |
184 |
| |
185 /** |
| |
186 * <p>Find any breaks within a run in the supplied text.</p> |
| |
187 * |
| |
188 * @param text A UText representing the text (TODO: UText). The |
| |
189 * iterator is left at the end of the run of characters which the engine |
| |
190 * is capable of handling. |
| |
191 * @param startPos The start of the run within the supplied text. |
| |
192 * @param endPos The end of the run within the supplied text. |
| |
193 * @param reverse Whether the caller is looking for breaks in a reverse |
| |
194 * direction. |
| |
195 * @param breakType The type of break desired, or -1. |
| |
196 * @param foundBreaks An allocated C array of the breaks found, if any |
| |
197 * @return The number of breaks found. |
| |
198 */ |
| |
199 virtual int32_t findBreaks( UText *text, |
| |
200 int32_t startPos, |
| |
201 int32_t endPos, |
| |
202 UBool reverse, |
| |
203 int32_t breakType, |
| |
204 UStack &foundBreaks ) const; |
| |
205 |
| |
206 /** |
| |
207 * <p>Tell the engine to handle a particular character and break type.</p> |
| |
208 * |
| |
209 * @param c A character which the engine should handle |
| |
210 * @param breakType The type of text break for which the engine should handle c |
| |
211 */ |
| |
212 virtual void handleCharacter(UChar32 c, int32_t breakType); |
| |
213 |
| |
214 }; |
| |
215 |
| |
216 /******************************************************************* |
| |
217 * ICULanguageBreakFactory |
| |
218 */ |
| |
219 |
| |
220 /** |
| |
221 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for |
| |
222 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary |
| |
223 * data in the ICU data file.</p> |
| |
224 */ |
| |
225 class ICULanguageBreakFactory : public LanguageBreakFactory { |
| |
226 private: |
| |
227 |
| |
228 /** |
| |
229 * The stack of break engines created by this factory |
| |
230 * @internal |
| |
231 */ |
| |
232 |
| |
233 UStack *fEngines; |
| |
234 |
| |
235 public: |
| |
236 |
| |
237 /** |
| |
238 * <p>Standard constructor.</p> |
| |
239 * |
| |
240 */ |
| |
241 ICULanguageBreakFactory(UErrorCode &status); |
| |
242 |
| |
243 /** |
| |
244 * <p>Virtual destructor.</p> |
| |
245 */ |
| |
246 virtual ~ICULanguageBreakFactory(); |
| |
247 |
| |
248 /** |
| |
249 * <p>Find and return a LanguageBreakEngine that can find the desired |
| |
250 * kind of break for the set of characters to which the supplied |
| |
251 * character belongs. It is up to the set of available engines to |
| |
252 * determine what the sets of characters are.</p> |
| |
253 * |
| |
254 * @param c A character that begins a run for which a LanguageBreakEngine is |
| |
255 * sought. |
| |
256 * @param breakType The kind of text break for which a LanguageBreakEngine is |
| |
257 * sought. |
| |
258 * @return A LanguageBreakEngine with the desired characteristics, or 0. |
| |
259 */ |
| |
260 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); |
| |
261 |
| |
262 protected: |
| |
263 /** |
| |
264 * <p>Create a LanguageBreakEngine for the set of characters to which |
| |
265 * the supplied character belongs, for the specified break type.</p> |
| |
266 * |
| |
267 * @param c A character that begins a run for which a LanguageBreakEngine is |
| |
268 * sought. |
| |
269 * @param breakType The kind of text break for which a LanguageBreakEngine is |
| |
270 * sought. |
| |
271 * @return A LanguageBreakEngine with the desired characteristics, or 0. |
| |
272 */ |
| |
273 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); |
| |
274 |
| |
275 /** |
| |
276 * <p>Create a DictionaryMatcher for the specified script and break type.</p> |
| |
277 * @param script An ISO 15924 script code that identifies the dictionary to be |
| |
278 * created. |
| |
279 * @param breakType The kind of text break for which a dictionary is |
| |
280 * sought. |
| |
281 * @return A DictionaryMatcher with the desired characteristics, or NULL. |
| |
282 */ |
| |
283 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); |
| |
284 }; |
| |
285 |
| |
286 U_NAMESPACE_END |
| |
287 |
| |
288 /* BRKENG_H */ |
| |
289 #endif |