|
1 /* |
|
2 ************************************************************************************ |
|
3 * Copyright (C) 2006-2013, International Business Machines Corporation |
|
4 * and others. All Rights Reserved. |
|
5 ************************************************************************************ |
|
6 */ |
|
7 |
|
8 #include "unicode/utypes.h" |
|
9 |
|
10 #if !UCONFIG_NO_BREAK_ITERATION |
|
11 |
|
12 #include "brkeng.h" |
|
13 #include "dictbe.h" |
|
14 #include "unicode/uchar.h" |
|
15 #include "unicode/uniset.h" |
|
16 #include "unicode/chariter.h" |
|
17 #include "unicode/ures.h" |
|
18 #include "unicode/udata.h" |
|
19 #include "unicode/putil.h" |
|
20 #include "unicode/ustring.h" |
|
21 #include "unicode/uscript.h" |
|
22 #include "unicode/ucharstrie.h" |
|
23 #include "unicode/bytestrie.h" |
|
24 #include "charstr.h" |
|
25 #include "dictionarydata.h" |
|
26 #include "uvector.h" |
|
27 #include "umutex.h" |
|
28 #include "uresimp.h" |
|
29 #include "ubrkimpl.h" |
|
30 |
|
31 U_NAMESPACE_BEGIN |
|
32 |
|
33 /* |
|
34 ****************************************************************** |
|
35 */ |
|
36 |
|
37 LanguageBreakEngine::LanguageBreakEngine() { |
|
38 } |
|
39 |
|
40 LanguageBreakEngine::~LanguageBreakEngine() { |
|
41 } |
|
42 |
|
43 /* |
|
44 ****************************************************************** |
|
45 */ |
|
46 |
|
47 LanguageBreakFactory::LanguageBreakFactory() { |
|
48 } |
|
49 |
|
50 LanguageBreakFactory::~LanguageBreakFactory() { |
|
51 } |
|
52 |
|
53 /* |
|
54 ****************************************************************** |
|
55 */ |
|
56 |
|
57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { |
|
58 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { |
|
59 fHandled[i] = 0; |
|
60 } |
|
61 } |
|
62 |
|
63 UnhandledEngine::~UnhandledEngine() { |
|
64 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { |
|
65 if (fHandled[i] != 0) { |
|
66 delete fHandled[i]; |
|
67 } |
|
68 } |
|
69 } |
|
70 |
|
71 UBool |
|
72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const { |
|
73 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) |
|
74 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); |
|
75 } |
|
76 |
|
77 int32_t |
|
78 UnhandledEngine::findBreaks( UText *text, |
|
79 int32_t startPos, |
|
80 int32_t endPos, |
|
81 UBool reverse, |
|
82 int32_t breakType, |
|
83 UStack &/*foundBreaks*/ ) const { |
|
84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { |
|
85 UChar32 c = utext_current32(text); |
|
86 if (reverse) { |
|
87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { |
|
88 c = utext_previous32(text); |
|
89 } |
|
90 } |
|
91 else { |
|
92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { |
|
93 utext_next32(text); // TODO: recast loop to work with post-increment operations. |
|
94 c = utext_current32(text); |
|
95 } |
|
96 } |
|
97 } |
|
98 return 0; |
|
99 } |
|
100 |
|
101 void |
|
102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { |
|
103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { |
|
104 if (fHandled[breakType] == 0) { |
|
105 fHandled[breakType] = new UnicodeSet(); |
|
106 if (fHandled[breakType] == 0) { |
|
107 return; |
|
108 } |
|
109 } |
|
110 if (!fHandled[breakType]->contains(c)) { |
|
111 UErrorCode status = U_ZERO_ERROR; |
|
112 // Apply the entire script of the character. |
|
113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); |
|
114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); |
|
115 } |
|
116 } |
|
117 } |
|
118 |
|
119 /* |
|
120 ****************************************************************** |
|
121 */ |
|
122 |
|
123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { |
|
124 fEngines = 0; |
|
125 } |
|
126 |
|
127 ICULanguageBreakFactory::~ICULanguageBreakFactory() { |
|
128 if (fEngines != 0) { |
|
129 delete fEngines; |
|
130 } |
|
131 } |
|
132 |
|
133 U_NAMESPACE_END |
|
134 U_CDECL_BEGIN |
|
135 static void U_CALLCONV _deleteEngine(void *obj) { |
|
136 delete (const icu::LanguageBreakEngine *) obj; |
|
137 } |
|
138 U_CDECL_END |
|
139 U_NAMESPACE_BEGIN |
|
140 |
|
141 const LanguageBreakEngine * |
|
142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { |
|
143 UBool needsInit; |
|
144 int32_t i; |
|
145 const LanguageBreakEngine *lbe = NULL; |
|
146 UErrorCode status = U_ZERO_ERROR; |
|
147 |
|
148 // TODO: The global mutex should not be used. |
|
149 // The global mutex should only be used for short periods. |
|
150 // A ICULanguageBreakFactory specific mutex should be used. |
|
151 umtx_lock(NULL); |
|
152 needsInit = (UBool)(fEngines == NULL); |
|
153 if (!needsInit) { |
|
154 i = fEngines->size(); |
|
155 while (--i >= 0) { |
|
156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); |
|
157 if (lbe != NULL && lbe->handles(c, breakType)) { |
|
158 break; |
|
159 } |
|
160 lbe = NULL; |
|
161 } |
|
162 } |
|
163 umtx_unlock(NULL); |
|
164 |
|
165 if (lbe != NULL) { |
|
166 return lbe; |
|
167 } |
|
168 |
|
169 if (needsInit) { |
|
170 UStack *engines = new UStack(_deleteEngine, NULL, status); |
|
171 if (U_SUCCESS(status) && engines == NULL) { |
|
172 status = U_MEMORY_ALLOCATION_ERROR; |
|
173 } |
|
174 else if (U_FAILURE(status)) { |
|
175 delete engines; |
|
176 engines = NULL; |
|
177 } |
|
178 else { |
|
179 umtx_lock(NULL); |
|
180 if (fEngines == NULL) { |
|
181 fEngines = engines; |
|
182 engines = NULL; |
|
183 } |
|
184 umtx_unlock(NULL); |
|
185 delete engines; |
|
186 } |
|
187 } |
|
188 |
|
189 if (fEngines == NULL) { |
|
190 return NULL; |
|
191 } |
|
192 |
|
193 // We didn't find an engine the first time through, or there was no |
|
194 // stack. Create an engine. |
|
195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); |
|
196 |
|
197 // Now get the lock, and see if someone else has created it in the |
|
198 // meantime |
|
199 umtx_lock(NULL); |
|
200 i = fEngines->size(); |
|
201 while (--i >= 0) { |
|
202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); |
|
203 if (lbe != NULL && lbe->handles(c, breakType)) { |
|
204 break; |
|
205 } |
|
206 lbe = NULL; |
|
207 } |
|
208 if (lbe == NULL && newlbe != NULL) { |
|
209 fEngines->push((void *)newlbe, status); |
|
210 lbe = newlbe; |
|
211 newlbe = NULL; |
|
212 } |
|
213 umtx_unlock(NULL); |
|
214 |
|
215 delete newlbe; |
|
216 |
|
217 return lbe; |
|
218 } |
|
219 |
|
220 const LanguageBreakEngine * |
|
221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { |
|
222 UErrorCode status = U_ZERO_ERROR; |
|
223 UScriptCode code = uscript_getScript(c, &status); |
|
224 if (U_SUCCESS(status)) { |
|
225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); |
|
226 if (m != NULL) { |
|
227 const LanguageBreakEngine *engine = NULL; |
|
228 switch(code) { |
|
229 case USCRIPT_THAI: |
|
230 engine = new ThaiBreakEngine(m, status); |
|
231 break; |
|
232 case USCRIPT_LAO: |
|
233 engine = new LaoBreakEngine(m, status); |
|
234 break; |
|
235 case USCRIPT_KHMER: |
|
236 engine = new KhmerBreakEngine(m, status); |
|
237 break; |
|
238 |
|
239 #if !UCONFIG_NO_NORMALIZATION |
|
240 // CJK not available w/o normalization |
|
241 case USCRIPT_HANGUL: |
|
242 engine = new CjkBreakEngine(m, kKorean, status); |
|
243 break; |
|
244 |
|
245 // use same BreakEngine and dictionary for both Chinese and Japanese |
|
246 case USCRIPT_HIRAGANA: |
|
247 case USCRIPT_KATAKANA: |
|
248 case USCRIPT_HAN: |
|
249 engine = new CjkBreakEngine(m, kChineseJapanese, status); |
|
250 break; |
|
251 #if 0 |
|
252 // TODO: Have to get some characters with script=common handled |
|
253 // by CjkBreakEngine (e.g. U+309B). Simply subjecting |
|
254 // them to CjkBreakEngine does not work. The engine has to |
|
255 // special-case them. |
|
256 case USCRIPT_COMMON: |
|
257 { |
|
258 UBlockCode block = ublock_getCode(code); |
|
259 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) |
|
260 engine = new CjkBreakEngine(dict, kChineseJapanese, status); |
|
261 break; |
|
262 } |
|
263 #endif |
|
264 #endif |
|
265 |
|
266 default: |
|
267 break; |
|
268 } |
|
269 if (engine == NULL) { |
|
270 delete m; |
|
271 } |
|
272 else if (U_FAILURE(status)) { |
|
273 delete engine; |
|
274 engine = NULL; |
|
275 } |
|
276 return engine; |
|
277 } |
|
278 } |
|
279 return NULL; |
|
280 } |
|
281 |
|
282 DictionaryMatcher * |
|
283 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { |
|
284 UErrorCode status = U_ZERO_ERROR; |
|
285 // open root from brkitr tree. |
|
286 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); |
|
287 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); |
|
288 int32_t dictnlength = 0; |
|
289 const UChar *dictfname = |
|
290 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); |
|
291 if (U_FAILURE(status)) { |
|
292 ures_close(b); |
|
293 return NULL; |
|
294 } |
|
295 CharString dictnbuf; |
|
296 CharString ext; |
|
297 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot |
|
298 if (extStart != NULL) { |
|
299 int32_t len = (int32_t)(extStart - dictfname); |
|
300 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); |
|
301 dictnlength = len; |
|
302 } |
|
303 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); |
|
304 ures_close(b); |
|
305 |
|
306 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); |
|
307 if (U_SUCCESS(status)) { |
|
308 // build trie |
|
309 const uint8_t *data = (const uint8_t *)udata_getMemory(file); |
|
310 const int32_t *indexes = (const int32_t *)data; |
|
311 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; |
|
312 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
|
313 DictionaryMatcher *m = NULL; |
|
314 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
|
315 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; |
|
316 const char *characters = (const char *)(data + offset); |
|
317 m = new BytesDictionaryMatcher(characters, transform, file); |
|
318 } |
|
319 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
|
320 const UChar *characters = (const UChar *)(data + offset); |
|
321 m = new UCharsDictionaryMatcher(characters, file); |
|
322 } |
|
323 if (m == NULL) { |
|
324 // no matcher exists to take ownership - either we are an invalid |
|
325 // type or memory allocation failed |
|
326 udata_close(file); |
|
327 } |
|
328 return m; |
|
329 } else if (dictfname != NULL) { |
|
330 // we don't have a dictionary matcher. |
|
331 // returning NULL here will cause us to fail to find a dictionary break engine, as expected |
|
332 status = U_ZERO_ERROR; |
|
333 return NULL; |
|
334 } |
|
335 return NULL; |
|
336 } |
|
337 |
|
338 U_NAMESPACE_END |
|
339 |
|
340 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |