/*
 **********************************************************************
 * Copyright (C) 2005-2013, International Business Machines
 * Corporation and others. All Rights Reserved.
 **********************************************************************
 */
|
7 |
|
8 #include "unicode/utypes.h" |
|
9 |
|
10 #if !UCONFIG_NO_CONVERSION |
|
11 |
|
12 #include "unicode/ucsdet.h" |
|
13 |
|
14 #include "csdetect.h" |
|
15 #include "csmatch.h" |
|
16 #include "uenumimp.h" |
|
17 |
|
18 #include "cmemory.h" |
|
19 #include "cstring.h" |
|
20 #include "umutex.h" |
|
21 #include "ucln_in.h" |
|
22 #include "uarrsort.h" |
|
23 #include "inputext.h" |
|
24 #include "csrsbcs.h" |
|
25 #include "csrmbcs.h" |
|
26 #include "csrutf8.h" |
|
27 #include "csrucode.h" |
|
28 #include "csr2022.h" |
|
29 |
|
// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expressions such as *p or a[i] expand as intended.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Storage for `count` objects of `type` obtained from uprv_malloc(); the macro
// only casts the returned pointer, nothing is constructed. The whole expansion
// is parenthesized so it can be used safely inside a larger expression.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
// Hands a NEW_ARRAY block back to uprv_free(); no destructors are invoked.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
34 |
|
35 U_NAMESPACE_BEGIN |
|
36 |
|
37 struct CSRecognizerInfo : public UMemory { |
|
38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) |
|
39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; |
|
40 |
|
41 ~CSRecognizerInfo() {delete recognizer;}; |
|
42 |
|
43 CharsetRecognizer *recognizer; |
|
44 UBool isDefaultEnabled; |
|
45 }; |
|
46 |
|
47 U_NAMESPACE_END |
|
48 |
|
49 static icu::CSRecognizerInfo **fCSRecognizers = NULL; |
|
50 static icu::UInitOnce gCSRecognizersInitOnce; |
|
51 static int32_t fCSRecognizers_size = 0; |
|
52 |
|
53 U_CDECL_BEGIN |
|
54 static UBool U_CALLCONV csdet_cleanup(void) |
|
55 { |
|
56 U_NAMESPACE_USE |
|
57 if (fCSRecognizers != NULL) { |
|
58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { |
|
59 delete fCSRecognizers[r]; |
|
60 fCSRecognizers[r] = NULL; |
|
61 } |
|
62 |
|
63 DELETE_ARRAY(fCSRecognizers); |
|
64 fCSRecognizers = NULL; |
|
65 fCSRecognizers_size = 0; |
|
66 } |
|
67 gCSRecognizersInitOnce.reset(); |
|
68 |
|
69 return TRUE; |
|
70 } |
|
71 |
|
72 static int32_t U_CALLCONV |
|
73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right) |
|
74 { |
|
75 U_NAMESPACE_USE |
|
76 |
|
77 const CharsetMatch **csm_l = (const CharsetMatch **) left; |
|
78 const CharsetMatch **csm_r = (const CharsetMatch **) right; |
|
79 |
|
80 // NOTE: compare is backwards to sort from highest to lowest. |
|
81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); |
|
82 } |
|
83 |
|
84 static void U_CALLCONV initRecognizers(UErrorCode &status) { |
|
85 U_NAMESPACE_USE |
|
86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); |
|
87 CSRecognizerInfo *tempArray[] = { |
|
88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), |
|
89 |
|
90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), |
|
91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), |
|
92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), |
|
93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), |
|
94 |
|
95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), |
|
96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), |
|
97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), |
|
98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), |
|
99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), |
|
100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), |
|
101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), |
|
102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), |
|
103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), |
|
104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), |
|
105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), |
|
106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), |
|
107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), |
|
108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), |
|
109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), |
|
110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), |
|
111 |
|
112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), |
|
113 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), |
|
114 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), |
|
115 |
|
116 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), |
|
117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), |
|
118 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), |
|
119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) |
|
120 }; |
|
121 int32_t rCount = ARRAY_SIZE(tempArray); |
|
122 |
|
123 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); |
|
124 |
|
125 if (fCSRecognizers == NULL) { |
|
126 status = U_MEMORY_ALLOCATION_ERROR; |
|
127 } |
|
128 else { |
|
129 fCSRecognizers_size = rCount; |
|
130 for (int32_t r = 0; r < rCount; r += 1) { |
|
131 fCSRecognizers[r] = tempArray[r]; |
|
132 if (fCSRecognizers[r] == NULL) { |
|
133 status = U_MEMORY_ALLOCATION_ERROR; |
|
134 } |
|
135 } |
|
136 } |
|
137 } |
|
138 |
|
139 U_CDECL_END |
|
140 |
|
141 U_NAMESPACE_BEGIN |
|
142 |
|
143 void CharsetDetector::setRecognizers(UErrorCode &status) |
|
144 { |
|
145 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); |
|
146 } |
|
147 |
|
148 CharsetDetector::CharsetDetector(UErrorCode &status) |
|
149 : textIn(new InputText(status)), resultArray(NULL), |
|
150 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), |
|
151 fEnabledRecognizers(NULL) |
|
152 { |
|
153 if (U_FAILURE(status)) { |
|
154 return; |
|
155 } |
|
156 |
|
157 setRecognizers(status); |
|
158 |
|
159 if (U_FAILURE(status)) { |
|
160 return; |
|
161 } |
|
162 |
|
163 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); |
|
164 |
|
165 if (resultArray == NULL) { |
|
166 status = U_MEMORY_ALLOCATION_ERROR; |
|
167 return; |
|
168 } |
|
169 |
|
170 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { |
|
171 resultArray[i] = new CharsetMatch(); |
|
172 |
|
173 if (resultArray[i] == NULL) { |
|
174 status = U_MEMORY_ALLOCATION_ERROR; |
|
175 break; |
|
176 } |
|
177 } |
|
178 } |
|
179 |
|
180 CharsetDetector::~CharsetDetector() |
|
181 { |
|
182 delete textIn; |
|
183 |
|
184 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { |
|
185 delete resultArray[i]; |
|
186 } |
|
187 |
|
188 uprv_free(resultArray); |
|
189 |
|
190 if (fEnabledRecognizers) { |
|
191 uprv_free(fEnabledRecognizers); |
|
192 } |
|
193 } |
|
194 |
|
195 void CharsetDetector::setText(const char *in, int32_t len) |
|
196 { |
|
197 textIn->setText(in, len); |
|
198 fFreshTextSet = TRUE; |
|
199 } |
|
200 |
|
201 UBool CharsetDetector::setStripTagsFlag(UBool flag) |
|
202 { |
|
203 UBool temp = fStripTags; |
|
204 fStripTags = flag; |
|
205 fFreshTextSet = TRUE; |
|
206 return temp; |
|
207 } |
|
208 |
|
209 UBool CharsetDetector::getStripTagsFlag() const |
|
210 { |
|
211 return fStripTags; |
|
212 } |
|
213 |
|
214 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const |
|
215 { |
|
216 textIn->setDeclaredEncoding(encoding,len); |
|
217 } |
|
218 |
|
219 int32_t CharsetDetector::getDetectableCount() |
|
220 { |
|
221 UErrorCode status = U_ZERO_ERROR; |
|
222 |
|
223 setRecognizers(status); |
|
224 |
|
225 return fCSRecognizers_size; |
|
226 } |
|
227 |
|
228 const CharsetMatch *CharsetDetector::detect(UErrorCode &status) |
|
229 { |
|
230 int32_t maxMatchesFound = 0; |
|
231 |
|
232 detectAll(maxMatchesFound, status); |
|
233 |
|
234 if(maxMatchesFound > 0) { |
|
235 return resultArray[0]; |
|
236 } else { |
|
237 return NULL; |
|
238 } |
|
239 } |
|
240 |
|
241 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) |
|
242 { |
|
243 if(!textIn->isSet()) { |
|
244 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set |
|
245 |
|
246 return NULL; |
|
247 } else if (fFreshTextSet) { |
|
248 CharsetRecognizer *csr; |
|
249 int32_t i; |
|
250 |
|
251 textIn->MungeInput(fStripTags); |
|
252 |
|
253 // Iterate over all possible charsets, remember all that |
|
254 // give a match quality > 0. |
|
255 resultCount = 0; |
|
256 for (i = 0; i < fCSRecognizers_size; i += 1) { |
|
257 csr = fCSRecognizers[i]->recognizer; |
|
258 if (csr->match(textIn, resultArray[resultCount])) { |
|
259 resultCount++; |
|
260 } |
|
261 } |
|
262 |
|
263 if (resultCount > 1) { |
|
264 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); |
|
265 } |
|
266 fFreshTextSet = FALSE; |
|
267 } |
|
268 |
|
269 maxMatchesFound = resultCount; |
|
270 |
|
271 return resultArray; |
|
272 } |
|
273 |
|
274 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) |
|
275 { |
|
276 if (U_FAILURE(status)) { |
|
277 return; |
|
278 } |
|
279 |
|
280 int32_t modIdx = -1; |
|
281 UBool isDefaultVal = FALSE; |
|
282 for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
|
283 CSRecognizerInfo *csrinfo = fCSRecognizers[i]; |
|
284 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { |
|
285 modIdx = i; |
|
286 isDefaultVal = (csrinfo->isDefaultEnabled == enabled); |
|
287 break; |
|
288 } |
|
289 } |
|
290 if (modIdx < 0) { |
|
291 // No matching encoding found |
|
292 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
293 return; |
|
294 } |
|
295 |
|
296 if (fEnabledRecognizers == NULL && !isDefaultVal) { |
|
297 // Create an array storing the non default setting |
|
298 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); |
|
299 if (fEnabledRecognizers == NULL) { |
|
300 status = U_MEMORY_ALLOCATION_ERROR; |
|
301 return; |
|
302 } |
|
303 // Initialize the array with default info |
|
304 for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
|
305 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; |
|
306 } |
|
307 } |
|
308 |
|
309 if (fEnabledRecognizers != NULL) { |
|
310 fEnabledRecognizers[modIdx] = enabled; |
|
311 } |
|
312 } |
|
313 |
|
314 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const |
|
315 { |
|
316 if( index > fCSRecognizers_size-1 || index < 0) { |
|
317 status = U_INDEX_OUTOFBOUNDS_ERROR; |
|
318 |
|
319 return 0; |
|
320 } else { |
|
321 return fCSRecognizers[index]->getName(); |
|
322 } |
|
323 }*/ |
|
324 |
|
325 U_NAMESPACE_END |
|
326 |
|
327 U_CDECL_BEGIN |
|
328 typedef struct { |
|
329 int32_t currIndex; |
|
330 UBool all; |
|
331 UBool *enabledRecognizers; |
|
332 } Context; |
|
333 |
|
334 |
|
335 |
|
336 static void U_CALLCONV |
|
337 enumClose(UEnumeration *en) { |
|
338 if(en->context != NULL) { |
|
339 DELETE_ARRAY(en->context); |
|
340 } |
|
341 |
|
342 DELETE_ARRAY(en); |
|
343 } |
|
344 |
|
345 static int32_t U_CALLCONV |
|
346 enumCount(UEnumeration *en, UErrorCode *) { |
|
347 if (((Context *)en->context)->all) { |
|
348 // ucsdet_getAllDetectableCharsets, all charset detector names |
|
349 return fCSRecognizers_size; |
|
350 } |
|
351 |
|
352 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones |
|
353 int32_t count = 0; |
|
354 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; |
|
355 if (enabledArray != NULL) { |
|
356 // custom set |
|
357 for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
|
358 if (enabledArray[i]) { |
|
359 count++; |
|
360 } |
|
361 } |
|
362 } else { |
|
363 // default set |
|
364 for (int32_t i = 0; i < fCSRecognizers_size; i++) { |
|
365 if (fCSRecognizers[i]->isDefaultEnabled) { |
|
366 count++; |
|
367 } |
|
368 } |
|
369 } |
|
370 return count; |
|
371 } |
|
372 |
|
373 static const char* U_CALLCONV |
|
374 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { |
|
375 const char *currName = NULL; |
|
376 |
|
377 if (((Context *)en->context)->currIndex < fCSRecognizers_size) { |
|
378 if (((Context *)en->context)->all) { |
|
379 // ucsdet_getAllDetectableCharsets, all charset detector names |
|
380 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); |
|
381 ((Context *)en->context)->currIndex++; |
|
382 } else { |
|
383 // ucsdet_getDetectableCharsets |
|
384 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; |
|
385 if (enabledArray != NULL) { |
|
386 // custome set |
|
387 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { |
|
388 if (enabledArray[((Context *)en->context)->currIndex]) { |
|
389 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); |
|
390 } |
|
391 ((Context *)en->context)->currIndex++; |
|
392 } |
|
393 } else { |
|
394 // default set |
|
395 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { |
|
396 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { |
|
397 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); |
|
398 } |
|
399 ((Context *)en->context)->currIndex++; |
|
400 } |
|
401 } |
|
402 } |
|
403 } |
|
404 |
|
405 if(resultLength != NULL) { |
|
406 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); |
|
407 } |
|
408 |
|
409 return currName; |
|
410 } |
|
411 |
|
412 |
|
413 static void U_CALLCONV |
|
414 enumReset(UEnumeration *en, UErrorCode *) { |
|
415 ((Context *)en->context)->currIndex = 0; |
|
416 } |
|
417 |
|
418 static const UEnumeration gCSDetEnumeration = { |
|
419 NULL, |
|
420 NULL, |
|
421 enumClose, |
|
422 enumCount, |
|
423 uenum_unextDefault, |
|
424 enumNext, |
|
425 enumReset |
|
426 }; |
|
427 |
|
428 U_CDECL_END |
|
429 |
|
430 U_NAMESPACE_BEGIN |
|
431 |
|
432 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) |
|
433 { |
|
434 |
|
435 /* Initialize recognized charsets. */ |
|
436 setRecognizers(status); |
|
437 |
|
438 if(U_FAILURE(status)) { |
|
439 return 0; |
|
440 } |
|
441 |
|
442 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); |
|
443 if (en == NULL) { |
|
444 status = U_MEMORY_ALLOCATION_ERROR; |
|
445 return 0; |
|
446 } |
|
447 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); |
|
448 en->context = (void*)NEW_ARRAY(Context, 1); |
|
449 if (en->context == NULL) { |
|
450 status = U_MEMORY_ALLOCATION_ERROR; |
|
451 DELETE_ARRAY(en); |
|
452 return 0; |
|
453 } |
|
454 uprv_memset(en->context, 0, sizeof(Context)); |
|
455 ((Context*)en->context)->all = TRUE; |
|
456 return en; |
|
457 } |
|
458 |
|
459 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const |
|
460 { |
|
461 if(U_FAILURE(status)) { |
|
462 return 0; |
|
463 } |
|
464 |
|
465 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); |
|
466 if (en == NULL) { |
|
467 status = U_MEMORY_ALLOCATION_ERROR; |
|
468 return 0; |
|
469 } |
|
470 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); |
|
471 en->context = (void*)NEW_ARRAY(Context, 1); |
|
472 if (en->context == NULL) { |
|
473 status = U_MEMORY_ALLOCATION_ERROR; |
|
474 DELETE_ARRAY(en); |
|
475 return 0; |
|
476 } |
|
477 uprv_memset(en->context, 0, sizeof(Context)); |
|
478 ((Context*)en->context)->all = FALSE; |
|
479 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; |
|
480 return en; |
|
481 } |
|
482 |
|
483 U_NAMESPACE_END |
|
484 |
|
485 #endif |