1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/unicode/ucsdet.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,413 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2013, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + * file name: ucsdet.h 1.10 + * encoding: US-ASCII 1.11 + * indentation:4 1.12 + * 1.13 + * created on: 2005Aug04 1.14 + * created by: Andy Heninger 1.15 + * 1.16 + * ICU Character Set Detection, API for C 1.17 + * 1.18 + * Draft version 18 Oct 2005 1.19 + * 1.20 + */ 1.21 + 1.22 +#ifndef __UCSDET_H 1.23 +#define __UCSDET_H 1.24 + 1.25 +#include "unicode/utypes.h" 1.26 + 1.27 +#if !UCONFIG_NO_CONVERSION 1.28 + 1.29 +#include "unicode/localpointer.h" 1.30 +#include "unicode/uenum.h" 1.31 + 1.32 +/** 1.33 + * \file 1.34 + * \brief C API: Charset Detection API 1.35 + * 1.36 + * This API provides a facility for detecting the 1.37 + * charset or encoding of character data in an unknown text format. 1.38 + * The input data can be from an array of bytes. 1.39 + * <p> 1.40 + * Character set detection is at best an imprecise operation. The detection 1.41 + * process will attempt to identify the charset that best matches the characteristics 1.42 + * of the byte data, but the process is partly statistical in nature, and 1.43 + * the results cannot be guaranteed to always be correct. 1.44 + * <p> 1.45 + * For best accuracy in charset detection, the input data should be primarily 1.46 + * in a single language, and a minimum of a few hundred bytes worth of plain text 1.47 + * in the language are needed. The detection process will attempt to 1.48 + * ignore html or xml style markup that could otherwise obscure the content. 
1.49 + */ 1.50 + 1.51 + 1.52 +struct UCharsetDetector; 1.53 +/** 1.54 + * Structure representing a charset detector. 1.55 + * @stable ICU 3.6 1.56 + */ 1.57 +typedef struct UCharsetDetector UCharsetDetector; 1.58 + 1.59 +struct UCharsetMatch; 1.60 +/** 1.61 + * Opaque structure representing a match that was identified 1.62 + * from a charset detection operation. 1.63 + * @stable ICU 3.6 1.64 + */ 1.65 +typedef struct UCharsetMatch UCharsetMatch; 1.66 + 1.67 +/** 1.68 + * Open a charset detector. 1.69 + * 1.70 + * @param status Any error conditions occurring during the open 1.71 + * operation are reported back in this variable. 1.72 + * @return the newly opened charset detector. 1.73 + * @stable ICU 3.6 1.74 + */ 1.75 +U_STABLE UCharsetDetector * U_EXPORT2 1.76 +ucsdet_open(UErrorCode *status); 1.77 + 1.78 +/** 1.79 + * Close a charset detector. All storage and any other resources 1.80 + * owned by this charset detector will be released. Failure to 1.81 + * close a charset detector when finished with it can result in 1.82 + * memory leaks in the application. 1.83 + * 1.84 + * @param ucsd The charset detector to be closed. 1.85 + * @stable ICU 3.6 1.86 + */ 1.87 +U_STABLE void U_EXPORT2 1.88 +ucsdet_close(UCharsetDetector *ucsd); 1.89 + 1.90 +#if U_SHOW_CPLUSPLUS_API 1.91 + 1.92 +U_NAMESPACE_BEGIN 1.93 + 1.94 +/** 1.95 + * \class LocalUCharsetDetectorPointer 1.96 + * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 1.97 + * For most methods see the LocalPointerBase base class. 1.98 + * 1.99 + * @see LocalPointerBase 1.100 + * @see LocalPointer 1.101 + * @stable ICU 4.4 1.102 + */ 1.103 +U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 1.104 + 1.105 +U_NAMESPACE_END 1.106 + 1.107 +#endif /* U_SHOW_CPLUSPLUS_API */ 1.108 + 1.109 +/** 1.110 + * Set the input byte data whose charset is to be detected. 1.111 + * 1.112 + * Ownership of the input text byte array remains with the caller. 
1.113 + * The input string must not be altered or deleted until the charset 1.114 + * detector is either closed or reset to refer to different input text. 1.115 + * 1.116 + * @param ucsd the charset detector to be used. 1.117 + * @param textIn the input text of unknown encoding. 1.118 + * @param len the length of the input text, or -1 if the text 1.119 + * is NUL terminated. 1.120 + * @param status any error conditions are reported back in this variable. 1.121 + * 1.122 + * @stable ICU 3.6 1.123 + */ 1.124 +U_STABLE void U_EXPORT2 1.125 +ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 1.126 + 1.127 + 1.128 +/** Set the declared encoding for charset detection. 1.129 + * The declared encoding of an input text is an encoding obtained 1.130 + * by the user from an http header or xml declaration or similar source that 1.131 + * can be provided as an additional hint to the charset detector. 1.132 + * 1.133 + * How and whether the declared encoding will be used during the 1.134 + * detection process is TBD. 1.135 + * 1.136 + * @param ucsd the charset detector to be used. 1.137 + * @param encoding an encoding for the current data obtained from 1.138 + * a header or declaration or other source outside 1.139 + * of the byte data itself. 1.140 + * @param length the length of the encoding name, or -1 if the name string 1.141 + * is NUL terminated. 1.142 + * @param status any error conditions are reported back in this variable. 1.143 + * 1.144 + * @stable ICU 3.6 1.145 + */ 1.146 +U_STABLE void U_EXPORT2 1.147 +ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 1.148 + 1.149 + 1.150 +/** 1.151 + * Return the charset that best matches the supplied input data. 
1.152 + * 1.153 + * Note though, that because the detection 1.154 + * only looks at the start of the input data, 1.155 + * there is a possibility that the returned charset will fail to handle 1.156 + * the full set of input data. 1.157 + * <p> 1.158 + * The returned UCharsetMatch object is owned by the UCharsetDetector. 1.159 + * It will remain valid until the detector input is reset, or until 1.160 + * the detector is closed. 1.161 + * <p> 1.162 + * The function will fail if 1.163 + * <ul> 1.164 + * <li>no charset appears to match the data.</li> 1.165 + * <li>no input text has been provided</li> 1.166 + * </ul> 1.167 + * 1.168 + * @param ucsd the charset detector to be used. 1.169 + * @param status any error conditions are reported back in this variable. 1.170 + * @return a UCharsetMatch representing the best matching charset, 1.171 + * or NULL if no charset matches the byte data. 1.172 + * 1.173 + * @stable ICU 3.6 1.174 + */ 1.175 +U_STABLE const UCharsetMatch * U_EXPORT2 1.176 +ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); 1.177 + 1.178 + 1.179 +/** 1.180 + * Find all charset matches that appear to be consistent with the input, 1.181 + * returning an array of results. The results are ordered with the 1.182 + * best quality match first. 1.183 + * 1.184 + * Because the detection only looks at a limited amount of the 1.185 + * input byte data, some of the returned charsets may fail to handle 1.186 + * all of the input data. 1.187 + * <p> 1.188 + * The returned UCharsetMatch objects are owned by the UCharsetDetector. 1.189 + * They will remain valid until the detector is closed or modified. 1.190 + * 1.191 + * <p> 1.192 + * Return an error if 1.193 + * <ul> 1.194 + * <li>no charsets appear to match the input data.</li> 1.195 + * <li>no input text has been provided</li> 1.196 + * </ul> 1.197 + * 1.198 + * @param ucsd the charset detector to be used. 
1.199 + * @param matchesFound pointer to a variable that will be set to the 1.200 + * number of charsets identified that are consistent with 1.201 + * the input data. Output only. 1.202 + * @param status any error conditions are reported back in this variable. 1.203 + * @return A pointer to an array of pointers to UCharsetMatch objects. 1.204 + * This array, and the UCharsetMatch instances to which it refers, 1.205 + * are owned by the UCharsetDetector, and will remain valid until 1.206 + * the detector is closed or modified. 1.207 + * @stable ICU 3.6 1.208 + */ 1.209 +U_STABLE const UCharsetMatch ** U_EXPORT2 1.210 +ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); 1.211 + 1.212 + 1.213 + 1.214 +/** 1.215 + * Get the name of the charset represented by a UCharsetMatch. 1.216 + * 1.217 + * The storage for the returned name string is owned by the 1.218 + * UCharsetMatch, and will remain valid while the UCharsetMatch 1.219 + * is valid. 1.220 + * 1.221 + * The name returned is suitable for use with the ICU conversion APIs. 1.222 + * 1.223 + * @param ucsm The charset match object. 1.224 + * @param status Any error conditions are reported back in this variable. 1.225 + * @return The name of the matching charset. 1.226 + * 1.227 + * @stable ICU 3.6 1.228 + */ 1.229 +U_STABLE const char * U_EXPORT2 1.230 +ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); 1.231 + 1.232 +/** 1.233 + * Get a confidence number for the quality of the match of the byte 1.234 + * data with the charset. Confidence numbers range from zero to 100, 1.235 + * with 100 representing complete confidence and zero representing 1.236 + * no confidence. 1.237 + * 1.238 + * The confidence values are somewhat arbitrary. They define 1.239 + * an ordering within the results for any single detection operation 1.240 + * but are not generally comparable between the results for different input. 
1.241 + * 1.242 + * A confidence value of ten does have a general meaning - it is used 1.243 + * for charsets that can represent the input data, but for which there 1.244 + * is no other indication that suggests that the charset is the correct one. 1.245 + * Pure 7 bit ASCII data, for example, is compatible with a 1.246 + * great many charsets, most of which will appear as possible matches 1.247 + * with a confidence of 10. 1.248 + * 1.249 + * @param ucsm The charset match object. 1.250 + * @param status Any error conditions are reported back in this variable. 1.251 + * @return A confidence number for the charset match. 1.252 + * 1.253 + * @stable ICU 3.6 1.254 + */ 1.255 +U_STABLE int32_t U_EXPORT2 1.256 +ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); 1.257 + 1.258 +/** 1.259 + * Get the RFC 3066 code for the language of the input data. 1.260 + * 1.261 + * The Charset Detection service is intended primarily for detecting 1.262 + * charsets, not language. For some, but not all, charsets, a language is 1.263 + * identified as a byproduct of the detection process, and that is what 1.264 + * is returned by this function. 1.265 + * 1.266 + * CAUTION: 1.267 + * 1. Language information is not available for input data encoded in 1.268 + * all charsets. In particular, no language is identified 1.269 + * for UTF-8 input data. 1.270 + * 1.271 + * 2. Closely related languages may sometimes be confused. 1.272 + * 1.273 + * If more accurate language detection is required, a linguistic 1.274 + * analysis package should be used. 1.275 + * 1.276 + * The storage for the returned name string is owned by the 1.277 + * UCharsetMatch, and will remain valid while the UCharsetMatch 1.278 + * is valid. 1.279 + * 1.280 + * @param ucsm The charset match object. 1.281 + * @param status Any error conditions are reported back in this variable. 
1.282 + * @return The RFC 3066 code for the language of the input data, or 1.283 + * an empty string if the language could not be determined. 1.284 + * 1.285 + * @stable ICU 3.6 1.286 + */ 1.287 +U_STABLE const char * U_EXPORT2 1.288 +ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); 1.289 + 1.290 + 1.291 +/** 1.292 + * Get the entire input text as a UChar string, placing it into 1.293 + * a caller-supplied buffer. A terminating 1.294 + * NUL character will be appended to the buffer if space is available. 1.295 + * 1.296 + * The number of UChars in the output string, not including the terminating 1.297 + * NUL, is returned. 1.298 + * 1.299 + * If the supplied buffer is smaller than required to hold the output, 1.300 + * the contents of the buffer are undefined. The full output string length 1.301 + * (in UChars) is returned as always, and can be used to allocate a buffer 1.302 + * of the correct size. 1.303 + * 1.304 + * 1.305 + * @param ucsm The charset match object. 1.306 + * @param buf A UChar buffer to be filled with the converted text data. 1.307 + * @param cap The capacity of the buffer in UChars. 1.308 + * @param status Any error conditions are reported back in this variable. 1.309 + * @return The number of UChars in the output string. 1.310 + * 1.311 + * @stable ICU 3.6 1.312 + */ 1.313 +U_STABLE int32_t U_EXPORT2 1.314 +ucsdet_getUChars(const UCharsetMatch *ucsm, 1.315 + UChar *buf, int32_t cap, UErrorCode *status); 1.316 + 1.317 + 1.318 + 1.319 +/** 1.320 + * Get an iterator over the set of all detectable charsets - 1.321 + * over the charsets that are known to the charset detection 1.322 + * service. 1.323 + * 1.324 + * The returned UEnumeration provides access to the names of 1.325 + * the charsets. 
1.326 + * 1.327 + * <p> 1.328 + * The state of the Charset detector that is passed in does not 1.329 + * affect the result of this function, but requiring a valid, open 1.330 + * charset detector as a parameter ensures that the charset detection 1.331 + * service has been safely initialized and that the required detection 1.332 + * data is available. 1.333 + * 1.334 + * <p> 1.335 + * <b>Note:</b> Multiple different charset encodings in the same family may use 1.336 + * a single shared name in this implementation. For example, this method returns 1.337 + * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 1.338 + * (Windows Latin 1). However, actual detection result could be "windows-1252" 1.339 + * when the input data matches Latin 1 code points with any points only available 1.340 + * in "windows-1252". 1.341 + * 1.342 + * @param ucsd a Charset detector. 1.343 + * @param status Any error conditions are reported back in this variable. 1.344 + * @return an iterator providing access to the detectable charset names. 1.345 + * @stable ICU 3.6 1.346 + */ 1.347 +U_STABLE UEnumeration * U_EXPORT2 1.348 +ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 1.349 + 1.350 +/** 1.351 + * Test whether input filtering is enabled for this charset detector. 1.352 + * Input filtering removes text that appears to be HTML or xml 1.353 + * markup from the input before applying the code page detection 1.354 + * heuristics. 1.355 + * 1.356 + * @param ucsd The charset detector to check. 1.357 + * @return TRUE if filtering is enabled. 1.358 + * @stable ICU 3.6 1.359 + */ 1.360 + 1.361 +U_STABLE UBool U_EXPORT2 1.362 +ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 1.363 + 1.364 + 1.365 +/** 1.366 + * Enable filtering of input text. If filtering is enabled, 1.367 + * text within angle brackets ("<" and ">") will be removed 1.368 + * before detection, which will remove most HTML or xml markup. 
1.369 + * 1.370 + * @param ucsd the charset detector to be modified. 1.371 + * @param filter <code>TRUE</code> to enable input text filtering. 1.372 + * @return The previous setting. 1.373 + * 1.374 + * @stable ICU 3.6 1.375 + */ 1.376 +U_STABLE UBool U_EXPORT2 1.377 +ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 1.378 + 1.379 +#ifndef U_HIDE_INTERNAL_API 1.380 +/** 1.381 + * Get an iterator over the set of detectable charsets - 1.382 + * over the charsets that are enabled by the specified charset detector. 1.383 + * 1.384 + * The returned UEnumeration provides access to the names of 1.385 + * the charsets. 1.386 + * 1.387 + * @param ucsd a Charset detector. 1.388 + * @param status Any error conditions are reported back in this variable. 1.389 + * @return an iterator providing access to the detectable charset names by 1.390 + * the specified charset detector. 1.391 + * @internal 1.392 + */ 1.393 +U_INTERNAL UEnumeration * U_EXPORT2 1.394 +ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 1.395 + 1.396 +/** 1.397 + * Enable or disable an individual charset encoding. 1.398 + * The name of the charset encoding must be included in the names returned by 1.399 + * ucsdet_getAllDetectableCharsets(). 1.400 + * 1.401 + * @param ucsd a Charset detector. 1.402 + * @param encoding the name of the charset encoding. 1.403 + * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the 1.404 + * charset encoding. 1.405 + * @param status receives the return status. When the name of charset encoding 1.406 + * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 1.407 + * @internal 1.408 + */ 1.409 +U_INTERNAL void U_EXPORT2 1.410 +ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 1.411 +#endif /* U_HIDE_INTERNAL_API */ 1.412 + 1.413 +#endif /* !UCONFIG_NO_CONVERSION */ 1.414 +#endif /* __UCSDET_H */ 1.415 + 1.416 +