intl/icu/source/i18n/unicode/ucsdet.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2005-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * file name: ucsdet.h
michael@0 7 * encoding: US-ASCII
michael@0 8 * indentation:4
michael@0 9 *
michael@0 10 * created on: 2005Aug04
michael@0 11 * created by: Andy Heninger
michael@0 12 *
michael@0 13 * ICU Character Set Detection, API for C
michael@0 14 *
michael@0 15 * Draft version 18 Oct 2005
michael@0 16 *
michael@0 17 */
michael@0 18
michael@0 19 #ifndef __UCSDET_H
michael@0 20 #define __UCSDET_H
michael@0 21
michael@0 22 #include "unicode/utypes.h"
michael@0 23
michael@0 24 #if !UCONFIG_NO_CONVERSION
michael@0 25
michael@0 26 #include "unicode/localpointer.h"
michael@0 27 #include "unicode/uenum.h"
michael@0 28
michael@0 29 /**
michael@0 30 * \file
michael@0 31 * \brief C API: Charset Detection API
michael@0 32 *
michael@0 33 * This API provides a facility for detecting the
michael@0 34 * charset or encoding of character data in an unknown text format.
michael@0 35 * The input data can be from an array of bytes.
michael@0 36 * <p>
michael@0 37 * Character set detection is at best an imprecise operation. The detection
michael@0 38 * process will attempt to identify the charset that best matches the characteristics
michael@0 39 * of the byte data, but the process is partly statistical in nature, and
michael@0 40 * the results can not be guaranteed to always be correct.
michael@0 41 * <p>
michael@0 42 * For best accuracy in charset detection, the input data should be primarily
michael@0 43 * in a single language, and a minimum of a few hundred bytes worth of plain text
michael@0 44 * in the language are needed. The detection process will attempt to
michael@0 45 * ignore HTML or XML style markup that could otherwise obscure the content.
michael@0 46 */
michael@0 47
michael@0 48
michael@0 49 struct UCharsetDetector;
michael@0 50 /**
michael@0 51 * Structure representing a charset detector
michael@0 52 * @stable ICU 3.6
michael@0 53 */
michael@0 54 typedef struct UCharsetDetector UCharsetDetector;
michael@0 55
michael@0 56 struct UCharsetMatch;
michael@0 57 /**
michael@0 58 * Opaque structure representing a match that was identified
michael@0 59 * from a charset detection operation.
michael@0 60 * @stable ICU 3.6
michael@0 61 */
michael@0 62 typedef struct UCharsetMatch UCharsetMatch;
michael@0 63
michael@0 64 /**
michael@0 65 * Open a charset detector.
michael@0 66 *
michael@0 67 * @param status Any error conditions occurring during the open
michael@0 68 * operation are reported back in this variable.
michael@0 69 * @return the newly opened charset detector.
michael@0 70 * @stable ICU 3.6
michael@0 71 */
michael@0 72 U_STABLE UCharsetDetector * U_EXPORT2
michael@0 73 ucsdet_open(UErrorCode *status);
michael@0 74
michael@0 75 /**
michael@0 76 * Close a charset detector. All storage and any other resources
michael@0 77 * owned by this charset detector will be released. Failure to
michael@0 78 * close a charset detector when finished with it can result in
michael@0 79 * memory leaks in the application.
michael@0 80 *
michael@0 81 * @param ucsd The charset detector to be closed.
michael@0 82 * @stable ICU 3.6
michael@0 83 */
michael@0 84 U_STABLE void U_EXPORT2
michael@0 85 ucsdet_close(UCharsetDetector *ucsd);
michael@0 86
michael@0 87 #if U_SHOW_CPLUSPLUS_API
michael@0 88
michael@0 89 U_NAMESPACE_BEGIN
michael@0 90
michael@0 91 /**
michael@0 92 * \class LocalUCharsetDetectorPointer
michael@0 93 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
michael@0 94 * For most methods see the LocalPointerBase base class.
michael@0 95 *
michael@0 96 * @see LocalPointerBase
michael@0 97 * @see LocalPointer
michael@0 98 * @stable ICU 4.4
michael@0 99 */
michael@0 100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
michael@0 101
michael@0 102 U_NAMESPACE_END
michael@0 103
michael@0 104 #endif
michael@0 105
michael@0 106 /**
michael@0 107 * Set the input byte data whose charset is to be detected.
michael@0 108 *
michael@0 109 * Ownership of the input text byte array remains with the caller.
michael@0 110 * The input string must not be altered or deleted until the charset
michael@0 111 * detector is either closed or reset to refer to different input text.
michael@0 112 *
michael@0 113 * @param ucsd the charset detector to be used.
michael@0 114 * @param textIn the input text of unknown encoding.
michael@0 115 * @param len the length of the input text, or -1 if the text
michael@0 116 * is NUL terminated.
michael@0 117 * @param status any error conditions are reported back in this variable.
michael@0 118 *
michael@0 119 * @stable ICU 3.6
michael@0 120 */
michael@0 121 U_STABLE void U_EXPORT2
michael@0 122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
michael@0 123
michael@0 124
michael@0 125 /** Set the declared encoding for charset detection.
michael@0 126 * The declared encoding of an input text is an encoding obtained
michael@0 127 * by the user from an http header or xml declaration or similar source that
michael@0 128 * can be provided as an additional hint to the charset detector.
michael@0 129 *
michael@0 130 * How and whether the declared encoding will be used during the
michael@0 131 * detection process is TBD.
michael@0 132 *
michael@0 133 * @param ucsd the charset detector to be used.
michael@0 134 * @param encoding an encoding for the current data obtained from
michael@0 135 * a header or declaration or other source outside
michael@0 136 * of the byte data itself.
michael@0 137 * @param length the length of the encoding name, or -1 if the name string
michael@0 138 * is NUL terminated.
michael@0 139 * @param status any error conditions are reported back in this variable.
michael@0 140 *
michael@0 141 * @stable ICU 3.6
michael@0 142 */
michael@0 143 U_STABLE void U_EXPORT2
michael@0 144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
michael@0 145
michael@0 146
michael@0 147 /**
michael@0 148 * Return the charset that best matches the supplied input data.
michael@0 149 *
michael@0 150 * Note though, that because the detection
michael@0 151 * only looks at the start of the input data,
michael@0 152 * there is a possibility that the returned charset will fail to handle
michael@0 153 * the full set of input data.
michael@0 154 * <p>
michael@0 155 * The returned UCharsetMatch object is owned by the UCharsetDetector.
michael@0 156 * It will remain valid until the detector input is reset, or until
michael@0 157 * the detector is closed.
michael@0 158 * <p>
michael@0 159 * The function will fail if
michael@0 160 * <ul>
michael@0 161 * <li>no charset appears to match the data.</li>
michael@0 162 * <li>no input text has been provided</li>
michael@0 163 * </ul>
michael@0 164 *
michael@0 165 * @param ucsd the charset detector to be used.
michael@0 166 * @param status any error conditions are reported back in this variable.
michael@0 167 * @return a UCharsetMatch representing the best matching charset,
michael@0 168 * or NULL if no charset matches the byte data.
michael@0 169 *
michael@0 170 * @stable ICU 3.6
michael@0 171 */
michael@0 172 U_STABLE const UCharsetMatch * U_EXPORT2
michael@0 173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
michael@0 174
michael@0 175
michael@0 176 /**
michael@0 177 * Find all charset matches that appear to be consistent with the input,
michael@0 178 * returning an array of results. The results are ordered with the
michael@0 179 * best quality match first.
michael@0 180 *
michael@0 181 * Because the detection only looks at a limited amount of the
michael@0 182 * input byte data, some of the returned charsets may fail to handle
michael@0 183 * all of the input data.
michael@0 184 * <p>
michael@0 185 * The returned UCharsetMatch objects are owned by the UCharsetDetector.
michael@0 186 * They will remain valid until the detector is closed or modified.
michael@0 187 *
michael@0 188 * <p>
michael@0 189 * Return an error if
michael@0 190 * <ul>
michael@0 191 * <li>no charsets appear to match the input data.</li>
michael@0 192 * <li>no input text has been provided</li>
michael@0 193 * </ul>
michael@0 194 *
michael@0 195 * @param ucsd the charset detector to be used.
michael@0 196 * @param matchesFound pointer to a variable that will be set to the
michael@0 197 * number of charsets identified that are consistent with
michael@0 198 * the input data. Output only.
michael@0 199 * @param status any error conditions are reported back in this variable.
michael@0 200 * @return A pointer to an array of pointers to UCharsetMatch objects.
michael@0 201 * This array, and the UCharsetMatch instances to which it refers,
michael@0 202 * are owned by the UCharsetDetector, and will remain valid until
michael@0 203 * the detector is closed or modified.
michael@0 204 * @stable ICU 3.6
michael@0 205 */
michael@0 206 U_STABLE const UCharsetMatch ** U_EXPORT2
michael@0 207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
michael@0 208
michael@0 209
michael@0 210
michael@0 211 /**
michael@0 212 * Get the name of the charset represented by a UCharsetMatch.
michael@0 213 *
michael@0 214 * The storage for the returned name string is owned by the
michael@0 215 * UCharsetMatch, and will remain valid while the UCharsetMatch
michael@0 216 * is valid.
michael@0 217 *
michael@0 218 * The name returned is suitable for use with the ICU conversion APIs.
michael@0 219 *
michael@0 220 * @param ucsm The charset match object.
michael@0 221 * @param status Any error conditions are reported back in this variable.
michael@0 222 * @return The name of the matching charset.
michael@0 223 *
michael@0 224 * @stable ICU 3.6
michael@0 225 */
michael@0 226 U_STABLE const char * U_EXPORT2
michael@0 227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
michael@0 228
michael@0 229 /**
michael@0 230 * Get a confidence number for the quality of the match of the byte
michael@0 231 * data with the charset. Confidence numbers range from zero to 100,
michael@0 232 * with 100 representing complete confidence and zero representing
michael@0 233 * no confidence.
michael@0 234 *
michael@0 235 * The confidence values are somewhat arbitrary. They define an
michael@0 236 * ordering within the results for any single detection operation
michael@0 237 * but are not generally comparable between the results for different input.
michael@0 238 *
michael@0 239 * A confidence value of ten does have a general meaning - it is used
michael@0 240 * for charsets that can represent the input data, but for which there
michael@0 241 * is no other indication that suggests that the charset is the correct one.
michael@0 242 * Pure 7 bit ASCII data, for example, is compatible with a
michael@0 243 * great many charsets, most of which will appear as possible matches
michael@0 244 * with a confidence of 10.
michael@0 245 *
michael@0 246 * @param ucsm The charset match object.
michael@0 247 * @param status Any error conditions are reported back in this variable.
michael@0 248 * @return A confidence number for the charset match.
michael@0 249 *
michael@0 250 * @stable ICU 3.6
michael@0 251 */
michael@0 252 U_STABLE int32_t U_EXPORT2
michael@0 253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
michael@0 254
michael@0 255 /**
michael@0 256 * Get the RFC 3066 code for the language of the input data.
michael@0 257 *
michael@0 258 * The Charset Detection service is intended primarily for detecting
michael@0 259 * charsets, not language. For some, but not all, charsets, a language is
michael@0 260 * identified as a byproduct of the detection process, and that is what
michael@0 261 * is returned by this function.
michael@0 262 *
michael@0 263 * CAUTION:
michael@0 264 * 1. Language information is not available for input data encoded in
michael@0 265 * all charsets. In particular, no language is identified
michael@0 266 * for UTF-8 input data.
michael@0 267 *
michael@0 268 * 2. Closely related languages may sometimes be confused.
michael@0 269 *
michael@0 270 * If more accurate language detection is required, a linguistic
michael@0 271 * analysis package should be used.
michael@0 272 *
michael@0 273 * The storage for the returned name string is owned by the
michael@0 274 * UCharsetMatch, and will remain valid while the UCharsetMatch
michael@0 275 * is valid.
michael@0 276 *
michael@0 277 * @param ucsm The charset match object.
michael@0 278 * @param status Any error conditions are reported back in this variable.
michael@0 279 * @return The RFC 3066 code for the language of the input data, or
michael@0 280 * an empty string if the language could not be determined.
michael@0 281 *
michael@0 282 * @stable ICU 3.6
michael@0 283 */
michael@0 284 U_STABLE const char * U_EXPORT2
michael@0 285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
michael@0 286
michael@0 287
michael@0 288 /**
michael@0 289 * Get the entire input text as a UChar string, placing it into
michael@0 290 * a caller-supplied buffer. A terminating
michael@0 291 * NUL character will be appended to the buffer if space is available.
michael@0 292 *
michael@0 293 * The number of UChars in the output string, not including the terminating
michael@0 294 * NUL, is returned.
michael@0 295 *
michael@0 296 * If the supplied buffer is smaller than required to hold the output,
michael@0 297 * the contents of the buffer are undefined. The full output string length
michael@0 298 * (in UChars) is returned as always, and can be used to allocate a buffer
michael@0 299 * of the correct size.
michael@0 300 *
michael@0 301 *
michael@0 302 * @param ucsm The charset match object.
michael@0 303 * @param buf A UChar buffer to be filled with the converted text data.
michael@0 304 * @param cap The capacity of the buffer in UChars.
michael@0 305 * @param status Any error conditions are reported back in this variable.
michael@0 306 * @return The number of UChars in the output string.
michael@0 307 *
michael@0 308 * @stable ICU 3.6
michael@0 309 */
michael@0 310 U_STABLE int32_t U_EXPORT2
michael@0 311 ucsdet_getUChars(const UCharsetMatch *ucsm,
michael@0 312 UChar *buf, int32_t cap, UErrorCode *status);
michael@0 313
michael@0 314
michael@0 315
michael@0 316 /**
michael@0 317 * Get an iterator over the set of all detectable charsets -
michael@0 318 * over the charsets that are known to the charset detection
michael@0 319 * service.
michael@0 320 *
michael@0 321 * The returned UEnumeration provides access to the names of
michael@0 322 * the charsets.
michael@0 323 *
michael@0 324 * <p>
michael@0 325 * The state of the Charset detector that is passed in does not
michael@0 326 * affect the result of this function, but requiring a valid, open
michael@0 327 * charset detector as a parameter ensures that the charset detection
michael@0 328 * service has been safely initialized and that the required detection
michael@0 329 * data is available.
michael@0 330 *
michael@0 331 * <p>
michael@0 332 * <b>Note:</b> Multiple different charset encodings in the same family may use
michael@0 333 * a single shared name in this implementation. For example, this function returns
michael@0 334 * an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
michael@0 335 * (Windows Latin 1). However, the actual detection result could be "windows-1252"
michael@0 336 * when the input data matches Latin 1 code points together with code points that
michael@0 337 * are only available in "windows-1252".
michael@0 338 *
michael@0 339 * @param ucsd a Charset detector.
michael@0 340 * @param status Any error conditions are reported back in this variable.
michael@0 341 * @return an iterator providing access to the detectable charset names.
michael@0 342 * @stable ICU 3.6
michael@0 343 */
michael@0 344 U_STABLE UEnumeration * U_EXPORT2
michael@0 345 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
michael@0 346
michael@0 347 /**
michael@0 348 * Test whether input filtering is enabled for this charset detector.
michael@0 349 * Input filtering removes text that appears to be HTML or XML
michael@0 350 * markup from the input before applying the code page detection
michael@0 351 * heuristics.
michael@0 352 *
michael@0 353 * @param ucsd The charset detector to check.
michael@0 354 * @return TRUE if filtering is enabled.
michael@0 355 * @stable ICU 3.6
michael@0 356 */
michael@0 357
michael@0 358 U_STABLE UBool U_EXPORT2
michael@0 359 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
michael@0 360
michael@0 361
michael@0 362 /**
michael@0 363 * Enable filtering of input text. If filtering is enabled,
michael@0 364 * text within angle brackets ("<" and ">") will be removed
michael@0 365 * before detection, which will remove most HTML or XML markup.
michael@0 366 *
michael@0 367 * @param ucsd the charset detector to be modified.
michael@0 368 * @param filter <code>TRUE</code> to enable input text filtering.
michael@0 369 * @return The previous setting.
michael@0 370 *
michael@0 371 * @stable ICU 3.6
michael@0 372 */
michael@0 373 U_STABLE UBool U_EXPORT2
michael@0 374 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
michael@0 375
michael@0 376 #ifndef U_HIDE_INTERNAL_API
michael@0 377 /**
michael@0 378 * Get an iterator over the set of detectable charsets -
michael@0 379 * over the charsets that are enabled by the specified charset detector.
michael@0 380 *
michael@0 381 * The returned UEnumeration provides access to the names of
michael@0 382 * the charsets.
michael@0 383 *
michael@0 384 * @param ucsd a Charset detector.
michael@0 385 * @param status Any error conditions are reported back in this variable.
michael@0 386 * @return an iterator providing access to the detectable charset names by
michael@0 387 * the specified charset detector.
michael@0 388 * @internal
michael@0 389 */
michael@0 390 U_INTERNAL UEnumeration * U_EXPORT2
michael@0 391 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
michael@0 392
michael@0 393 /**
michael@0 394 * Enable or disable an individual charset encoding.
michael@0 395 * The name of the charset encoding must be one of the names returned by
michael@0 396 * ucsdet_getAllDetectableCharsets().
michael@0 397 *
michael@0 398 * @param ucsd a Charset detector.
michael@0 399 * @param encoding the name of the charset encoding.
michael@0 400 * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
michael@0 401 * charset encoding.
michael@0 402 * @param status receives the return status. When the name of charset encoding
michael@0 403 * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
michael@0 404 * @internal
michael@0 405 */
michael@0 406 U_INTERNAL void U_EXPORT2
michael@0 407 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
michael@0 408 #endif /* U_HIDE_INTERNAL_API */
michael@0 409
michael@0 410 #endif
michael@0 411 #endif /* __UCSDET_H */
michael@0 412
michael@0 413

mercurial