intl/icu/source/i18n/unicode/ucsdet.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/unicode/ucsdet.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,413 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 2005-2013, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + *   file name:  ucsdet.h
    1.10 + *   encoding:   US-ASCII
    1.11 + *   indentation:4
    1.12 + *
    1.13 + *   created on: 2005Aug04
    1.14 + *   created by: Andy Heninger
    1.15 + *
    1.16 + *   ICU Character Set Detection, API for C
    1.17 + *
    1.18 + *   Draft version 18 Oct 2005
    1.19 + *
    1.20 + */
    1.21 +
    1.22 +#ifndef __UCSDET_H
    1.23 +#define __UCSDET_H
    1.24 +
    1.25 +#include "unicode/utypes.h"
    1.26 +
    1.27 +#if !UCONFIG_NO_CONVERSION
    1.28 +
    1.29 +#include "unicode/localpointer.h"
    1.30 +#include "unicode/uenum.h"
    1.31 +
    1.32 +/**
    1.33 + * \file 
    1.34 + * \brief C API: Charset Detection API
    1.35 + *
    1.36 + * This API provides a facility for detecting the
    1.37 + * charset or encoding of character data in an unknown text format.
    1.38 + * The input data can be from an array of bytes.
    1.39 + * <p>
    1.40 + * Character set detection is at best an imprecise operation.  The detection
    1.41 + * process will attempt to identify the charset that best matches the characteristics
    1.42 + * of the byte data, but the process is partly statistical in nature, and
    1.43 + * the results can not be guaranteed to always be correct.
    1.44 + * <p>
    1.45 + * For best accuracy in charset detection, the input data should be primarily
    1.46 + * in a single language, and a minimum of a few hundred bytes worth of plain text
    1.47 + * in the language are needed.  The detection process will attempt to
    1.48 + * ignore html or xml style markup that could otherwise obscure the content.
    1.49 + */
    1.50 + 
    1.51 +
    1.52 +struct UCharsetDetector;
    1.53 +/**
    1.54 +  * Structure representing a charset detector
    1.55 +  * @stable ICU 3.6
    1.56 +  */
    1.57 +typedef struct UCharsetDetector UCharsetDetector;
    1.58 +
    1.59 +struct UCharsetMatch;
    1.60 +/**
    1.61 +  *  Opaque structure representing a match that was identified
    1.62 +  *  from a charset detection operation.
    1.63 +  *  @stable ICU 3.6
    1.64 +  */
    1.65 +typedef struct UCharsetMatch UCharsetMatch;
    1.66 +
    1.67 +/**
    1.68 +  *  Open a charset detector.
    1.69 +  *
    1.70 +  *  @param status Any error conditions occurring during the open
    1.71 +  *                operation are reported back in this variable.
    1.72 +  *  @return the newly opened charset detector.
    1.73 +  *  @stable ICU 3.6
    1.74 +  */
    1.75 +U_STABLE UCharsetDetector * U_EXPORT2
    1.76 +ucsdet_open(UErrorCode   *status);
    1.77 +
    1.78 +/**
    1.79 +  * Close a charset detector.  All storage and any other resources
    1.80 +  *   owned by this charset detector will be released.  Failure to
    1.81 +  *   close a charset detector when finished with it can result in
    1.82 +  *   memory leaks in the application.
    1.83 +  *
    1.84 +  *  @param ucsd  The charset detector to be closed.
    1.85 +  *  @stable ICU 3.6
    1.86 +  */
    1.87 +U_STABLE void U_EXPORT2
    1.88 +ucsdet_close(UCharsetDetector *ucsd);
    1.89 +
    1.90 +#if U_SHOW_CPLUSPLUS_API
    1.91 +
    1.92 +U_NAMESPACE_BEGIN
    1.93 +
    1.94 +/**
    1.95 + * \class LocalUCharsetDetectorPointer
    1.96 + * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
    1.97 + * For most methods see the LocalPointerBase base class.
    1.98 + *
    1.99 + * @see LocalPointerBase
   1.100 + * @see LocalPointer
   1.101 + * @stable ICU 4.4
   1.102 + */
   1.103 +U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
   1.104 +
   1.105 +U_NAMESPACE_END
   1.106 +
   1.107 +#endif
   1.108 +
   1.109 +/**
   1.110 +  * Set the input byte data whose charset is to be detected.
   1.111 +  *
   1.112 +  * Ownership of the input text byte array remains with the caller.
   1.113 +  * The input string must not be altered or deleted until the charset
   1.114 +  * detector is either closed or reset to refer to different input text.
   1.115 +  *
   1.116 +  * @param ucsd   the charset detector to be used.
   1.117 +  * @param textIn the input text of unknown encoding.
   1.118 +  * @param len    the length of the input text, or -1 if the text
   1.119 +  *               is NUL terminated.
   1.120 +  * @param status any error conditions are reported back in this variable.
   1.121 +  *
   1.122 +  * @stable ICU 3.6
   1.123 +  */
   1.124 +U_STABLE void U_EXPORT2
   1.125 +ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
   1.126 +
   1.127 +
   1.128 +/** Set the declared encoding for charset detection.
   1.129 + *  The declared encoding of an input text is an encoding obtained
   1.130 + *  by the user from an http header or xml declaration or similar source that
   1.131 + *  can be provided as an additional hint to the charset detector.
   1.132 + *
   1.133 + *  How and whether the declared encoding will be used during the
   1.134 + *  detection process is TBD.
   1.135 + *
   1.136 + * @param ucsd      the charset detector to be used.
   1.137 + * @param encoding  an encoding for the current data obtained from
   1.138 + *                  a header or declaration or other source outside
   1.139 + *                  of the byte data itself.
   1.140 + * @param length    the length of the encoding name, or -1 if the name string
   1.141 + *                  is NUL terminated.
   1.142 + * @param status    any error conditions are reported back in this variable.
   1.143 + *
   1.144 + * @stable ICU 3.6
   1.145 + */
   1.146 +U_STABLE void U_EXPORT2
   1.147 +ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
   1.148 +
   1.149 +
   1.150 +/**
   1.151 + * Return the charset that best matches the supplied input data.
   1.152 + * 
   1.153 + * Note though, that because the detection 
   1.154 + * only looks at the start of the input data,
   1.155 + * there is a possibility that the returned charset will fail to handle
   1.156 + * the full set of input data.
   1.157 + * <p>
   1.158 + * The returned UCharsetMatch object is owned by the UCharsetDetector.
   1.159 + * It will remain valid until the detector input is reset, or until
   1.160 + * the detector is closed.
   1.161 + * <p>
   1.162 + * The function will fail if
   1.163 + *  <ul>
   1.164 + *    <li>no charset appears to match the data.</li>
   1.165 + *    <li>no input text has been provided</li>
   1.166 + *  </ul>
   1.167 + *
   1.168 + * @param ucsd      the charset detector to be used.
   1.169 + * @param status    any error conditions are reported back in this variable.
   1.170 + * @return          a UCharsetMatch  representing the best matching charset,
   1.171 + *                  or NULL if no charset matches the byte data.
   1.172 + *
   1.173 + * @stable ICU 3.6
   1.174 + */
   1.175 +U_STABLE const UCharsetMatch * U_EXPORT2
   1.176 +ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
   1.177 +    
   1.178 +
   1.179 +/**
   1.180 + *  Find all charset matches that appear to be consistent with the input,
   1.181 + *  returning an array of results.  The results are ordered with the
   1.182 + *  best quality match first.
   1.183 + *
   1.184 + *  Because the detection only looks at a limited amount of the
   1.185 + *  input byte data, some of the returned charsets may fail to handle
   1.186 + *  all of the input data.
   1.187 + *  <p>
   1.188 + *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
   1.189 + *  They will remain valid until the detector is closed or modified.
   1.190 + *  
   1.191 + * <p>
   1.192 + * Return an error if 
   1.193 + *  <ul>
   1.194 + *    <li>no charsets appear to match the input data.</li>
   1.195 + *    <li>no input text has been provided</li>
   1.196 + *  </ul>
   1.197 + * 
   1.198 + * @param ucsd          the charset detector to be used.
   1.199 + * @param matchesFound  pointer to a variable that will be set to the
   1.200 + *                      number of charsets identified that are consistent with
   1.201 + *                      the input data.  Output only.
   1.202 + * @param status        any error conditions are reported back in this variable.
   1.203 + * @return              A pointer to an array of pointers to UCharsetMatch objects.
   1.204 + *                      This array, and the UCharsetMatch instances to which it refers,
   1.205 + *                      are owned by the UCharsetDetector, and will remain valid until
   1.206 + *                      the detector is closed or modified.
   1.207 + * @stable ICU 3.6
   1.208 + */
   1.209 +U_STABLE const UCharsetMatch ** U_EXPORT2
   1.210 +ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
   1.211 +
   1.212 +
   1.213 +
   1.214 +/**
   1.215 + *  Get the name of the charset represented by a UCharsetMatch.
   1.216 + *
   1.217 + *  The storage for the returned name string is owned by the
   1.218 + *  UCharsetMatch, and will remain valid while the UCharsetMatch
   1.219 + *  is valid.
   1.220 + *
   1.221 + *  The name returned is suitable for use with the ICU conversion APIs.
   1.222 + *
   1.223 + *  @param ucsm    The charset match object.
   1.224 + *  @param status  Any error conditions are reported back in this variable.
   1.225 + *  @return        The name of the matching charset.
   1.226 + *
   1.227 + *  @stable ICU 3.6
   1.228 + */
   1.229 +U_STABLE const char * U_EXPORT2
   1.230 +ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
   1.231 +
   1.232 +/**
   1.233 + *  Get a confidence number for the quality of the match of the byte
   1.234 + *  data with the charset.  Confidence numbers range from zero to 100,
   1.235 + *  with 100 representing complete confidence and zero representing
   1.236 + *  no confidence.
   1.237 + *
   1.238 + *  The confidence values are somewhat arbitrary.  They define an
   1.239 + *  ordering within the results for any single detection operation
   1.240 + *  but are not generally comparable between the results for different input.
   1.241 + *
   1.242 + *  A confidence value of ten does have a general meaning - it is used
   1.243 + *  for charsets that can represent the input data, but for which there
   1.244 + *  is no other indication that suggests that the charset is the correct one.
   1.245 + *  Pure 7 bit ASCII data, for example, is compatible with a
   1.246 + *  great many charsets, most of which will appear as possible matches
   1.247 + *  with a confidence of 10.
   1.248 + *
   1.249 + *  @param ucsm    The charset match object.
   1.250 + *  @param status  Any error conditions are reported back in this variable.
   1.251 + *  @return        A confidence number for the charset match.
   1.252 + *
   1.253 + *  @stable ICU 3.6
   1.254 + */
   1.255 +U_STABLE int32_t U_EXPORT2
   1.256 +ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
   1.257 +
   1.258 +/**
   1.259 + *  Get the RFC 3066 code for the language of the input data.
   1.260 + *
   1.261 + *  The Charset Detection service is intended primarily for detecting
   1.262 + *  charsets, not language.  For some, but not all, charsets, a language is
   1.263 + *  identified as a byproduct of the detection process, and that is what
   1.264 + *  is returned by this function.
   1.265 + *
   1.266 + *  CAUTION:
   1.267 + *    1.  Language information is not available for input data encoded in
   1.268 + *        all charsets. In particular, no language is identified
   1.269 + *        for UTF-8 input data.
   1.270 + *
   1.271 + *    2.  Closely related languages may sometimes be confused.
   1.272 + *
   1.273 + *  If more accurate language detection is required, a linguistic
   1.274 + *  analysis package should be used.
   1.275 + *
   1.276 + *  The storage for the returned name string is owned by the
   1.277 + *  UCharsetMatch, and will remain valid while the UCharsetMatch
   1.278 + *  is valid.
   1.279 + *
   1.280 + *  @param ucsm    The charset match object.
   1.281 + *  @param status  Any error conditions are reported back in this variable.
   1.282 + *  @return        The RFC 3066 code for the language of the input data, or
   1.283 + *                 an empty string if the language could not be determined.
   1.284 + *
   1.285 + *  @stable ICU 3.6
   1.286 + */
   1.287 +U_STABLE const char * U_EXPORT2
   1.288 +ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
   1.289 +
   1.290 +
   1.291 +/**
   1.292 +  *  Get the entire input text as a UChar string, placing it into
   1.293 +  *  a caller-supplied buffer.  A terminating
   1.294 +  *  NUL character will be appended to the buffer if space is available.
   1.295 +  *
   1.296 +  *  The number of UChars in the output string, not including the terminating
   1.297 +  *  NUL, is returned. 
   1.298 +  *
   1.299 +  *  If the supplied buffer is smaller than required to hold the output,
   1.300 +  *  the contents of the buffer are undefined.  The full output string length
   1.301 +  *  (in UChars) is returned as always, and can be used to allocate a buffer
   1.302 +  *  of the correct size.
   1.303 +  *
   1.304 +  *
   1.305 +  * @param ucsm    The charset match object.
   1.306 +  * @param buf     A UChar buffer to be filled with the converted text data.
   1.307 +  * @param cap     The capacity of the buffer in UChars.
   1.308 +  * @param status  Any error conditions are reported back in this variable.
   1.309 +  * @return        The number of UChars in the output string.
   1.310 +  *
   1.311 +  * @stable ICU 3.6
   1.312 +  */
   1.313 +U_STABLE  int32_t U_EXPORT2
   1.314 +ucsdet_getUChars(const UCharsetMatch *ucsm,
   1.315 +                 UChar *buf, int32_t cap, UErrorCode *status);
   1.316 +
   1.317 +
   1.318 +
   1.319 +/**
   1.320 +  *  Get an iterator over the set of all detectable charsets - 
   1.321 +  *  over the charsets that are known to the charset detection
   1.322 +  *  service.
   1.323 +  *
   1.324 +  *  The returned UEnumeration provides access to the names of
   1.325 +  *  the charsets.
   1.326 +  *
   1.327 +  *  <p>
   1.328 +  *  The state of the Charset detector that is passed in does not
   1.329 +  *  affect the result of this function, but requiring a valid, open
   1.330 +  *  charset detector as a parameter ensures that the charset detection
   1.331 +  *  service has been safely initialized and that the required detection
   1.332 +  *  data is available.
   1.333 +  *
   1.334 +  *  <p>
   1.335 +  *  <b>Note:</b> Multiple different charset encodings in the same family may use
   1.336 +  *  a single shared name in this implementation. For example, the enumeration returned
   1.337 +  *  by this function includes "ISO-8859-1" (ISO Latin 1), but not "windows-1252"
   1.338 +  *  (Windows Latin 1). However, the actual detection result could be "windows-1252"
   1.339 +  *  when the input data matches Latin 1 code points with any points only available
   1.340 +  *  in "windows-1252".
   1.341 +  *
   1.342 +  *  @param ucsd a Charset detector.
   1.343 +  *  @param status  Any error conditions are reported back in this variable.
   1.344 +  *  @return an iterator providing access to the detectable charset names.
   1.345 +  *  @stable ICU 3.6
   1.346 +  */
   1.347 +U_STABLE  UEnumeration * U_EXPORT2
   1.348 +ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
   1.349 +
   1.350 +/**
   1.351 +  *  Test whether input filtering is enabled for this charset detector.
   1.352 +  *  Input filtering removes text that appears to be HTML or xml
   1.353 +  *  markup from the input before applying the code page detection
   1.354 +  *  heuristics.
   1.355 +  *
   1.356 +  *  @param ucsd  The charset detector to check.
   1.357 +  *  @return TRUE if filtering is enabled.
   1.358 +  *  @stable ICU 3.6
   1.359 +  */
   1.360 +
   1.361 +U_STABLE  UBool U_EXPORT2
   1.362 +ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
   1.363 +
   1.364 +
   1.365 +/**
   1.366 + * Enable filtering of input text. If filtering is enabled,
   1.367 + * text within angle brackets ("<" and ">") will be removed
   1.368 + * before detection, which will remove most HTML or xml markup.
   1.369 + *
   1.370 + * @param ucsd   the charset detector to be modified.
   1.371 + * @param filter <code>TRUE</code> to enable input text filtering.
   1.372 + * @return The previous setting.
   1.373 + *
   1.374 + * @stable ICU 3.6
   1.375 + */
   1.376 +U_STABLE  UBool U_EXPORT2
   1.377 +ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
   1.378 +
   1.379 +#ifndef U_HIDE_INTERNAL_API
   1.380 +/**
   1.381 +  *  Get an iterator over the set of detectable charsets -
   1.382 +  *  over the charsets that are enabled by the specified charset detector.
   1.383 +  *
   1.384 +  *  The returned UEnumeration provides access to the names of
   1.385 +  *  the charsets.
   1.386 +  *
   1.387 +  *  @param ucsd a Charset detector.
   1.388 +  *  @param status  Any error conditions are reported back in this variable.
   1.389 +  *  @return an iterator providing access to the detectable charset names by
   1.390 +  *  the specified charset detector.
   1.391 +  *  @internal
   1.392 +  */
   1.393 +U_INTERNAL UEnumeration * U_EXPORT2
   1.394 +ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
   1.395 +
   1.396 +/**
   1.397 +  * Enable or disable an individual charset encoding.
   1.398 +  * The name of the charset encoding must be included in the names returned by
   1.399 +  * ucsdet_getAllDetectableCharsets().
   1.400 +  *
   1.401 +  * @param ucsd a Charset detector.
   1.402 +  * @param encoding the name of the charset encoding.
   1.403 +  * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
   1.404 +  *   charset encoding.
   1.405 +  * @param status receives the return status. When the name of charset encoding
   1.406 +  *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
   1.407 +  * @internal
   1.408 +  */
   1.409 +U_INTERNAL void U_EXPORT2
   1.410 +ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
   1.411 +#endif  /* U_HIDE_INTERNAL_API */
   1.412 +
   1.413 +#endif
   1.414 +#endif   /* __UCSDET_H */
   1.415 +
   1.416 +

mercurial