intl/icu/source/i18n/unicode/ucsdet.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2  **********************************************************************
     3  *   Copyright (C) 2005-2013, International Business Machines
     4  *   Corporation and others.  All Rights Reserved.
     5  **********************************************************************
     6  *   file name:  ucsdet.h
     7  *   encoding:   US-ASCII
     8  *   indentation:4
     9  *
    10  *   created on: 2005Aug04
    11  *   created by: Andy Heninger
    12  *
    13  *   ICU Character Set Detection, API for C
    14  *
    15  *   Draft version 18 Oct 2005
    16  *
    17  */
    19 #ifndef __UCSDET_H
    20 #define __UCSDET_H
    22 #include "unicode/utypes.h"
    24 #if !UCONFIG_NO_CONVERSION
    26 #include "unicode/localpointer.h"
    27 #include "unicode/uenum.h"
    29 /**
    30  * \file 
    31  * \brief C API: Charset Detection API
    32  *
    33  * This API provides a facility for detecting the
    34  * charset or encoding of character data in an unknown text format.
    35  * The input data can be from an array of bytes.
    36  * <p>
    37  * Character set detection is at best an imprecise operation.  The detection
    38  * process will attempt to identify the charset that best matches the characteristics
    39  * of the byte data, but the process is partly statistical in nature, and
    40  * the results can not be guaranteed to always be correct.
    41  * <p>
    42  * For best accuracy in charset detection, the input data should be primarily
    43  * in a single language, and a minimum of a few hundred bytes worth of plain text
    44  * in the language are needed.  The detection process will attempt to
    45  * ignore html or xml style markup that could otherwise obscure the content.
    46  */
    49 struct UCharsetDetector;
    50 /**
    51   * Structure representing a charset detector
    52   * @stable ICU 3.6
    53   */
    54 typedef struct UCharsetDetector UCharsetDetector;
    56 struct UCharsetMatch;
    57 /**
    58   *  Opaque structure representing a match that was identified
    59   *  from a charset detection operation.
    60   *  @stable ICU 3.6
    61   */
    62 typedef struct UCharsetMatch UCharsetMatch;
    64 /**
    65   *  Open a charset detector.
    66   *
    67   *  @param status Any error conditions occurring during the open
    68   *                operation are reported back in this variable.
    69   *  @return the newly opened charset detector.
    70   *  @stable ICU 3.6
    71   */
    72 U_STABLE UCharsetDetector * U_EXPORT2
    73 ucsdet_open(UErrorCode   *status);
    75 /**
    76   * Close a charset detector.  All storage and any other resources
    77   *   owned by this charset detector will be released.  Failure to
    78   *   close a charset detector when finished with it can result in
    79   *   memory leaks in the application.
    80   *
    81   *  @param ucsd  The charset detector to be closed.
    82   *  @stable ICU 3.6
    83   */
    84 U_STABLE void U_EXPORT2
    85 ucsdet_close(UCharsetDetector *ucsd);
    87 #if U_SHOW_CPLUSPLUS_API
    89 U_NAMESPACE_BEGIN
    91 /**
    92  * \class LocalUCharsetDetectorPointer
    93  * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
    94  * For most methods see the LocalPointerBase base class.
    95  *
    96  * @see LocalPointerBase
    97  * @see LocalPointer
    98  * @stable ICU 4.4
    99  */
   100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
   102 U_NAMESPACE_END
   104 #endif
   106 /**
   107   * Set the input byte data whose charset is to be detected.
   108   *
   109   * Ownership of the input  text byte array remains with the caller.
   110   * The input string must not be altered or deleted until the charset
   111   * detector is either closed or reset to refer to different input text.
   112   *
   113   * @param ucsd   the charset detector to be used.
   114   * @param textIn the input text of unknown encoding.
   115   * @param len    the length of the input text, or -1 if the text
   116   *               is NUL terminated.
   117   * @param status any error conditions are reported back in this variable.
   118   *
   119   * @stable ICU 3.6
   120   */
   121 U_STABLE void U_EXPORT2
   122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
   125 /** Set the declared encoding for charset detection.
   126  *  The declared encoding of an input text is an encoding obtained
   127  *  by the user from an http header or xml declaration or similar source that
   128  *  can be provided as an additional hint to the charset detector.
   129  *
   130  *  How and whether the declared encoding will be used during the
   131  *  detection process is TBD.
   132  *
   133  * @param ucsd      the charset detector to be used.
   134  * @param encoding  an encoding for the current data obtained from
   135  *                  a header or declaration or other source outside
   136  *                  of the byte data itself.
   137  * @param length    the length of the encoding name, or -1 if the name string
   138  *                  is NUL terminated.
   139  * @param status    any error conditions are reported back in this variable.
   140  *
   141  * @stable ICU 3.6
   142  */
   143 U_STABLE void U_EXPORT2
   144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
   147 /**
   148  * Return the charset that best matches the supplied input data.
   149  * 
   150  * Note though, that because the detection 
   151  * only looks at the start of the input data,
   152  * there is a possibility that the returned charset will fail to handle
   153  * the full set of input data.
   154  * <p>
   155  * The returned UCharsetMatch object is owned by the UCharsetDetector.
   156  * It will remain valid until the detector input is reset, or until
   157  * the detector is closed.
   158  * <p>
   159  * The function will fail if
   160  *  <ul>
   161  *    <li>no charset appears to match the data.</li>
   162  *    <li>no input text has been provided</li>
   163  *  </ul>
   164  *
   165  * @param ucsd      the charset detector to be used.
   166  * @param status    any error conditions are reported back in this variable.
   167  * @return          a UCharsetMatch  representing the best matching charset,
   168  *                  or NULL if no charset matches the byte data.
   169  *
   170  * @stable ICU 3.6
   171  */
   172 U_STABLE const UCharsetMatch * U_EXPORT2
   173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
   176 /**
   177  *  Find all charset matches that appear to be consistent with the input,
   178  *  returning an array of results.  The results are ordered with the
   179  *  best quality match first.
   180  *
   181  *  Because the detection only looks at a limited amount of the
   182  *  input byte data, some of the returned charsets may fail to handle
   183  *  all of the input data.
   184  *  <p>
   185  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
   186  *  They will remain valid until the detector is closed or modified.
   187  *  
   188  * <p>
   189  * Return an error if 
   190  *  <ul>
   191  *    <li>no charsets appear to match the input data.</li>
   192  *    <li>no input text has been provided</li>
   193  *  </ul>
   194  * 
   195  * @param ucsd          the charset detector to be used.
   196  * @param matchesFound  pointer to a variable that will be set to the
   197  *                      number of charsets identified that are consistent with
   198  *                      the input data.  Output only.
   199  * @param status        any error conditions are reported back in this variable.
   200  * @return              A pointer to an array of pointers to UCharsetMatch objects.
   201  *                      This array, and the UCharsetMatch instances to which it refers,
   202  *                      are owned by the UCharsetDetector, and will remain valid until
   203  *                      the detector is closed or modified.
   204  * @stable ICU 3.6
   205  */
   206 U_STABLE const UCharsetMatch ** U_EXPORT2
   207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
   211 /**
   212  *  Get the name of the charset represented by a UCharsetMatch.
   213  *
   214  *  The storage for the returned name string is owned by the
   215  *  UCharsetMatch, and will remain valid while the UCharsetMatch
   216  *  is valid.
   217  *
   218  *  The name returned is suitable for use with the ICU conversion APIs.
   219  *
   220  *  @param ucsm    The charset match object.
   221  *  @param status  Any error conditions are reported back in this variable.
   222  *  @return        The name of the matching charset.
   223  *
   224  *  @stable ICU 3.6
   225  */
   226 U_STABLE const char * U_EXPORT2
   227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
   229 /**
   230  *  Get a confidence number for the quality of the match of the byte
   231  *  data with the charset.  Confidence numbers range from zero to 100,
   232  *  with 100 representing complete confidence and zero representing
   233  *  no confidence.
   234  *
   235  *  The confidence values are somewhat arbitrary.  They define an
   236  *  ordering within the results for any single detection operation
   237  *  but are not generally comparable between the results for different input.
   238  *
   239  *  A confidence value of ten does have a general meaning - it is used
   240  *  for charsets that can represent the input data, but for which there
   241  *  is no other indication that suggests that the charset is the correct one.
   242  *  Pure 7 bit ASCII data, for example, is compatible with a
   243  *  great many charsets, most of which will appear as possible matches
   244  *  with a confidence of 10.
   245  *
   246  *  @param ucsm    The charset match object.
   247  *  @param status  Any error conditions are reported back in this variable.
   248  *  @return        A confidence number for the charset match.
   249  *
   250  *  @stable ICU 3.6
   251  */
   252 U_STABLE int32_t U_EXPORT2
   253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
   255 /**
   256  *  Get the RFC 3066 code for the language of the input data.
   257  *
   258  *  The Charset Detection service is intended primarily for detecting
   259  *  charsets, not language.  For some, but not all, charsets, a language is
   260  *  identified as a byproduct of the detection process, and that is what
   261  *  is returned by this function.
   262  *
   263  *  CAUTION:
   264  *    1.  Language information is not available for input data encoded in
   265  *        all charsets. In particular, no language is identified
   266  *        for UTF-8 input data.
   267  *
   268  *    2.  Closely related languages may sometimes be confused.
   269  *
   270  *  If more accurate language detection is required, a linguistic
   271  *  analysis package should be used.
   272  *
   273  *  The storage for the returned name string is owned by the
   274  *  UCharsetMatch, and will remain valid while the UCharsetMatch
   275  *  is valid.
   276  *
   277  *  @param ucsm    The charset match object.
   278  *  @param status  Any error conditions are reported back in this variable.
   279  *  @return        The RFC 3066 code for the language of the input data, or
   280  *                 an empty string if the language could not be determined.
   281  *
   282  *  @stable ICU 3.6
   283  */
   284 U_STABLE const char * U_EXPORT2
   285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
   288 /**
   289   *  Get the entire input text as a UChar string, placing it into
   290   *  a caller-supplied buffer.  A terminating
   291   *  NUL character will be appended to the buffer if space is available.
   292   *
   293   *  The number of UChars in the output string, not including the terminating
   294   *  NUL, is returned. 
   295   *
   296   *  If the supplied buffer is smaller than required to hold the output,
   297   *  the contents of the buffer are undefined.  The full output string length
   298   *  (in UChars) is returned as always, and can be used to allocate a buffer
   299   *  of the correct size.
   300   *
   301   *
   302   * @param ucsm    The charset match object.
   303   * @param buf     A UChar buffer to be filled with the converted text data.
   304   * @param cap     The capacity of the buffer in UChars.
   305   * @param status  Any error conditions are reported back in this variable.
   306   * @return        The number of UChars in the output string.
   307   *
   308   * @stable ICU 3.6
   309   */
   310 U_STABLE  int32_t U_EXPORT2
   311 ucsdet_getUChars(const UCharsetMatch *ucsm,
   312                  UChar *buf, int32_t cap, UErrorCode *status);
   316 /**
   317   *  Get an iterator over the set of all detectable charsets - 
   318   *  over the charsets that are known to the charset detection
   319   *  service.
   320   *
   321   *  The returned UEnumeration provides access to the names of
   322   *  the charsets.
   323   *
   324   *  <p>
   325   *  The state of the Charset detector that is passed in does not
   326   *  affect the result of this function, but requiring a valid, open
   327   *  charset detector as a parameter ensures that the charset detection
   328   *  service has been safely initialized and that the required detection
   329   *  data is available.
   330   *
   331   *  <p>
   332   *  <b>Note:</b> Multiple different charset encodings in the same family may use
   333   *  a single shared name in this implementation. For example, this method returns
   334   *  an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
   335   *  (Windows Latin 1). However, the actual detection result could be "windows-1252"
   336   *  when the input data matches Latin 1 code points together with code points that
   337   *  are only available in "windows-1252".
   338   *
   339   *  @param ucsd a Charset detector.
   340   *  @param status  Any error conditions are reported back in this variable.
   341   *  @return an iterator providing access to the detectable charset names.
   342   *  @stable ICU 3.6
   343   */
   344 U_STABLE  UEnumeration * U_EXPORT2
   345 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
   347 /**
   348   *  Test whether input filtering is enabled for this charset detector.
   349   *  Input filtering removes text that appears to be HTML or xml
   350   *  markup from the input before applying the code page detection
   351   *  heuristics.
   352   *
   353   *  @param ucsd  The charset detector to check.
   354   *  @return TRUE if filtering is enabled.
   355   *  @stable ICU 3.6
   356   */
   358 U_STABLE  UBool U_EXPORT2
   359 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
   362 /**
   363  * Enable filtering of input text. If filtering is enabled,
   364  * text within angle brackets ("<" and ">") will be removed
   365  * before detection, which will remove most HTML or xml markup.
   366  *
   367  * @param ucsd   the charset detector to be modified.
   368  * @param filter <code>true</code> to enable input text filtering.
   369  * @return The previous setting.
   370  *
   371  * @stable ICU 3.6
   372  */
   373 U_STABLE  UBool U_EXPORT2
   374 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
   376 #ifndef U_HIDE_INTERNAL_API
   377 /**
   378   *  Get an iterator over the set of detectable charsets -
   379   *  over the charsets that are enabled by the specified charset detector.
   380   *
   381   *  The returned UEnumeration provides access to the names of
   382   *  the charsets.
   383   *
   384   *  @param ucsd a Charset detector.
   385   *  @param status  Any error conditions are reported back in this variable.
   386   *  @return an iterator providing access to the detectable charset names by
   387   *  the specified charset detector.
   388   *  @internal
   389   */
   390 U_INTERNAL UEnumeration * U_EXPORT2
   391 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
   393 /**
   394   * Enable or disable individual charset encoding.
   395   * A name of charset encoding must be included in the names returned by
   396   * {@link #ucsdet_getAllDetectableCharsets()}.
   397   *
   398   * @param ucsd a Charset detector.
   399   * @param encoding the name of the charset encoding.
   400   * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
   401   *   charset encoding.
   402   * @param status receives the return status. When the name of charset encoding
   403   *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
   404   * @internal
   405   */
   406 U_INTERNAL void U_EXPORT2
   407 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
   408 #endif  /* U_HIDE_INTERNAL_API */
   410 #endif
   411 #endif   /* __UCSDET_H */

mercurial