The Tor Browser: intl/icu/source/i18n/unicode/ucsdet.h@fc2d59ddac77

     1 /*

     2  **********************************************************************

     3  *   Copyright (C) 2005-2013, International Business Machines

     4  *   Corporation and others.  All Rights Reserved.

     5  **********************************************************************

     6  *   file name:  ucsdet.h

     7  *   encoding:   US-ASCII

     8  *   indentation:4

     9  *

    10  *   created on: 2005Aug04

    11  *   created by: Andy Heninger

    12  *

    13  *   ICU Character Set Detection, API for C

    14  *

    15  *   Draft version 18 Oct 2005

    16  *

    17  */

    19 #ifndef __UCSDET_H

    20 #define __UCSDET_H

    22 #include "unicode/utypes.h"

    24 #if !UCONFIG_NO_CONVERSION

    26 #include "unicode/localpointer.h"

    27 #include "unicode/uenum.h"

    29 /**

    30  * \file

    31  * \brief C API: Charset Detection API

    32  *

    33  * This API provides a facility for detecting the

    34  * charset or encoding of character data in an unknown text format.

    35  * The input data can be from an array of bytes.

    36  * <p>

    37  * Character set detection is at best an imprecise operation.  The detection

    38  * process will attempt to identify the charset that best matches the characteristics

    39  * of the byte data, but the process is partly statistical in nature, and

    40  * the results can not be guaranteed to always be correct.

    41  * <p>

    42  * For best accuracy in charset detection, the input data should be primarily

    43  * in a single language, and a minimum of a few hundred bytes worth of plain text

    44  * in the language are needed.  The detection process will attempt to

    45  * ignore html or xml style markup that could otherwise obscure the content.

    46  */

    49 struct UCharsetDetector;

    50 /**

    51   * Structure representing a charset detector

    52   * @stable ICU 3.6

    53   */

    54 typedef struct UCharsetDetector UCharsetDetector;

    56 struct UCharsetMatch;

    57 /**

    58   *  Opaque structure representing a match that was identified

    59   *  from a charset detection operation.

    60   *  @stable ICU 3.6

    61   */

    62 typedef struct UCharsetMatch UCharsetMatch;

    64 /**

    65   *  Open a charset detector.

    66   *

    67   *  @param status Any error conditions occurring during the open

    68   *                operation are reported back in this variable.

    69   *  @return the newly opened charset detector.

    70   *  @stable ICU 3.6

    71   */

    72 U_STABLE UCharsetDetector * U_EXPORT2

    73 ucsdet_open(UErrorCode   *status);

    75 /**

    76   * Close a charset detector.  All storage and any other resources

    77   *   owned by this charset detector will be released.  Failure to

    78   *   close a charset detector when finished with it can result in

    79   *   memory leaks in the application.

    80   *

    81   *  @param ucsd  The charset detector to be closed.

    82   *  @stable ICU 3.6

    83   */

    84 U_STABLE void U_EXPORT2

    85 ucsdet_close(UCharsetDetector *ucsd);

    87 #if U_SHOW_CPLUSPLUS_API

    89 U_NAMESPACE_BEGIN

    91 /**

    92  * \class LocalUCharsetDetectorPointer

    93  * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().

    94  * For most methods see the LocalPointerBase base class.

    95  *

    96  * @see LocalPointerBase

    97  * @see LocalPointer

    98  * @stable ICU 4.4

    99  */

   100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);

   102 U_NAMESPACE_END

   104 #endif

   106 /**

   107   * Set the input byte data whose charset is to be detected.

   108   *

   109   * Ownership of the input  text byte array remains with the caller.

   110   * The input string must not be altered or deleted until the charset

   111   * detector is either closed or reset to refer to different input text.

   112   *

   113   * @param ucsd   the charset detector to be used.

   114   * @param textIn the input text of unknown encoding.

   115   * @param len    the length of the input text, or -1 if the text

   116   *               is NUL terminated.

   117   * @param status any error conditions are reported back in this variable.

   118   *

   119   * @stable ICU 3.6

   120   */

   121 U_STABLE void U_EXPORT2

   122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);

   125 /** Set the declared encoding for charset detection.

   126  *  The declared encoding of an input text is an encoding obtained

   127  *  by the user from an http header or xml declaration or similar source that

   128  *  can be provided as an additional hint to the charset detector.

   129  *

   130  *  How and whether the declared encoding will be used during the

   131  *  detection process is TBD.

   132  *

   133  * @param ucsd      the charset detector to be used.

   134  * @param encoding  an encoding for the current data obtained from

   135  *                  a header or declaration or other source outside

   136  *                  of the byte data itself.

   137  * @param length    the length of the encoding name, or -1 if the name string

   138  *                  is NUL terminated.

   139  * @param status    any error conditions are reported back in this variable.

   140  *

   141  * @stable ICU 3.6

   142  */

   143 U_STABLE void U_EXPORT2

   144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);

   147 /**

   148  * Return the charset that best matches the supplied input data.

   149  *

   150  * Note though, that because the detection

   151  * only looks at the start of the input data,

   152  * there is a possibility that the returned charset will fail to handle

   153  * the full set of input data.

   154  * <p>

   155  * The returned UCharsetMatch object is owned by the UCharsetDetector.

   156  * It will remain valid until the detector input is reset, or until

   157  * the detector is closed.

   158  * <p>

   159  * The function will fail if

   160  *  <ul>

   161  *    <li>no charset appears to match the data.</li>

   162  *    <li>no input text has been provided</li>

   163  *  </ul>

   164  *

   165  * @param ucsd      the charset detector to be used.

   166  * @param status    any error conditions are reported back in this variable.

   167  * @return          a UCharsetMatch  representing the best matching charset,

   168  *                  or NULL if no charset matches the byte data.

   169  *

   170  * @stable ICU 3.6

   171  */

   172 U_STABLE const UCharsetMatch * U_EXPORT2

   173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);

   176 /**

   177  *  Find all charset matches that appear to be consistent with the input,

   178  *  returning an array of results.  The results are ordered with the

   179  *  best quality match first.

   180  *

   181  *  Because the detection only looks at a limited amount of the

   182  *  input byte data, some of the returned charsets may fail to handle

   183  *  all of the input data.

   184  *  <p>

   185  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.

   186  *  They will remain valid until the detector is closed or modified.

   187  *

   188  * <p>

   189  * Return an error if

   190  *  <ul>

   191  *    <li>no charsets appear to match the input data.</li>

   192  *    <li>no input text has been provided</li>

   193  *  </ul>

   194  *

   195  * @param ucsd          the charset detector to be used.

   196  * @param matchesFound  pointer to a variable that will be set to the

   197  *                      number of charsets identified that are consistent with

   198  *                      the input data.  Output only.

   199  * @param status        any error conditions are reported back in this variable.

   200  * @return              A pointer to an array of pointers to UCharsetMatch objects.

   201  *                      This array, and the UCharsetMatch instances to which it refers,

   202  *                      are owned by the UCharsetDetector, and will remain valid until

   203  *                      the detector is closed or modified.

   204  * @stable ICU 3.6

   205  */

   206 U_STABLE const UCharsetMatch ** U_EXPORT2

   207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);

   211 /**

   212  *  Get the name of the charset represented by a UCharsetMatch.

   213  *

   214  *  The storage for the returned name string is owned by the

   215  *  UCharsetMatch, and will remain valid while the UCharsetMatch

   216  *  is valid.

   217  *

   218  *  The name returned is suitable for use with the ICU conversion APIs.

   219  *

   220  *  @param ucsm    The charset match object.

   221  *  @param status  Any error conditions are reported back in this variable.

   222  *  @return        The name of the matching charset.

   223  *

   224  *  @stable ICU 3.6

   225  */

   226 U_STABLE const char * U_EXPORT2

   227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);

   229 /**

   230  *  Get a confidence number for the quality of the match of the byte

   231  *  data with the charset.  Confidence numbers range from zero to 100,

   232  *  with 100 representing complete confidence and zero representing

   233  *  no confidence.

   234  *

   235  *  The confidence values are somewhat arbitrary.  They define

   236  *  an ordering within the results for any single detection operation

   237  *  but are not generally comparable between the results for different input.

   238  *

   239  *  A confidence value of ten does have a general meaning - it is used

   240  *  for charsets that can represent the input data, but for which there

   241  *  is no other indication that suggests that the charset is the correct one.

   242  *  Pure 7 bit ASCII data, for example, is compatible with a

   243  *  great many charsets, most of which will appear as possible matches

   244  *  with a confidence of 10.

   245  *

   246  *  @param ucsm    The charset match object.

   247  *  @param status  Any error conditions are reported back in this variable.

   248  *  @return        A confidence number for the charset match.

   249  *

   250  *  @stable ICU 3.6

   251  */

   252 U_STABLE int32_t U_EXPORT2

   253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);

   255 /**

   256  *  Get the RFC 3066 code for the language of the input data.

   257  *

   258  *  The Charset Detection service is intended primarily for detecting

   259  *  charsets, not language.  For some, but not all, charsets, a language is

   260  *  identified as a byproduct of the detection process, and that is what

   261  *  is returned by this function.

   262  *

   263  *  CAUTION:

   264  *    1.  Language information is not available for input data encoded in

   265  *        all charsets. In particular, no language is identified

   266  *        for UTF-8 input data.

   267  *

   268  *    2.  Closely related languages may sometimes be confused.

   269  *

   270  *  If more accurate language detection is required, a linguistic

   271  *  analysis package should be used.

   272  *

   273  *  The storage for the returned name string is owned by the

   274  *  UCharsetMatch, and will remain valid while the UCharsetMatch

   275  *  is valid.

   276  *

   277  *  @param ucsm    The charset match object.

   278  *  @param status  Any error conditions are reported back in this variable.

   279  *  @return        The RFC 3066 code for the language of the input data, or

   280  *                 an empty string if the language could not be determined.

   281  *

   282  *  @stable ICU 3.6

   283  */

   284 U_STABLE const char * U_EXPORT2

   285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);

   288 /**

   289   *  Get the entire input text as a UChar string, placing it into

   290   *  a caller-supplied buffer.  A terminating

   291   *  NUL character will be appended to the buffer if space is available.

   292   *

   293   *  The number of UChars in the output string, not including the terminating

   294   *  NUL, is returned.

   295   *

   296   *  If the supplied buffer is smaller than required to hold the output,

   297   *  the contents of the buffer are undefined.  The full output string length

   298   *  (in UChars) is returned as always, and can be used to allocate a buffer

   299   *  of the correct size.

   300   *

   301   *

   302   * @param ucsm    The charset match object.

   303   * @param buf     A UChar buffer to be filled with the converted text data.

   304   * @param cap     The capacity of the buffer in UChars.

   305   * @param status  Any error conditions are reported back in this variable.

   306   * @return        The number of UChars in the output string.

   307   *

   308   * @stable ICU 3.6

   309   */

   310 U_STABLE  int32_t U_EXPORT2

   311 ucsdet_getUChars(const UCharsetMatch *ucsm,

   312                  UChar *buf, int32_t cap, UErrorCode *status);

   316 /**

   317   *  Get an iterator over the set of all detectable charsets -

   318   *  over the charsets that are known to the charset detection

   319   *  service.

   320   *

   321   *  The returned UEnumeration provides access to the names of

   322   *  the charsets.

   323   *

   324   *  <p>

   325   *  The state of the Charset detector that is passed in does not

   326   *  affect the result of this function, but requiring a valid, open

   327   *  charset detector as a parameter ensures that the charset detection

   328   *  service has been safely initialized and that the required detection

   329   *  data is available.

   330   *

   331   *  <p>

   332   *  <b>Note:</b> Multiple different charset encodings in the same family may use

   333   *  a single shared name in this implementation. For example, this method returns

   334   *  an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"

   335   *  (Windows Latin 1). However, actual detection result could be "windows-1252"

   336   *  when the input data matches Latin 1 code points with any points only available

   337   *  in "windows-1252".

   338   *

   339   *  @param ucsd a Charset detector.

   340   *  @param status  Any error conditions are reported back in this variable.

   341   *  @return an iterator providing access to the detectable charset names.

   342   *  @stable ICU 3.6

   343   */

   344 U_STABLE  UEnumeration * U_EXPORT2

   345 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);

   347 /**

   348   *  Test whether input filtering is enabled for this charset detector.

   349   *  Input filtering removes text that appears to be HTML or xml

   350   *  markup from the input before applying the code page detection

   351   *  heuristics.

   352   *

   353   *  @param ucsd  The charset detector to check.

   354   *  @return TRUE if filtering is enabled.

   355   *  @stable ICU 3.6

   356   */

   358 U_STABLE  UBool U_EXPORT2

   359 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);

   362 /**

   363  * Enable filtering of input text. If filtering is enabled,

   364  * text within angle brackets ("<" and ">") will be removed

   365  * before detection, which will remove most HTML or xml markup.

   366  *

   367  * @param ucsd   the charset detector to be modified.

   368  * @param filter <code>TRUE</code> to enable input text filtering.

   369  * @return The previous setting.

   370  *

   371  * @stable ICU 3.6

   372  */

   373 U_STABLE  UBool U_EXPORT2

   374 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);

   376 #ifndef U_HIDE_INTERNAL_API

   377 /**

   378   *  Get an iterator over the set of detectable charsets -

   379   *  over the charsets that are enabled by the specified charset detector.

   380   *

   381   *  The returned UEnumeration provides access to the names of

   382   *  the charsets.

   383   *

   384   *  @param ucsd a Charset detector.

   385   *  @param status  Any error conditions are reported back in this variable.

   386   *  @return an iterator providing access to the detectable charset names by

   387   *  the specified charset detector.

   388   *  @internal

   389   */

   390 U_INTERNAL UEnumeration * U_EXPORT2

   391 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);

   393 /**

   394   * Enable or disable individual charset encoding.

   395   * The name of the charset encoding must be one of the names returned by

   396   * ucsdet_getAllDetectableCharsets().

   397   *

   398   * @param ucsd a Charset detector.

   399   * @param encoding the name of the charset encoding.

   400   * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the

   401   *   charset encoding.

   402   * @param status receives the return status. When the name of charset encoding

   403   *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.

   404   * @internal

   405   */

   406 U_INTERNAL void U_EXPORT2

   407 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);

   408 #endif  /* U_HIDE_INTERNAL_API */

   410 #endif

   411 #endif   /* __UCSDET_H */

The Tor Browser / file revision

intl/icu/source/i18n/unicode/ucsdet.h@fc2d59ddac77

intl/icu/source/i18n/unicode/ucsdet.h