diff -r 000000000000 -r 6474c204b198 intl/icu/source/i18n/unicode/ucsdet.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/intl/icu/source/i18n/unicode/ucsdet.h Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,413 @@ +/* + ********************************************************************** + * Copyright (C) 2005-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + * file name: ucsdet.h + * encoding: US-ASCII + * indentation:4 + * + * created on: 2005Aug04 + * created by: Andy Heninger + * + * ICU Character Set Detection, API for C + * + * Draft version 18 Oct 2005 + * + */ + +#ifndef __UCSDET_H +#define __UCSDET_H + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_CONVERSION + +#include "unicode/localpointer.h" +#include "unicode/uenum.h" + +/** + * \file + * \brief C API: Charset Detection API + * + * This API provides a facility for detecting the + * charset or encoding of character data in an unknown text format. + * The input data can be from an array of bytes. + *
+ * Character set detection is at best an imprecise operation. The detection + * process will attempt to identify the charset that best matches the characteristics + * of the byte data, but the process is partly statistical in nature, and + * the results cannot be guaranteed to always be correct. + *
+ * For best accuracy in charset detection, the input data should be primarily + * in a single language, and a minimum of a few hundred bytes worth of plain text + * in the language are needed. The detection process will attempt to + * ignore html or xml style markup that could otherwise obscure the content. + */ + + +struct UCharsetDetector; +/** + * Structure representing a charset detector + * @stable ICU 3.6 + */ +typedef struct UCharsetDetector UCharsetDetector; + +struct UCharsetMatch; +/** + * Opaque structure representing a match that was identified + * from a charset detection operation. + * @stable ICU 3.6 + */ +typedef struct UCharsetMatch UCharsetMatch; + +/** + * Open a charset detector. + * + * @param status Any error conditions occurring during the open + * operation are reported back in this variable. + * @return the newly opened charset detector. + * @stable ICU 3.6 + */ +U_STABLE UCharsetDetector * U_EXPORT2 +ucsdet_open(UErrorCode *status); + +/** + * Close a charset detector. All storage and any other resources + * owned by this charset detector will be released. Failure to + * close a charset detector when finished with it can result in + * memory leaks in the application. + * + * @param ucsd The charset detector to be closed. + * @stable ICU 3.6 + */ +U_STABLE void U_EXPORT2 +ucsdet_close(UCharsetDetector *ucsd); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUCharsetDetectorPointer + * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); + +U_NAMESPACE_END + +#endif + +/** + * Set the input byte data whose charset is to be detected. + * + * Ownership of the input text byte array remains with the caller. 
+ * The input string must not be altered or deleted until the charset + * detector is either closed or reset to refer to different input text. + * + * @param ucsd the charset detector to be used. + * @param textIn the input text of unknown encoding. + * @param len the length of the input text, or -1 if the text + * is NUL terminated. + * @param status any error conditions are reported back in this variable. + * + * @stable ICU 3.6 + */ +U_STABLE void U_EXPORT2 +ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); + + +/** Set the declared encoding for charset detection. + * The declared encoding of an input text is an encoding obtained + * by the user from an http header or xml declaration or similar source that + * can be provided as an additional hint to the charset detector. + * + * How and whether the declared encoding will be used during the + * detection process is TBD. + * + * @param ucsd the charset detector to be used. + * @param encoding an encoding for the current data obtained from + * a header or declaration or other source outside + * of the byte data itself. + * @param length the length of the encoding name, or -1 if the name string + * is NUL terminated. + * @param status any error conditions are reported back in this variable. + * + * @stable ICU 3.6 + */ +U_STABLE void U_EXPORT2 +ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); + + +/** + * Return the charset that best matches the supplied input data. + * + * Note though, that because the detection + * only looks at the start of the input data, + * there is a possibility that the returned charset will fail to handle + * the full set of input data. + *
+ * The returned UCharsetMatch object is owned by the UCharsetDetector. + * It will remain valid until the detector input is reset, or until + * the detector is closed. + *
+ * The function will fail if + *
+ * The returned UCharsetMatch objects are owned by the UCharsetDetector. + * They will remain valid until the detector is closed or modified + * + *
+ * Return an error if + *
+ * The state of the Charset detector that is passed in does not + * affect the result of this function, but requiring a valid, open + * charset detector as a parameter ensures that the charset detection + * service has been safely initialized and that the required detection + * data is available. + * + *
+ * Note: Multiple different charset encodings in the same family may use
+ * a single shared name in this implementation. For example, the names returned
+ * by this function include "ISO-8859-1" (ISO Latin 1), but not "windows-1252"
+ * (Windows Latin 1). However, the actual detection result could be "windows-1252"
+ * when the input data matches Latin 1 code points together with code points that
+ * are only available in "windows-1252".
+ *
+ * @param ucsd a Charset detector.
+ * @param status Any error conditions are reported back in this variable.
+ * @return an iterator providing access to the detectable charset names.
+ * @stable ICU 3.6
+ */
+U_STABLE UEnumeration * U_EXPORT2
+ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
+
+/**
+ * Test whether input filtering is enabled for this charset detector.
+ * Input filtering removes text that appears to be HTML or XML
+ * markup from the input before applying the code page detection
+ * heuristics.
+ *
+ * @param ucsd The charset detector to check.
+ * @return TRUE if filtering is enabled.
+ * @see ucsdet_enableInputFilter
+ * @stable ICU 3.6
+ */
+
+U_STABLE UBool U_EXPORT2
+ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
+
+
+/**
+ * Enable filtering of input text. If filtering is enabled,
+ * text within angle brackets ("<" and ">") will be removed
+ * before detection, which will remove most HTML or XML markup.
+ *
+ * @param ucsd the charset detector to be modified.
+ * @param filter true to enable input text filtering.
+ * @return The previous setting.
+ *
+ * @stable ICU 3.6
+ */
+U_STABLE UBool U_EXPORT2
+ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
+
+#ifndef U_HIDE_INTERNAL_API
+/**
+ * Get an iterator over the set of detectable charsets that are
+ * enabled by the specified charset detector.
+ *
+ * The returned UEnumeration provides access to the names of
+ * the charsets.
+ *
+ * @param ucsd a Charset detector.
+ * @param status Any error conditions are reported back in this variable.
+ * @return an iterator providing access to the names of the charsets
+ * detectable by the specified charset detector.
+ * @internal
+ */
+U_INTERNAL UEnumeration * U_EXPORT2
+ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);
+
+/**
+ * Enable or disable an individual charset encoding.
+ * The name of the charset encoding must be one of the names returned by
+ * ucsdet_getAllDetectableCharsets().
+ *
+ * @param ucsd a Charset detector.
+ * @param encoding the name of the charset encoding.
+ * @param enabled TRUE to enable, or FALSE to disable the
+ * charset encoding.
+ * @param status receives the return status. When the name of charset encoding
+ * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
+ * @internal
+ */
+U_INTERNAL void U_EXPORT2
+ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
+#endif /* U_HIDE_INTERNAL_API */
+
+#endif
+#endif /* __UCSDET_H */
+
+