intl/icu/source/i18n/csrmbcs.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/csrmbcs.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,205 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 2005-2012, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + */
    1.10 +
    1.11 +#ifndef __CSRMBCS_H
    1.12 +#define __CSRMBCS_H
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_CONVERSION
    1.17 +
    1.18 +#include "csrecog.h"
    1.19 +
    1.20 +U_NAMESPACE_BEGIN
    1.21 +
    1.22 +// "Character"  iterated character class.
    1.23 +//    Recognizers for specific mbcs encodings make their "characters" available
    1.24 +//    by providing a nextChar() function that fills in an instance of IteratedChar
    1.25 +//    with the next char from the input.
    1.26 +//    The returned characters are not converted to Unicode, but remain as the raw
    1.27 +//    bytes (concatenated into an int) from the codepage data.
    1.28 +//
    1.29 +//  For Asian charsets, use the raw input rather than the input that has been
    1.30 +//   stripped of markup.  Detection only considers multi-byte chars, effectively
    1.31 +//   stripping markup anyway, and double byte chars do occur in markup too.
    1.32 +//
    1.33 +class IteratedChar : public UMemory
    1.34 +{
    1.35 +public:
    1.36 +    uint32_t charValue;             // 1-4 bytes from the raw input data
    1.37 +    int32_t  index;                 // presumably the input offset of this char -- TODO confirm
    1.38 +    int32_t  nextIndex;             // presumably the input offset of the next unread byte -- TODO confirm
    1.39 +    UBool    error;                 // presumably set when the bytes do not form a valid char -- TODO confirm
    1.40 +    UBool    done;                  // presumably set once the end of the input is reached -- TODO confirm
    1.41 +
    1.42 +public:
    1.43 +    IteratedChar();
    1.44 +    //void reset();
    1.45 +    int32_t nextByte(InputText* det);   // fetches one byte from det; end-of-input return value is defined in the .cpp
    1.46 +};
    1.47 +
    1.48 +
    1.49 +class CharsetRecog_mbcs : public CharsetRecognizer {
    1.50 +
    1.51 +protected:
    1.52 +    /**
    1.53 +     * Test the match of this charset with the input text data.
    1.54 +     *
    1.55 +     * @param det             The InputText holding the input data to be
    1.56 +     *                        checked for being in this charset.
    1.57 +     * @param commonChars     Table of 16-bit character values supplied by the caller
    1.58 +     *                        (presumably frequent chars of the charset -- TODO confirm).
    1.59 +     * @param commonCharsLen  Presumably the number of entries in commonChars.
    1.60 +     * @return     Per the original comment: bits 0-7 hold the match confidence,
    1.61 +     *             ranging from 0-100, and bits 8-15 the match reason (enum-like).
    1.62 +     *             NOTE(review): confirm the packing against the implementation.
    1.63 +     */
    1.64 +    int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
    1.65 +
    1.66 +public:
    1.67 +
    1.68 +    virtual ~CharsetRecog_mbcs();
    1.69 +
    1.70 +    /**
    1.71 +     * Get the IANA name of this charset.
    1.72 +     * @return the charset name.
    1.73 +     */
    1.74 +
    1.75 +    const char *getName() const = 0;
    1.76 +    const char *getLanguage() const = 0;                                // pure here; defined by the concrete recognizers
    1.77 +    UBool match(InputText* input, CharsetMatch *results) const = 0;     // pure here; defined by the concrete recognizers
    1.78 +
    1.79 +    /**
    1.80 +     * Get the next character (however many bytes it is) from the input data
    1.81 +     *    Subclasses for specific charset encodings must implement this function
    1.82 +     *    to get characters according to the rules of their encoding scheme.
    1.83 +     *
    1.84 +     *  This function is not a method of class IteratedChar only because
    1.85 +     *   that would require a lot of extra derived classes, which is awkward.
    1.86 +     * @param it     The IteratedChar "struct" into which the returned char is placed.
    1.87 +     * @param textIn The input text, which is needed to get at the input byte data
    1.88 +     *               being iterated over.
    1.89 +     * @return    True if a character was returned, false at end of input.
    1.90 +     */
    1.91 +    virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
    1.92 +
    1.93 +};
    1.94 +
    1.95 +
    1.96 +/**
    1.97 + *   Shift-JIS charset recognizer.
    1.98 + *
    1.99 + */
   1.100 +class CharsetRecog_sjis : public CharsetRecog_mbcs {
   1.101 +public:
   1.102 +    virtual ~CharsetRecog_sjis();
   1.103 +
   1.104 +    UBool nextChar(IteratedChar *it, InputText *det) const;      // implements the pure virtual CharsetRecog_mbcs::nextChar
   1.105 +
   1.106 +    UBool match(InputText* input, CharsetMatch *results) const;  // implements the pure virtual match() of CharsetRecog_mbcs
   1.107 +
   1.108 +    const char *getName() const;                                 // implements the pure virtual getName() of CharsetRecog_mbcs
   1.109 +    const char *getLanguage() const;                             // implements the pure virtual getLanguage() of CharsetRecog_mbcs
   1.110 +
   1.111 +};
   1.112 +
   1.113 +
   1.114 +/**
   1.115 + *   EUC charset recognizers.  One abstract class that provides the common function
   1.116 + *             for getting the next character according to the EUC encoding scheme,
   1.117 + *             and derived classes for EUC-JP and EUC-KR.
   1.118 + *
   1.119 + */
   1.120 +class CharsetRecog_euc : public CharsetRecog_mbcs
   1.121 +{
   1.122 +public:
   1.123 +    virtual ~CharsetRecog_euc();
   1.124 +
   1.125 +    const char *getName() const = 0;        // still pure: each EUC variant supplies its own name
   1.126 +    const char *getLanguage() const = 0;    // still pure: each EUC variant supplies its own language
   1.127 +
   1.128 +    UBool match(InputText* input, CharsetMatch *results) const = 0;   // still pure: defined by the EUC variants
   1.129 +    /*
   1.130 +     *  (non-Javadoc)
   1.131 +     *  Get the next character value for EUC based encodings.
   1.132 +     *  Character "value" is simply the raw bytes that make up the character
   1.133 +     *     packed into an int.
   1.134 +     */
   1.135 +    UBool nextChar(IteratedChar *it, InputText *det) const;   // shared by the EUC variants; implements CharsetRecog_mbcs::nextChar
   1.136 +};
   1.137 +
   1.138 +/**
   1.139 + * The charset recognizer for EUC-JP.  A singleton instance of this class
   1.140 + *    is created and kept by the public CharsetDetector class
   1.141 + */
   1.142 +class CharsetRecog_euc_jp : public CharsetRecog_euc
   1.143 +{
   1.144 +public:
   1.145 +    virtual ~CharsetRecog_euc_jp();
   1.146 +
   1.147 +    const char *getName() const;        // implements the pure virtual getName() of CharsetRecog_euc
   1.148 +    const char *getLanguage() const;    // implements the pure virtual getLanguage() of CharsetRecog_euc
   1.149 +
   1.150 +    UBool match(InputText* input, CharsetMatch *results) const;   // implements the pure virtual match() of CharsetRecog_euc
   1.151 +};
   1.152 +
   1.153 +/**
   1.154 + * The charset recognizer for EUC-KR.  A singleton instance of this class
   1.155 + *    is created and kept by the public CharsetDetector class
   1.156 + */
   1.157 +class CharsetRecog_euc_kr : public CharsetRecog_euc
   1.158 +{
   1.159 +public:
   1.160 +    virtual ~CharsetRecog_euc_kr();
   1.161 +
   1.162 +    const char *getName() const;        // implements the pure virtual getName() of CharsetRecog_euc
   1.163 +    const char *getLanguage() const;    // implements the pure virtual getLanguage() of CharsetRecog_euc
   1.164 +
   1.165 +    UBool match(InputText* input, CharsetMatch *results) const;   // implements the pure virtual match() of CharsetRecog_euc
   1.166 +};
   1.167 +
   1.168 +/**
   1.169 + *
   1.170 + *   Big5 charset recognizer.
   1.171 + *
   1.172 + */
   1.173 +class CharsetRecog_big5 : public CharsetRecog_mbcs
   1.174 +{
   1.175 +public:
   1.176 +    virtual ~CharsetRecog_big5();
   1.177 +
   1.178 +    UBool nextChar(IteratedChar* it, InputText* det) const;   // implements the pure virtual CharsetRecog_mbcs::nextChar
   1.179 +
   1.180 +    const char *getName() const;        // implements the pure virtual getName() of CharsetRecog_mbcs
   1.181 +    const char *getLanguage() const;    // implements the pure virtual getLanguage() of CharsetRecog_mbcs
   1.182 +
   1.183 +    UBool match(InputText* input, CharsetMatch *results) const;   // implements the pure virtual match() of CharsetRecog_mbcs
   1.184 +};
   1.185 +
   1.186 +
   1.187 +/**
   1.188 + *
   1.189 + *   GB-18030 recognizer. Uses simplified Chinese statistics.
   1.190 + *
   1.191 + */
   1.192 +class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
   1.193 +{
   1.194 +public:
   1.195 +    virtual ~CharsetRecog_gb_18030();
   1.196 +
   1.197 +    UBool nextChar(IteratedChar* it, InputText* det) const;   // implements the pure virtual CharsetRecog_mbcs::nextChar
   1.198 +
   1.199 +    const char *getName() const;        // implements the pure virtual getName() of CharsetRecog_mbcs
   1.200 +    const char *getLanguage() const;    // implements the pure virtual getLanguage() of CharsetRecog_mbcs
   1.201 +
   1.202 +    UBool match(InputText* input, CharsetMatch *results) const;   // implements the pure virtual match() of CharsetRecog_mbcs
   1.203 +};
   1.204 +
   1.205 +U_NAMESPACE_END
   1.206 +
   1.207 +#endif
   1.208 +#endif /* __CSRMBCS_H */

mercurial