1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/csrmbcs.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,205 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2012, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + */ 1.10 + 1.11 +#ifndef __CSRMBCS_H 1.12 +#define __CSRMBCS_H 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_CONVERSION 1.17 + 1.18 +#include "csrecog.h" 1.19 + 1.20 +U_NAMESPACE_BEGIN 1.21 + 1.22 +// "Character" iterated character class. 1.23 +// Recognizers for specific mbcs encodings make their "characters" available 1.24 +// by providing a nextChar() function that fills in an instance of IteratedChar 1.25 +// with the next char from the input. 1.26 +// The returned characters are not converted to Unicode, but remain as the raw 1.27 +// bytes (concatenated into an int) from the codepage data. 1.28 +// 1.29 +// For Asian charsets, use the raw input rather than the input that has been 1.30 +// stripped of markup. Detection only considers multi-byte chars, effectively 1.31 +// stripping markup anyway, and double byte chars do occur in markup too. 1.32 +// 1.33 +class IteratedChar : public UMemory 1.34 +{ 1.35 +public: 1.36 + uint32_t charValue; // 1-4 bytes from the raw input data 1.37 + int32_t index; 1.38 + int32_t nextIndex; 1.39 + UBool error; 1.40 + UBool done; 1.41 + 1.42 +public: 1.43 + IteratedChar(); 1.44 + //void reset(); 1.45 + int32_t nextByte(InputText* det); 1.46 +}; 1.47 + 1.48 + 1.49 +class CharsetRecog_mbcs : public CharsetRecognizer { 1.50 + 1.51 +protected: 1.52 + /** 1.53 + * Test the match of this charset with the input text data 1.54 + * which is obtained via the CharsetDetector object. 1.55 + * 1.56 + * @param det The CharsetDetector, which contains the input text 1.57 + * to be checked for being in this charset. 1.58 + * @return Two values packed into one int (Damn java, anyhow) 1.59 + * <br/> 1.60 + * bits 0-7: the match confidence, ranging from 0-100 1.61 + * <br/> 1.62 + * bits 8-15: The match reason, an enum-like value. 1.63 + */ 1.64 + int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; 1.65 + 1.66 +public: 1.67 + 1.68 + virtual ~CharsetRecog_mbcs(); 1.69 + 1.70 + /** 1.71 + * Get the IANA name of this charset. 1.72 + * @return the charset name. 1.73 + */ 1.74 + 1.75 + const char *getName() const = 0; 1.76 + const char *getLanguage() const = 0; 1.77 + UBool match(InputText* input, CharsetMatch *results) const = 0; 1.78 + 1.79 + /** 1.80 + * Get the next character (however many bytes it is) from the input data 1.81 + * Subclasses for specific charset encodings must implement this function 1.82 + * to get characters according to the rules of their encoding scheme. 1.83 + * 1.84 + * This function is not a method of class IteratedChar only because 1.85 + * that would require a lot of extra derived classes, which is awkward. 1.86 + * @param it The IteratedChar "struct" into which the returned char is placed. 1.87 + * @param det The charset detector, which is needed to get at the input byte data 1.88 + * being iterated over. 1.89 + * @return True if a character was returned, false at end of input. 1.90 + */ 1.91 + virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; 1.92 + 1.93 +}; 1.94 + 1.95 + 1.96 +/** 1.97 + * Shift-JIS charset recognizer. 1.98 + * 1.99 + */ 1.100 +class CharsetRecog_sjis : public CharsetRecog_mbcs { 1.101 +public: 1.102 + virtual ~CharsetRecog_sjis(); 1.103 + 1.104 + UBool nextChar(IteratedChar *it, InputText *det) const; 1.105 + 1.106 + UBool match(InputText* input, CharsetMatch *results) const; 1.107 + 1.108 + const char *getName() const; 1.109 + const char *getLanguage() const; 1.110 + 1.111 +}; 1.112 + 1.113 + 1.114 +/** 1.115 + * EUC charset recognizers. One abstract class that provides the common function 1.116 + * for getting the next character according to the EUC encoding scheme, 1.117 + * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 1.118 + * 1.119 + */ 1.120 +class CharsetRecog_euc : public CharsetRecog_mbcs 1.121 +{ 1.122 +public: 1.123 + virtual ~CharsetRecog_euc(); 1.124 + 1.125 + const char *getName() const = 0; 1.126 + const char *getLanguage() const = 0; 1.127 + 1.128 + UBool match(InputText* input, CharsetMatch *results) const = 0; 1.129 + /* 1.130 + * (non-Javadoc) 1.131 + * Get the next character value for EUC based encodings. 1.132 + * Character "value" is simply the raw bytes that make up the character 1.133 + * packed into an int. 1.134 + */ 1.135 + UBool nextChar(IteratedChar *it, InputText *det) const; 1.136 +}; 1.137 + 1.138 +/** 1.139 + * The charset recognize for EUC-JP. A singleton instance of this class 1.140 + * is created and kept by the public CharsetDetector class 1.141 + */ 1.142 +class CharsetRecog_euc_jp : public CharsetRecog_euc 1.143 +{ 1.144 +public: 1.145 + virtual ~CharsetRecog_euc_jp(); 1.146 + 1.147 + const char *getName() const; 1.148 + const char *getLanguage() const; 1.149 + 1.150 + UBool match(InputText* input, CharsetMatch *results) const; 1.151 +}; 1.152 + 1.153 +/** 1.154 + * The charset recognize for EUC-KR. A singleton instance of this class 1.155 + * is created and kept by the public CharsetDetector class 1.156 + */ 1.157 +class CharsetRecog_euc_kr : public CharsetRecog_euc 1.158 +{ 1.159 +public: 1.160 + virtual ~CharsetRecog_euc_kr(); 1.161 + 1.162 + const char *getName() const; 1.163 + const char *getLanguage() const; 1.164 + 1.165 + UBool match(InputText* input, CharsetMatch *results) const; 1.166 +}; 1.167 + 1.168 +/** 1.169 + * 1.170 + * Big5 charset recognizer. 1.171 + * 1.172 + */ 1.173 +class CharsetRecog_big5 : public CharsetRecog_mbcs 1.174 +{ 1.175 +public: 1.176 + virtual ~CharsetRecog_big5(); 1.177 + 1.178 + UBool nextChar(IteratedChar* it, InputText* det) const; 1.179 + 1.180 + const char *getName() const; 1.181 + const char *getLanguage() const; 1.182 + 1.183 + UBool match(InputText* input, CharsetMatch *results) const; 1.184 +}; 1.185 + 1.186 + 1.187 +/** 1.188 + * 1.189 + * GB-18030 recognizer. Uses simplified Chinese statistics. 1.190 + * 1.191 + */ 1.192 +class CharsetRecog_gb_18030 : public CharsetRecog_mbcs 1.193 +{ 1.194 +public: 1.195 + virtual ~CharsetRecog_gb_18030(); 1.196 + 1.197 + UBool nextChar(IteratedChar* it, InputText* det) const; 1.198 + 1.199 + const char *getName() const; 1.200 + const char *getLanguage() const; 1.201 + 1.202 + UBool match(InputText* input, CharsetMatch *results) const; 1.203 +}; 1.204 + 1.205 +U_NAMESPACE_END 1.206 + 1.207 +#endif 1.208 +#endif /* __CSRMBCS_H */