1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/csrsbcs.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,287 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2013, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + */ 1.10 + 1.11 +#ifndef __CSRSBCS_H 1.12 +#define __CSRSBCS_H 1.13 + 1.14 +#include "unicode/uobject.h" 1.15 + 1.16 +#if !UCONFIG_NO_CONVERSION 1.17 + 1.18 +#include "csrecog.h" 1.19 + 1.20 +U_NAMESPACE_BEGIN 1.21 + 1.22 +class NGramParser : public UMemory 1.23 +{ 1.24 +private: 1.25 + int32_t ngram; 1.26 + const int32_t *ngramList; 1.27 + 1.28 + int32_t ngramCount; 1.29 + int32_t hitCount; 1.30 + 1.31 +protected: 1.32 + int32_t byteIndex; 1.33 + const uint8_t *charMap; 1.34 + 1.35 + void addByte(int32_t b); 1.36 + 1.37 +public: 1.38 + NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); 1.39 + 1.40 +private: 1.41 + /* 1.42 + * Binary search for value in table, which must have exactly 64 entries. 1.43 + */ 1.44 + int32_t search(const int32_t *table, int32_t value); 1.45 + 1.46 + void lookup(int32_t thisNgram); 1.47 + 1.48 + virtual int32_t nextByte(InputText *det); 1.49 + virtual void parseCharacters(InputText *det); 1.50 + 1.51 +public: 1.52 + int32_t parse(InputText *det); 1.53 + 1.54 +}; 1.55 + 1.56 +class NGramParser_IBM420 : public NGramParser 1.57 +{ 1.58 +private: 1.59 + int32_t alef; 1.60 + int32_t isLamAlef(int32_t b); 1.61 + int32_t nextByte(InputText *det); 1.62 + void parseCharacters(InputText *det); 1.63 + 1.64 +public: 1.65 + NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap); 1.66 +}; 1.67 + 1.68 + 1.69 +class CharsetRecog_sbcs : public CharsetRecognizer 1.70 +{ 1.71 +public: 1.72 + CharsetRecog_sbcs(); 1.73 + virtual ~CharsetRecog_sbcs(); 1.74 + virtual const char *getName() const = 0; 1.75 + virtual UBool match(InputText *det, CharsetMatch *results) const = 0; 1.76 + virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 1.77 +}; 1.78 + 1.79 +class CharsetRecog_8859_1 : public CharsetRecog_sbcs 1.80 +{ 1.81 +public: 1.82 + virtual ~CharsetRecog_8859_1(); 1.83 + const char *getName() const; 1.84 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.85 +}; 1.86 + 1.87 +class CharsetRecog_8859_2 : public CharsetRecog_sbcs 1.88 +{ 1.89 +public: 1.90 + virtual ~CharsetRecog_8859_2(); 1.91 + const char *getName() const; 1.92 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.93 +}; 1.94 + 1.95 +class CharsetRecog_8859_5 : public CharsetRecog_sbcs 1.96 +{ 1.97 +public: 1.98 + virtual ~CharsetRecog_8859_5(); 1.99 + const char *getName() const; 1.100 +}; 1.101 + 1.102 +class CharsetRecog_8859_6 : public CharsetRecog_sbcs 1.103 +{ 1.104 +public: 1.105 + virtual ~CharsetRecog_8859_6(); 1.106 + 1.107 + const char *getName() const; 1.108 +}; 1.109 + 1.110 +class CharsetRecog_8859_7 : public CharsetRecog_sbcs 1.111 +{ 1.112 +public: 1.113 + virtual ~CharsetRecog_8859_7(); 1.114 + 1.115 + const char *getName() const; 1.116 +}; 1.117 + 1.118 +class CharsetRecog_8859_8 : public CharsetRecog_sbcs 1.119 +{ 1.120 +public: 1.121 + virtual ~CharsetRecog_8859_8(); 1.122 + 1.123 + virtual const char *getName() const; 1.124 +}; 1.125 + 1.126 +class CharsetRecog_8859_9 : public CharsetRecog_sbcs 1.127 +{ 1.128 +public: 1.129 + virtual ~CharsetRecog_8859_9(); 1.130 + 1.131 + const char *getName() const; 1.132 +}; 1.133 + 1.134 + 1.135 + 1.136 +class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 1.137 +{ 1.138 +public: 1.139 + virtual ~CharsetRecog_8859_5_ru(); 1.140 + 1.141 + const char *getLanguage() const; 1.142 + 1.143 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.144 +}; 1.145 + 1.146 +class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 1.147 +{ 1.148 +public: 1.149 + virtual ~CharsetRecog_8859_6_ar(); 1.150 + 1.151 + const char *getLanguage() const; 1.152 + 1.153 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.154 +}; 1.155 + 1.156 +class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 1.157 +{ 1.158 +public: 1.159 + virtual ~CharsetRecog_8859_7_el(); 1.160 + 1.161 + const char *getLanguage() const; 1.162 + 1.163 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.164 +}; 1.165 + 1.166 +class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 1.167 +{ 1.168 +public: 1.169 + virtual ~CharsetRecog_8859_8_I_he(); 1.170 + 1.171 + const char *getName() const; 1.172 + 1.173 + const char *getLanguage() const; 1.174 + 1.175 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.176 +}; 1.177 + 1.178 +class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 1.179 +{ 1.180 +public: 1.181 + virtual ~CharsetRecog_8859_8_he (); 1.182 + 1.183 + const char *getLanguage() const; 1.184 + 1.185 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.186 +}; 1.187 + 1.188 +class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 1.189 +{ 1.190 +public: 1.191 + virtual ~CharsetRecog_8859_9_tr (); 1.192 + 1.193 + const char *getLanguage() const; 1.194 + 1.195 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.196 +}; 1.197 + 1.198 +class CharsetRecog_windows_1256 : public CharsetRecog_sbcs 1.199 +{ 1.200 +public: 1.201 + virtual ~CharsetRecog_windows_1256(); 1.202 + 1.203 + const char *getName() const; 1.204 + 1.205 + const char *getLanguage() const; 1.206 + 1.207 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.208 +}; 1.209 + 1.210 +class CharsetRecog_windows_1251 : public CharsetRecog_sbcs 1.211 +{ 1.212 +public: 1.213 + virtual ~CharsetRecog_windows_1251(); 1.214 + 1.215 + const char *getName() const; 1.216 + 1.217 + const char *getLanguage() const; 1.218 + 1.219 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.220 +}; 1.221 + 1.222 + 1.223 +class CharsetRecog_KOI8_R : public CharsetRecog_sbcs 1.224 +{ 1.225 +public: 1.226 + virtual ~CharsetRecog_KOI8_R(); 1.227 + 1.228 + const char *getName() const; 1.229 + 1.230 + const char *getLanguage() const; 1.231 + 1.232 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.233 +}; 1.234 + 1.235 +class CharsetRecog_IBM424_he : public CharsetRecog_sbcs 1.236 +{ 1.237 +public: 1.238 + virtual ~CharsetRecog_IBM424_he(); 1.239 + 1.240 + const char *getLanguage() const; 1.241 +}; 1.242 + 1.243 +class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { 1.244 +public: 1.245 + virtual ~CharsetRecog_IBM424_he_rtl(); 1.246 + 1.247 + const char *getName() const; 1.248 + 1.249 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.250 +}; 1.251 + 1.252 +class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { 1.253 + virtual ~CharsetRecog_IBM424_he_ltr(); 1.254 + 1.255 + const char *getName() const; 1.256 + 1.257 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.258 +}; 1.259 + 1.260 +class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs 1.261 +{ 1.262 +public: 1.263 + virtual ~CharsetRecog_IBM420_ar(); 1.264 + 1.265 + const char *getLanguage() const; 1.266 + int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const; 1.267 + 1.268 +}; 1.269 + 1.270 +class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { 1.271 +public: 1.272 + virtual ~CharsetRecog_IBM420_ar_rtl(); 1.273 + 1.274 + const char *getName() const; 1.275 + 1.276 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.277 +}; 1.278 + 1.279 +class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { 1.280 + virtual ~CharsetRecog_IBM420_ar_ltr(); 1.281 + 1.282 + const char *getName() const; 1.283 + 1.284 + virtual UBool match(InputText *det, CharsetMatch *results) const; 1.285 +}; 1.286 + 1.287 +U_NAMESPACE_END 1.288 + 1.289 +#endif /* !UCONFIG_NO_CONVERSION */ 1.290 +#endif /* __CSRSBCS_H */