1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/CharDistribution.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,206 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#ifndef CharDistribution_h__ 1.10 +#define CharDistribution_h__ 1.11 + 1.12 +#include "nscore.h" 1.13 + 1.14 +#define ENOUGH_DATA_THRESHOLD 1024 1.15 + 1.16 +#define MINIMUM_DATA_THRESHOLD 4 1.17 + 1.18 +class CharDistributionAnalysis 1.19 +{ 1.20 +public: 1.21 + CharDistributionAnalysis() {Reset(false);} 1.22 + 1.23 + //feed a block of data and do distribution analysis 1.24 + void HandleData(const char* aBuf, uint32_t aLen) {} 1.25 + 1.26 + //Feed a character with known length 1.27 + void HandleOneChar(const char* aStr, uint32_t aCharLen) 1.28 + { 1.29 + int32_t order; 1.30 + 1.31 + //we only care about 2-bytes character in our distribution analysis 1.32 + order = (aCharLen == 2) ? GetOrder(aStr) : -1; 1.33 + 1.34 + if (order >= 0) 1.35 + { 1.36 + mTotalChars++; 1.37 + //order is valid 1.38 + if ((uint32_t)order < mTableSize) 1.39 + { 1.40 + if (512 > mCharToFreqOrder[order]) 1.41 + mFreqChars++; 1.42 + } 1.43 + } 1.44 + } 1.45 + 1.46 + //return confidence base on existing data 1.47 + float GetConfidence(void); 1.48 + 1.49 + //Reset analyser, clear any state 1.50 + void Reset(bool aIsPreferredLanguage) 1.51 + { 1.52 + mDone = false; 1.53 + mTotalChars = 0; 1.54 + mFreqChars = 0; 1.55 + mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD; 1.56 + } 1.57 + 1.58 + //It is not necessary to receive all data to draw conclusion. For charset detection, 1.59 + // certain amount of data is enough 1.60 + bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;} 1.61 + 1.62 +protected: 1.63 + //we do not handle character base on its original encoding string, but 1.64 + //convert this encoding string to a number, here called order. 1.65 + //This allow multiple encoding of a language to share one frequency table 1.66 + virtual int32_t GetOrder(const char* str) {return -1;} 1.67 + 1.68 + //If this flag is set to true, detection is done and conclusion has been made 1.69 + bool mDone; 1.70 + 1.71 + //The number of characters whose frequency order is less than 512 1.72 + uint32_t mFreqChars; 1.73 + 1.74 + //Total character encounted. 1.75 + uint32_t mTotalChars; 1.76 + 1.77 + //Number of hi-byte characters needed to trigger detection 1.78 + uint32_t mDataThreshold; 1.79 + 1.80 + //Mapping table to get frequency order from char order (get from GetOrder()) 1.81 + const int16_t *mCharToFreqOrder; 1.82 + 1.83 + //Size of above table 1.84 + uint32_t mTableSize; 1.85 + 1.86 + //This is a constant value varies from language to language, it is used in 1.87 + //calculating confidence. See my paper for further detail. 1.88 + float mTypicalDistributionRatio; 1.89 +}; 1.90 + 1.91 + 1.92 +class EUCTWDistributionAnalysis: public CharDistributionAnalysis 1.93 +{ 1.94 +public: 1.95 + EUCTWDistributionAnalysis(); 1.96 +protected: 1.97 + 1.98 + //for euc-TW encoding, we are interested 1.99 + // first byte range: 0xc4 -- 0xfe 1.100 + // second byte range: 0xa1 -- 0xfe 1.101 + //no validation needed here. State machine has done that 1.102 + int32_t GetOrder(const char* str) 1.103 + { if ((unsigned char)*str >= (unsigned char)0xc4) 1.104 + return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; 1.105 + else 1.106 + return -1; 1.107 + } 1.108 +}; 1.109 + 1.110 + 1.111 +class EUCKRDistributionAnalysis : public CharDistributionAnalysis 1.112 +{ 1.113 +public: 1.114 + EUCKRDistributionAnalysis(); 1.115 +protected: 1.116 + //for euc-KR encoding, we are interested 1.117 + // first byte range: 0xb0 -- 0xfe 1.118 + // second byte range: 0xa1 -- 0xfe 1.119 + //no validation needed here. State machine has done that 1.120 + int32_t GetOrder(const char* str) 1.121 + { if ((unsigned char)*str >= (unsigned char)0xb0) 1.122 + return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 1.123 + else 1.124 + return -1; 1.125 + } 1.126 +}; 1.127 + 1.128 +class GB2312DistributionAnalysis : public CharDistributionAnalysis 1.129 +{ 1.130 +public: 1.131 + GB2312DistributionAnalysis(); 1.132 +protected: 1.133 + //for GB2312 encoding, we are interested 1.134 + // first byte range: 0xb0 -- 0xfe 1.135 + // second byte range: 0xa1 -- 0xfe 1.136 + //no validation needed here. State machine has done that 1.137 + int32_t GetOrder(const char* str) 1.138 + { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) 1.139 + return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; 1.140 + else 1.141 + return -1; 1.142 + } 1.143 +}; 1.144 + 1.145 + 1.146 +class Big5DistributionAnalysis : public CharDistributionAnalysis 1.147 +{ 1.148 +public: 1.149 + Big5DistributionAnalysis(); 1.150 +protected: 1.151 + //for big5 encoding, we are interested 1.152 + // first byte range: 0xa4 -- 0xfe 1.153 + // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 1.154 + //no validation needed here. State machine has done that 1.155 + int32_t GetOrder(const char* str) 1.156 + { if ((unsigned char)*str >= (unsigned char)0xa4) 1.157 + if ((unsigned char)str[1] >= (unsigned char)0xa1) 1.158 + return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; 1.159 + else 1.160 + return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; 1.161 + else 1.162 + return -1; 1.163 + } 1.164 +}; 1.165 + 1.166 +class SJISDistributionAnalysis : public CharDistributionAnalysis 1.167 +{ 1.168 +public: 1.169 + SJISDistributionAnalysis(); 1.170 +protected: 1.171 + //for sjis encoding, we are interested 1.172 + // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 1.173 + // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe 1.174 + //no validation needed here. State machine has done that 1.175 + int32_t GetOrder(const char* str) 1.176 + { 1.177 + int32_t order; 1.178 + if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) 1.179 + order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); 1.180 + else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) 1.181 + order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); 1.182 + else 1.183 + return -1; 1.184 + order += (unsigned char)*(str+1) - 0x40; 1.185 + if ((unsigned char)str[1] > (unsigned char)0x7f) 1.186 + order--; 1.187 + return order; 1.188 + } 1.189 +}; 1.190 + 1.191 +class EUCJPDistributionAnalysis : public CharDistributionAnalysis 1.192 +{ 1.193 +public: 1.194 + EUCJPDistributionAnalysis(); 1.195 +protected: 1.196 + //for euc-JP encoding, we are interested 1.197 + // first byte range: 0xa0 -- 0xfe 1.198 + // second byte range: 0xa1 -- 0xfe 1.199 + //no validation needed here. State machine has done that 1.200 + int32_t GetOrder(const char* str) 1.201 + { if ((unsigned char)*str >= (unsigned char)0xa0) 1.202 + return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; 1.203 + else 1.204 + return -1; 1.205 + } 1.206 +}; 1.207 + 1.208 +#endif //CharDistribution_h__ 1.209 +