Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef CharDistribution_h__
7 #define CharDistribution_h__
9 #include "nscore.h"
// GotEnoughData() reports true once more than this many characters have been
// counted in mTotalChars.
#define ENOUGH_DATA_THRESHOLD 1024

// Value Reset() stores in mDataThreshold when the language is not the
// preferred one.
#define MINIMUM_DATA_THRESHOLD 4

// Base class for per-encoding character distribution analysers.
//
// A subclass supplies GetOrder(), which turns a two-byte sequence into an
// index ("order"), and installs the frequency table (mCharToFreqOrder /
// mTableSize) that the order is looked up in. This class keeps the running
// totals that the confidence computation is based on.
class CharDistributionAnalysis
{
public:
  // The table members start out empty so that an analyser whose subclass has
  // not installed a table yet never reads through an indeterminate pointer:
  // with mTableSize == 0 the lookup in HandleOneChar() is skipped entirely.
  CharDistributionAnalysis()
    : mCharToFreqOrder(0)
    , mTableSize(0)
    , mTypicalDistributionRatio(0.0f)
  {
    Reset(false);
  }

  // The class is used polymorphically (GetOrder() is virtual), so destroying
  // a subclass through a base pointer must be well defined.
  virtual ~CharDistributionAnalysis() {}

  // Feed a block of data and do distribution analysis.
  // Intentionally a no-op in this class.
  void HandleData(const char* aBuf, uint32_t aLen) {}

  // Feed one character of known byte length.
  //   aStr     - points at the first byte of the character
  //   aCharLen - length of the character in bytes
  // Only two-byte characters for which GetOrder() yields a non-negative
  // order are counted.
  void HandleOneChar(const char* aStr, uint32_t aCharLen)
  {
    // We only care about 2-byte characters in the distribution analysis.
    if (aCharLen != 2)
      return;

    const int32_t order = GetOrder(aStr);
    if (order < 0)
      return;

    mTotalChars++;

    // Orders beyond the table are still counted in the total, but can never
    // be "frequent". A frequency rank below 512 marks a frequent character.
    if ((uint32_t)order < mTableSize && mCharToFreqOrder[order] < 512)
      mFreqChars++;
  }

  // Return the confidence based on the data seen so far.
  // Declared here only; the definition lives outside this header.
  float GetConfidence(void);

  // Reset the analyser and clear the counters.
  //   aIsPreferredLanguage - when true the data threshold is dropped to 0,
  //                          otherwise MINIMUM_DATA_THRESHOLD is used.
  // The frequency table and typical ratio are left untouched.
  void Reset(bool aIsPreferredLanguage)
  {
    mDone = false;
    mTotalChars = 0;
    mFreqChars = 0;
    mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
  }

  // It is not necessary to receive all data to draw a conclusion. For charset
  // detection a certain amount of data is enough: true once more than
  // ENOUGH_DATA_THRESHOLD characters have been counted.
  bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}

protected:
  // We do not handle a character based on its original encoding string, but
  // convert this encoding string to a number, here called order.
  // This allows multiple encodings of a language to share one frequency table.
  // Returns a negative value for characters that should be ignored; this
  // default implementation ignores everything.
  virtual int32_t GetOrder(const char* str) {return -1;}

  // If this flag is set to true, detection is done and a conclusion has been
  // made.
  bool mDone;

  // The number of characters whose frequency order is less than 512.
  uint32_t mFreqChars;

  // Total characters encountered.
  uint32_t mTotalChars;

  // Number of hi-byte characters needed to trigger detection.
  uint32_t mDataThreshold;

  // Mapping table to get the frequency order from the char order (the value
  // returned by GetOrder()). Null until a subclass installs its table.
  const int16_t *mCharToFreqOrder;

  // Number of entries in the table above.
  uint32_t mTableSize;

  // This is a constant value that varies from language to language; it is
  // used in calculating confidence.
  float mTypicalDistributionRatio;
};
89 class EUCTWDistributionAnalysis: public CharDistributionAnalysis
90 {
91 public:
92 EUCTWDistributionAnalysis();
93 protected:
95 //for euc-TW encoding, we are interested
96 // first byte range: 0xc4 -- 0xfe
97 // second byte range: 0xa1 -- 0xfe
98 //no validation needed here. State machine has done that
99 int32_t GetOrder(const char* str)
100 { if ((unsigned char)*str >= (unsigned char)0xc4)
101 return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
102 else
103 return -1;
104 }
105 };
108 class EUCKRDistributionAnalysis : public CharDistributionAnalysis
109 {
110 public:
111 EUCKRDistributionAnalysis();
112 protected:
113 //for euc-KR encoding, we are interested
114 // first byte range: 0xb0 -- 0xfe
115 // second byte range: 0xa1 -- 0xfe
116 //no validation needed here. State machine has done that
117 int32_t GetOrder(const char* str)
118 { if ((unsigned char)*str >= (unsigned char)0xb0)
119 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
120 else
121 return -1;
122 }
123 };
125 class GB2312DistributionAnalysis : public CharDistributionAnalysis
126 {
127 public:
128 GB2312DistributionAnalysis();
129 protected:
130 //for GB2312 encoding, we are interested
131 // first byte range: 0xb0 -- 0xfe
132 // second byte range: 0xa1 -- 0xfe
133 //no validation needed here. State machine has done that
134 int32_t GetOrder(const char* str)
135 { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)
136 return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
137 else
138 return -1;
139 }
140 };
143 class Big5DistributionAnalysis : public CharDistributionAnalysis
144 {
145 public:
146 Big5DistributionAnalysis();
147 protected:
148 //for big5 encoding, we are interested
149 // first byte range: 0xa4 -- 0xfe
150 // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
151 //no validation needed here. State machine has done that
152 int32_t GetOrder(const char* str)
153 { if ((unsigned char)*str >= (unsigned char)0xa4)
154 if ((unsigned char)str[1] >= (unsigned char)0xa1)
155 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
156 else
157 return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
158 else
159 return -1;
160 }
161 };
163 class SJISDistributionAnalysis : public CharDistributionAnalysis
164 {
165 public:
166 SJISDistributionAnalysis();
167 protected:
168 //for sjis encoding, we are interested
169 // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
170 // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
171 //no validation needed here. State machine has done that
172 int32_t GetOrder(const char* str)
173 {
174 int32_t order;
175 if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)
176 order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
177 else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)
178 order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
179 else
180 return -1;
181 order += (unsigned char)*(str+1) - 0x40;
182 if ((unsigned char)str[1] > (unsigned char)0x7f)
183 order--;
184 return order;
185 }
186 };
188 class EUCJPDistributionAnalysis : public CharDistributionAnalysis
189 {
190 public:
191 EUCJPDistributionAnalysis();
192 protected:
193 //for euc-JP encoding, we are interested
194 // first byte range: 0xa0 -- 0xfe
195 // second byte range: 0xa1 -- 0xfe
196 //no validation needed here. State machine has done that
197 int32_t GetOrder(const char* str)
198 { if ((unsigned char)*str >= (unsigned char)0xa0)
199 return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
200 else
201 return -1;
202 }
203 };
205 #endif //CharDistribution_h__