Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "nscore.h"
8 #include "nsUniversalDetector.h"
10 #include "nsMBCSGroupProber.h"
11 #include "nsSBCSGroupProber.h"
12 #include "nsEscCharsetProber.h"
13 #include "nsLatin1Prober.h"
15 nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter)
16 {
17 mDone = false;
18 mBestGuess = -1; //illegal value as signal
19 mInTag = false;
20 mEscCharSetProber = nullptr;
22 mStart = true;
23 mDetectedCharset = nullptr;
24 mGotData = false;
25 mInputState = ePureAscii;
26 mLastChar = '\0';
27 mLanguageFilter = aLanguageFilter;
29 uint32_t i;
30 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
31 mCharSetProbers[i] = nullptr;
32 }
34 nsUniversalDetector::~nsUniversalDetector()
35 {
36 for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
37 delete mCharSetProbers[i];
39 delete mEscCharSetProber;
40 }
42 void
43 nsUniversalDetector::Reset()
44 {
45 mDone = false;
46 mBestGuess = -1; //illegal value as signal
47 mInTag = false;
49 mStart = true;
50 mDetectedCharset = nullptr;
51 mGotData = false;
52 mInputState = ePureAscii;
53 mLastChar = '\0';
55 if (mEscCharSetProber)
56 mEscCharSetProber->Reset();
58 uint32_t i;
59 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
60 if (mCharSetProbers[i])
61 mCharSetProbers[i]->Reset();
62 }
//---------------------------------------------------------------------
// Confidence thresholds, expressed as floats in the range of values
// returned by the probers' GetConfidence().
//
// SHORTCUT_THRESHOLD: high-confidence cut-off. NOTE(review): not referenced
// in the part of this file visible here; presumably consumed elsewhere.
#define SHORTCUT_THRESHOLD static_cast<float>(0.95)
// MINIMUM_THRESHOLD: DataEnd() reports the best prober's charset only when
// its confidence is strictly greater than this value.
#define MINIMUM_THRESHOLD static_cast<float>(0.20)
68 nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen)
69 {
70 if(mDone)
71 return NS_OK;
73 if (aLen > 0)
74 mGotData = true;
76 //If the data starts with BOM, we know it is UTF
77 if (mStart)
78 {
79 mStart = false;
80 if (aLen >= 2) {
81 switch (aBuf[0]) {
82 case '\xEF':
83 if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) {
84 // EF BB BF UTF-8 encoded BOM
85 mDetectedCharset = "UTF-8";
86 }
87 break;
88 case '\xFE':
89 if ('\xFF' == aBuf[1]) {
90 // FE FF UTF-16, big endian BOM
91 mDetectedCharset = "UTF-16BE";
92 }
93 break;
94 case '\xFF':
95 if ('\xFE' == aBuf[1]) {
96 // FF FE UTF-16, little endian BOM
97 mDetectedCharset = "UTF-16LE";
98 }
99 break;
100 } // switch
101 }
103 if (mDetectedCharset)
104 {
105 mDone = true;
106 return NS_OK;
107 }
108 }
110 uint32_t i;
111 for (i = 0; i < aLen; i++)
112 {
113 //other than 0xa0, if every othe character is ascii, the page is ascii
114 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
115 {
116 //we got a non-ascii byte (high-byte)
117 if (mInputState != eHighbyte)
118 {
119 //adjust state
120 mInputState = eHighbyte;
122 //kill mEscCharSetProber if it is active
123 if (mEscCharSetProber) {
124 delete mEscCharSetProber;
125 mEscCharSetProber = nullptr;
126 }
128 //start multibyte and singlebyte charset prober
129 if (nullptr == mCharSetProbers[0])
130 {
131 mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
132 if (nullptr == mCharSetProbers[0])
133 return NS_ERROR_OUT_OF_MEMORY;
134 }
135 if (nullptr == mCharSetProbers[1] &&
136 (mLanguageFilter & NS_FILTER_NON_CJK))
137 {
138 mCharSetProbers[1] = new nsSBCSGroupProber;
139 if (nullptr == mCharSetProbers[1])
140 return NS_ERROR_OUT_OF_MEMORY;
141 }
142 if (nullptr == mCharSetProbers[2])
143 {
144 mCharSetProbers[2] = new nsLatin1Prober;
145 if (nullptr == mCharSetProbers[2])
146 return NS_ERROR_OUT_OF_MEMORY;
147 }
148 }
149 }
150 else
151 {
152 //ok, just pure ascii so far
153 if ( ePureAscii == mInputState &&
154 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
155 {
156 //found escape character or HZ "~{"
157 mInputState = eEscAscii;
158 }
159 mLastChar = aBuf[i];
160 }
161 }
163 nsProbingState st;
164 switch (mInputState)
165 {
166 case eEscAscii:
167 if (nullptr == mEscCharSetProber) {
168 mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
169 if (nullptr == mEscCharSetProber)
170 return NS_ERROR_OUT_OF_MEMORY;
171 }
172 st = mEscCharSetProber->HandleData(aBuf, aLen);
173 if (st == eFoundIt)
174 {
175 mDone = true;
176 mDetectedCharset = mEscCharSetProber->GetCharSetName();
177 }
178 break;
179 case eHighbyte:
180 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
181 {
182 if (mCharSetProbers[i])
183 {
184 st = mCharSetProbers[i]->HandleData(aBuf, aLen);
185 if (st == eFoundIt)
186 {
187 mDone = true;
188 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
189 return NS_OK;
190 }
191 }
192 }
193 break;
195 default: //pure ascii
196 ;//do nothing here
197 }
198 return NS_OK;
199 }
202 //---------------------------------------------------------------------
203 void nsUniversalDetector::DataEnd()
204 {
205 if (!mGotData)
206 {
207 // we haven't got any data yet, return immediately
208 // caller program sometimes call DataEnd before anything has been sent to detector
209 return;
210 }
212 if (mDetectedCharset)
213 {
214 mDone = true;
215 Report(mDetectedCharset);
216 return;
217 }
219 switch (mInputState)
220 {
221 case eHighbyte:
222 {
223 float proberConfidence;
224 float maxProberConfidence = (float)0.0;
225 int32_t maxProber = 0;
227 for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
228 {
229 if (mCharSetProbers[i])
230 {
231 proberConfidence = mCharSetProbers[i]->GetConfidence();
232 if (proberConfidence > maxProberConfidence)
233 {
234 maxProberConfidence = proberConfidence;
235 maxProber = i;
236 }
237 }
238 }
239 //do not report anything because we are not confident of it, that's in fact a negative answer
240 if (maxProberConfidence > MINIMUM_THRESHOLD)
241 Report(mCharSetProbers[maxProber]->GetCharSetName());
242 }
243 break;
244 case eEscAscii:
245 break;
246 default:
247 ;
248 }
249 return;
250 }