|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "nscore.h" |
|
7 |
|
8 #include "nsUniversalDetector.h" |
|
9 |
|
10 #include "nsMBCSGroupProber.h" |
|
11 #include "nsSBCSGroupProber.h" |
|
12 #include "nsEscCharsetProber.h" |
|
13 #include "nsLatin1Prober.h" |
|
14 |
|
15 nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter) |
|
16 { |
|
17 mDone = false; |
|
18 mBestGuess = -1; //illegal value as signal |
|
19 mInTag = false; |
|
20 mEscCharSetProber = nullptr; |
|
21 |
|
22 mStart = true; |
|
23 mDetectedCharset = nullptr; |
|
24 mGotData = false; |
|
25 mInputState = ePureAscii; |
|
26 mLastChar = '\0'; |
|
27 mLanguageFilter = aLanguageFilter; |
|
28 |
|
29 uint32_t i; |
|
30 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) |
|
31 mCharSetProbers[i] = nullptr; |
|
32 } |
|
33 |
|
34 nsUniversalDetector::~nsUniversalDetector() |
|
35 { |
|
36 for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) |
|
37 delete mCharSetProbers[i]; |
|
38 |
|
39 delete mEscCharSetProber; |
|
40 } |
|
41 |
|
42 void |
|
43 nsUniversalDetector::Reset() |
|
44 { |
|
45 mDone = false; |
|
46 mBestGuess = -1; //illegal value as signal |
|
47 mInTag = false; |
|
48 |
|
49 mStart = true; |
|
50 mDetectedCharset = nullptr; |
|
51 mGotData = false; |
|
52 mInputState = ePureAscii; |
|
53 mLastChar = '\0'; |
|
54 |
|
55 if (mEscCharSetProber) |
|
56 mEscCharSetProber->Reset(); |
|
57 |
|
58 uint32_t i; |
|
59 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) |
|
60 if (mCharSetProbers[i]) |
|
61 mCharSetProbers[i]->Reset(); |
|
62 } |
|
63 |
|
64 //--------------------------------------------------------------------- |
|
// Confidence cut-offs, both of type float.
// MINIMUM_THRESHOLD: DataEnd() reports the best prober's charset only when
// its confidence is strictly greater than this value.
// SHORTCUT_THRESHOLD: not referenced in the visible part of this file.
#define SHORTCUT_THRESHOLD static_cast<float>(0.95)
#define MINIMUM_THRESHOLD static_cast<float>(0.20)
|
67 |
|
68 nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) |
|
69 { |
|
70 if(mDone) |
|
71 return NS_OK; |
|
72 |
|
73 if (aLen > 0) |
|
74 mGotData = true; |
|
75 |
|
76 //If the data starts with BOM, we know it is UTF |
|
77 if (mStart) |
|
78 { |
|
79 mStart = false; |
|
80 if (aLen >= 2) { |
|
81 switch (aBuf[0]) { |
|
82 case '\xEF': |
|
83 if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) { |
|
84 // EF BB BF UTF-8 encoded BOM |
|
85 mDetectedCharset = "UTF-8"; |
|
86 } |
|
87 break; |
|
88 case '\xFE': |
|
89 if ('\xFF' == aBuf[1]) { |
|
90 // FE FF UTF-16, big endian BOM |
|
91 mDetectedCharset = "UTF-16BE"; |
|
92 } |
|
93 break; |
|
94 case '\xFF': |
|
95 if ('\xFE' == aBuf[1]) { |
|
96 // FF FE UTF-16, little endian BOM |
|
97 mDetectedCharset = "UTF-16LE"; |
|
98 } |
|
99 break; |
|
100 } // switch |
|
101 } |
|
102 |
|
103 if (mDetectedCharset) |
|
104 { |
|
105 mDone = true; |
|
106 return NS_OK; |
|
107 } |
|
108 } |
|
109 |
|
110 uint32_t i; |
|
111 for (i = 0; i < aLen; i++) |
|
112 { |
|
113 //other than 0xa0, if every othe character is ascii, the page is ascii |
|
114 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP |
|
115 { |
|
116 //we got a non-ascii byte (high-byte) |
|
117 if (mInputState != eHighbyte) |
|
118 { |
|
119 //adjust state |
|
120 mInputState = eHighbyte; |
|
121 |
|
122 //kill mEscCharSetProber if it is active |
|
123 if (mEscCharSetProber) { |
|
124 delete mEscCharSetProber; |
|
125 mEscCharSetProber = nullptr; |
|
126 } |
|
127 |
|
128 //start multibyte and singlebyte charset prober |
|
129 if (nullptr == mCharSetProbers[0]) |
|
130 { |
|
131 mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter); |
|
132 if (nullptr == mCharSetProbers[0]) |
|
133 return NS_ERROR_OUT_OF_MEMORY; |
|
134 } |
|
135 if (nullptr == mCharSetProbers[1] && |
|
136 (mLanguageFilter & NS_FILTER_NON_CJK)) |
|
137 { |
|
138 mCharSetProbers[1] = new nsSBCSGroupProber; |
|
139 if (nullptr == mCharSetProbers[1]) |
|
140 return NS_ERROR_OUT_OF_MEMORY; |
|
141 } |
|
142 if (nullptr == mCharSetProbers[2]) |
|
143 { |
|
144 mCharSetProbers[2] = new nsLatin1Prober; |
|
145 if (nullptr == mCharSetProbers[2]) |
|
146 return NS_ERROR_OUT_OF_MEMORY; |
|
147 } |
|
148 } |
|
149 } |
|
150 else |
|
151 { |
|
152 //ok, just pure ascii so far |
|
153 if ( ePureAscii == mInputState && |
|
154 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) |
|
155 { |
|
156 //found escape character or HZ "~{" |
|
157 mInputState = eEscAscii; |
|
158 } |
|
159 mLastChar = aBuf[i]; |
|
160 } |
|
161 } |
|
162 |
|
163 nsProbingState st; |
|
164 switch (mInputState) |
|
165 { |
|
166 case eEscAscii: |
|
167 if (nullptr == mEscCharSetProber) { |
|
168 mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter); |
|
169 if (nullptr == mEscCharSetProber) |
|
170 return NS_ERROR_OUT_OF_MEMORY; |
|
171 } |
|
172 st = mEscCharSetProber->HandleData(aBuf, aLen); |
|
173 if (st == eFoundIt) |
|
174 { |
|
175 mDone = true; |
|
176 mDetectedCharset = mEscCharSetProber->GetCharSetName(); |
|
177 } |
|
178 break; |
|
179 case eHighbyte: |
|
180 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) |
|
181 { |
|
182 if (mCharSetProbers[i]) |
|
183 { |
|
184 st = mCharSetProbers[i]->HandleData(aBuf, aLen); |
|
185 if (st == eFoundIt) |
|
186 { |
|
187 mDone = true; |
|
188 mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); |
|
189 return NS_OK; |
|
190 } |
|
191 } |
|
192 } |
|
193 break; |
|
194 |
|
195 default: //pure ascii |
|
196 ;//do nothing here |
|
197 } |
|
198 return NS_OK; |
|
199 } |
|
200 |
|
201 |
|
202 //--------------------------------------------------------------------- |
|
203 void nsUniversalDetector::DataEnd() |
|
204 { |
|
205 if (!mGotData) |
|
206 { |
|
207 // we haven't got any data yet, return immediately |
|
208 // caller program sometimes call DataEnd before anything has been sent to detector |
|
209 return; |
|
210 } |
|
211 |
|
212 if (mDetectedCharset) |
|
213 { |
|
214 mDone = true; |
|
215 Report(mDetectedCharset); |
|
216 return; |
|
217 } |
|
218 |
|
219 switch (mInputState) |
|
220 { |
|
221 case eHighbyte: |
|
222 { |
|
223 float proberConfidence; |
|
224 float maxProberConfidence = (float)0.0; |
|
225 int32_t maxProber = 0; |
|
226 |
|
227 for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) |
|
228 { |
|
229 if (mCharSetProbers[i]) |
|
230 { |
|
231 proberConfidence = mCharSetProbers[i]->GetConfidence(); |
|
232 if (proberConfidence > maxProberConfidence) |
|
233 { |
|
234 maxProberConfidence = proberConfidence; |
|
235 maxProber = i; |
|
236 } |
|
237 } |
|
238 } |
|
239 //do not report anything because we are not confident of it, that's in fact a negative answer |
|
240 if (maxProberConfidence > MINIMUM_THRESHOLD) |
|
241 Report(mCharSetProbers[maxProber]->GetCharSetName()); |
|
242 } |
|
243 break; |
|
244 case eEscAscii: |
|
245 break; |
|
246 default: |
|
247 ; |
|
248 } |
|
249 return; |
|
250 } |