1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/nsUniversalDetector.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,250 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "nscore.h" 1.10 + 1.11 +#include "nsUniversalDetector.h" 1.12 + 1.13 +#include "nsMBCSGroupProber.h" 1.14 +#include "nsSBCSGroupProber.h" 1.15 +#include "nsEscCharsetProber.h" 1.16 +#include "nsLatin1Prober.h" 1.17 + 1.18 +nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter) 1.19 +{ 1.20 + mDone = false; 1.21 + mBestGuess = -1; //illegal value as signal 1.22 + mInTag = false; 1.23 + mEscCharSetProber = nullptr; 1.24 + 1.25 + mStart = true; 1.26 + mDetectedCharset = nullptr; 1.27 + mGotData = false; 1.28 + mInputState = ePureAscii; 1.29 + mLastChar = '\0'; 1.30 + mLanguageFilter = aLanguageFilter; 1.31 + 1.32 + uint32_t i; 1.33 + for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 1.34 + mCharSetProbers[i] = nullptr; 1.35 +} 1.36 + 1.37 +nsUniversalDetector::~nsUniversalDetector() 1.38 +{ 1.39 + for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 1.40 + delete mCharSetProbers[i]; 1.41 + 1.42 + delete mEscCharSetProber; 1.43 +} 1.44 + 1.45 +void 1.46 +nsUniversalDetector::Reset() 1.47 +{ 1.48 + mDone = false; 1.49 + mBestGuess = -1; //illegal value as signal 1.50 + mInTag = false; 1.51 + 1.52 + mStart = true; 1.53 + mDetectedCharset = nullptr; 1.54 + mGotData = false; 1.55 + mInputState = ePureAscii; 1.56 + mLastChar = '\0'; 1.57 + 1.58 + if (mEscCharSetProber) 1.59 + mEscCharSetProber->Reset(); 1.60 + 1.61 + uint32_t i; 1.62 + for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 1.63 + if (mCharSetProbers[i]) 1.64 + mCharSetProbers[i]->Reset(); 1.65 +} 1.66 + 1.67 +//--------------------------------------------------------------------- 1.68 +#define SHORTCUT_THRESHOLD (float)0.95 1.69 +#define MINIMUM_THRESHOLD (float)0.20 1.70 + 1.71 +nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) 1.72 +{ 1.73 + if(mDone) 1.74 + return NS_OK; 1.75 + 1.76 + if (aLen > 0) 1.77 + mGotData = true; 1.78 + 1.79 + //If the data starts with BOM, we know it is UTF 1.80 + if (mStart) 1.81 + { 1.82 + mStart = false; 1.83 + if (aLen >= 2) { 1.84 + switch (aBuf[0]) { 1.85 + case '\xEF': 1.86 + if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) { 1.87 + // EF BB BF UTF-8 encoded BOM 1.88 + mDetectedCharset = "UTF-8"; 1.89 + } 1.90 + break; 1.91 + case '\xFE': 1.92 + if ('\xFF' == aBuf[1]) { 1.93 + // FE FF UTF-16, big endian BOM 1.94 + mDetectedCharset = "UTF-16BE"; 1.95 + } 1.96 + break; 1.97 + case '\xFF': 1.98 + if ('\xFE' == aBuf[1]) { 1.99 + // FF FE UTF-16, little endian BOM 1.100 + mDetectedCharset = "UTF-16LE"; 1.101 + } 1.102 + break; 1.103 + } // switch 1.104 + } 1.105 + 1.106 + if (mDetectedCharset) 1.107 + { 1.108 + mDone = true; 1.109 + return NS_OK; 1.110 + } 1.111 + } 1.112 + 1.113 + uint32_t i; 1.114 + for (i = 0; i < aLen; i++) 1.115 + { 1.116 + //other than 0xa0, if every othe character is ascii, the page is ascii 1.117 + if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP 1.118 + { 1.119 + //we got a non-ascii byte (high-byte) 1.120 + if (mInputState != eHighbyte) 1.121 + { 1.122 + //adjust state 1.123 + mInputState = eHighbyte; 1.124 + 1.125 + //kill mEscCharSetProber if it is active 1.126 + if (mEscCharSetProber) { 1.127 + delete mEscCharSetProber; 1.128 + mEscCharSetProber = nullptr; 1.129 + } 1.130 + 1.131 + //start multibyte and singlebyte charset prober 1.132 + if (nullptr == mCharSetProbers[0]) 1.133 + { 1.134 + mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter); 1.135 + if (nullptr == mCharSetProbers[0]) 1.136 + return NS_ERROR_OUT_OF_MEMORY; 1.137 + } 1.138 + if (nullptr == mCharSetProbers[1] && 1.139 + (mLanguageFilter & NS_FILTER_NON_CJK)) 1.140 + { 1.141 + mCharSetProbers[1] = new nsSBCSGroupProber; 1.142 + if (nullptr == mCharSetProbers[1]) 1.143 + return NS_ERROR_OUT_OF_MEMORY; 1.144 + } 1.145 + if (nullptr == mCharSetProbers[2]) 1.146 + { 1.147 + mCharSetProbers[2] = new nsLatin1Prober; 1.148 + if (nullptr == mCharSetProbers[2]) 1.149 + return NS_ERROR_OUT_OF_MEMORY; 1.150 + } 1.151 + } 1.152 + } 1.153 + else 1.154 + { 1.155 + //ok, just pure ascii so far 1.156 + if ( ePureAscii == mInputState && 1.157 + (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) 1.158 + { 1.159 + //found escape character or HZ "~{" 1.160 + mInputState = eEscAscii; 1.161 + } 1.162 + mLastChar = aBuf[i]; 1.163 + } 1.164 + } 1.165 + 1.166 + nsProbingState st; 1.167 + switch (mInputState) 1.168 + { 1.169 + case eEscAscii: 1.170 + if (nullptr == mEscCharSetProber) { 1.171 + mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter); 1.172 + if (nullptr == mEscCharSetProber) 1.173 + return NS_ERROR_OUT_OF_MEMORY; 1.174 + } 1.175 + st = mEscCharSetProber->HandleData(aBuf, aLen); 1.176 + if (st == eFoundIt) 1.177 + { 1.178 + mDone = true; 1.179 + mDetectedCharset = mEscCharSetProber->GetCharSetName(); 1.180 + } 1.181 + break; 1.182 + case eHighbyte: 1.183 + for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 1.184 + { 1.185 + if (mCharSetProbers[i]) 1.186 + { 1.187 + st = mCharSetProbers[i]->HandleData(aBuf, aLen); 1.188 + if (st == eFoundIt) 1.189 + { 1.190 + mDone = true; 1.191 + mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); 1.192 + return NS_OK; 1.193 + } 1.194 + } 1.195 + } 1.196 + break; 1.197 + 1.198 + default: //pure ascii 1.199 + ;//do nothing here 1.200 + } 1.201 + return NS_OK; 1.202 +} 1.203 + 1.204 + 1.205 +//--------------------------------------------------------------------- 1.206 +void nsUniversalDetector::DataEnd() 1.207 +{ 1.208 + if (!mGotData) 1.209 + { 1.210 + // we haven't got any data yet, return immediately 1.211 + // caller program sometimes call DataEnd before anything has been sent to detector 1.212 + return; 1.213 + } 1.214 + 1.215 + if (mDetectedCharset) 1.216 + { 1.217 + mDone = true; 1.218 + Report(mDetectedCharset); 1.219 + return; 1.220 + } 1.221 + 1.222 + switch (mInputState) 1.223 + { 1.224 + case eHighbyte: 1.225 + { 1.226 + float proberConfidence; 1.227 + float maxProberConfidence = (float)0.0; 1.228 + int32_t maxProber = 0; 1.229 + 1.230 + for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) 1.231 + { 1.232 + if (mCharSetProbers[i]) 1.233 + { 1.234 + proberConfidence = mCharSetProbers[i]->GetConfidence(); 1.235 + if (proberConfidence > maxProberConfidence) 1.236 + { 1.237 + maxProberConfidence = proberConfidence; 1.238 + maxProber = i; 1.239 + } 1.240 + } 1.241 + } 1.242 + //do not report anything because we are not confident of it, that's in fact a negative answer 1.243 + if (maxProberConfidence > MINIMUM_THRESHOLD) 1.244 + Report(mCharSetProbers[maxProber]->GetCharSetName()); 1.245 + } 1.246 + break; 1.247 + case eEscAscii: 1.248 + break; 1.249 + default: 1.250 + ; 1.251 + } 1.252 + return; 1.253 +}