1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,196 @@ 1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 +#include <stdio.h> 1.9 + 1.10 +#include "nsMBCSGroupProber.h" 1.11 +#include "nsUniversalDetector.h" 1.12 + 1.13 +#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) 1.14 +const char *ProberName[] = 1.15 +{ 1.16 + "UTF8", 1.17 + "SJIS", 1.18 + "EUCJP", 1.19 + "GB18030", 1.20 + "EUCKR", 1.21 + "Big5", 1.22 + "EUCTW", 1.23 +}; 1.24 + 1.25 +#endif 1.26 + 1.27 +nsMBCSGroupProber::nsMBCSGroupProber(uint32_t aLanguageFilter) 1.28 +{ 1.29 + for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 1.30 + mProbers[i] = nullptr; 1.31 + 1.32 + mProbers[0] = new nsUTF8Prober(); 1.33 + if (aLanguageFilter & NS_FILTER_JAPANESE) 1.34 + { 1.35 + mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE); 1.36 + mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE); 1.37 + } 1.38 + if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) 1.39 + mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED); 1.40 + if (aLanguageFilter & NS_FILTER_KOREAN) 1.41 + mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN); 1.42 + if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) 1.43 + { 1.44 + mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); 1.45 + mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); 1.46 + } 1.47 + Reset(); 1.48 +} 1.49 + 1.50 +nsMBCSGroupProber::~nsMBCSGroupProber() 1.51 +{ 1.52 + for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 1.53 + { 1.54 + delete mProbers[i]; 1.55 + } 1.56 +} 1.57 + 1.58 +const char* nsMBCSGroupProber::GetCharSetName() 1.59 +{ 1.60 + if (mBestGuess == -1) 1.61 + { 1.62 + GetConfidence(); 1.63 + if (mBestGuess == -1) 1.64 + mBestGuess = 0; 1.65 + } 1.66 + return mProbers[mBestGuess]->GetCharSetName(); 1.67 +} 1.68 + 1.69 +void nsMBCSGroupProber::Reset(void) 1.70 +{ 1.71 + mActiveNum = 0; 1.72 + for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 1.73 + { 1.74 + if (mProbers[i]) 1.75 + { 1.76 + mProbers[i]->Reset(); 1.77 + mIsActive[i] = true; 1.78 + ++mActiveNum; 1.79 + } 1.80 + else 1.81 + mIsActive[i] = false; 1.82 + } 1.83 + mBestGuess = -1; 1.84 + mState = eDetecting; 1.85 + mKeepNext = 0; 1.86 +} 1.87 + 1.88 +nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen) 1.89 +{ 1.90 + nsProbingState st; 1.91 + uint32_t start = 0; 1.92 + uint32_t keepNext = mKeepNext; 1.93 + 1.94 + //do filtering to reduce load to probers 1.95 + for (uint32_t pos = 0; pos < aLen; ++pos) 1.96 + { 1.97 + if (aBuf[pos] & 0x80) 1.98 + { 1.99 + if (!keepNext) 1.100 + start = pos; 1.101 + keepNext = 2; 1.102 + } 1.103 + else if (keepNext) 1.104 + { 1.105 + if (--keepNext == 0) 1.106 + { 1.107 + for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 1.108 + { 1.109 + if (!mIsActive[i]) 1.110 + continue; 1.111 + st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start); 1.112 + if (st == eFoundIt) 1.113 + { 1.114 + mBestGuess = i; 1.115 + mState = eFoundIt; 1.116 + return mState; 1.117 + } 1.118 + } 1.119 + } 1.120 + } 1.121 + } 1.122 + 1.123 + if (keepNext) { 1.124 + for (uint32_t i = 0; i < NUM_OF_PROBERS; i++) 1.125 + { 1.126 + if (!mIsActive[i]) 1.127 + continue; 1.128 + st = mProbers[i]->HandleData(aBuf + start, aLen - start); 1.129 + if (st == eFoundIt) 1.130 + { 1.131 + mBestGuess = i; 1.132 + mState = eFoundIt; 1.133 + return mState; 1.134 + } 1.135 + } 1.136 + } 1.137 + mKeepNext = keepNext; 1.138 + 1.139 + return mState; 1.140 +} 1.141 + 1.142 +float nsMBCSGroupProber::GetConfidence(void) 1.143 +{ 1.144 + uint32_t i; 1.145 + float bestConf = 0.0, cf; 1.146 + 1.147 + switch (mState) 1.148 + { 1.149 + case eFoundIt: 1.150 + return (float)0.99; 1.151 + case eNotMe: 1.152 + return (float)0.01; 1.153 + default: 1.154 + for (i = 0; i < NUM_OF_PROBERS; i++) 1.155 + { 1.156 + if (!mIsActive[i]) 1.157 + continue; 1.158 + cf = mProbers[i]->GetConfidence(); 1.159 + if (bestConf < cf) 1.160 + { 1.161 + bestConf = cf; 1.162 + mBestGuess = i; 1.163 + } 1.164 + } 1.165 + } 1.166 + return bestConf; 1.167 +} 1.168 + 1.169 +#ifdef DEBUG_chardet 1.170 +void nsMBCSGroupProber::DumpStatus() 1.171 +{ 1.172 + uint32_t i; 1.173 + float cf; 1.174 + 1.175 + GetConfidence(); 1.176 + for (i = 0; i < NUM_OF_PROBERS; i++) 1.177 + { 1.178 + if (!mIsActive[i]) 1.179 + printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 1.180 + else 1.181 + { 1.182 + cf = mProbers[i]->GetConfidence(); 1.183 + printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]); 1.184 + } 1.185 + } 1.186 +} 1.187 +#endif 1.188 + 1.189 +#ifdef DEBUG_jgmyers 1.190 +void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], uint32_t &offset) 1.191 +{ 1.192 + for (uint32_t i = 0; i < NUM_OF_PROBERS; ++i) { 1.193 + states[offset].name = ProberName[i]; 1.194 + states[offset].isActive = mIsActive[i]; 1.195 + states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0; 1.196 + ++offset; 1.197 + } 1.198 +} 1.199 +#endif /* DEBUG_jgmyers */