1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/nsSBCSGroupProber.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,191 @@ 1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include <stdio.h> 1.10 +#include "prmem.h" 1.11 + 1.12 +#include "nsSBCharSetProber.h" 1.13 +#include "nsSBCSGroupProber.h" 1.14 + 1.15 +#include "nsHebrewProber.h" 1.16 + 1.17 +nsSBCSGroupProber::nsSBCSGroupProber() 1.18 +{ 1.19 + mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model); 1.20 + mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); 1.21 + mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); 1.22 + mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); 1.23 + mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); 1.24 + mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); 1.25 + mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); 1.26 + mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); 1.27 + mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); 1.28 + mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); 1.29 + mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel); 1.30 + 1.31 + nsHebrewProber *hebprober = new nsHebrewProber(); 1.32 + // Notice: Any change in these indexes - 10,11,12 must be reflected 1.33 + // in the code below as well. 1.34 + mProbers[11] = hebprober; 1.35 + mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew 1.36 + mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew 1.37 + // Tell the Hebrew prober about the logical and visual probers 1.38 + if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null 1.39 + { 1.40 + hebprober->SetModelProbers(mProbers[12], mProbers[13]); 1.41 + } 1.42 + else // One or more is null. avoid any Hebrew probing, null them all 1.43 + { 1.44 + for (uint32_t i = 11; i <= 13; ++i) 1.45 + { 1.46 + delete mProbers[i]; 1.47 + mProbers[i] = 0; 1.48 + } 1.49 + } 1.50 + 1.51 + // disable latin2 before latin1 is available, otherwise all latin1 1.52 + // will be detected as latin2 because of their similarity. 1.53 + //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); 1.54 + //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); 1.55 + 1.56 + Reset(); 1.57 +} 1.58 + 1.59 +nsSBCSGroupProber::~nsSBCSGroupProber() 1.60 +{ 1.61 + for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++) 1.62 + { 1.63 + delete mProbers[i]; 1.64 + } 1.65 +} 1.66 + 1.67 + 1.68 +const char* nsSBCSGroupProber::GetCharSetName() 1.69 +{ 1.70 + //if we have no answer yet 1.71 + if (mBestGuess == -1) 1.72 + { 1.73 + GetConfidence(); 1.74 + //no charset seems positive 1.75 + if (mBestGuess == -1) 1.76 + //we will use default. 1.77 + mBestGuess = 0; 1.78 + } 1.79 + return mProbers[mBestGuess]->GetCharSetName(); 1.80 +} 1.81 + 1.82 +void nsSBCSGroupProber::Reset(void) 1.83 +{ 1.84 + mActiveNum = 0; 1.85 + for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++) 1.86 + { 1.87 + if (mProbers[i]) // not null 1.88 + { 1.89 + mProbers[i]->Reset(); 1.90 + mIsActive[i] = true; 1.91 + ++mActiveNum; 1.92 + } 1.93 + else 1.94 + mIsActive[i] = false; 1.95 + } 1.96 + mBestGuess = -1; 1.97 + mState = eDetecting; 1.98 +} 1.99 + 1.100 + 1.101 +nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen) 1.102 +{ 1.103 + nsProbingState st; 1.104 + uint32_t i; 1.105 + char *newBuf1 = 0; 1.106 + uint32_t newLen1 = 0; 1.107 + 1.108 + //apply filter to original buffer, and we got new buffer back 1.109 + //depend on what script it is, we will feed them the new buffer 1.110 + //we got after applying proper filter 1.111 + //this is done without any consideration to KeepEnglishLetters 1.112 + //of each prober since as of now, there are no probers here which 1.113 + //recognize languages with English characters. 1.114 + if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) 1.115 + goto done; 1.116 + 1.117 + if (newLen1 == 0) 1.118 + goto done; // Nothing to see here, move on. 1.119 + 1.120 + for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 1.121 + { 1.122 + if (!mIsActive[i]) 1.123 + continue; 1.124 + st = mProbers[i]->HandleData(newBuf1, newLen1); 1.125 + if (st == eFoundIt) 1.126 + { 1.127 + mBestGuess = i; 1.128 + mState = eFoundIt; 1.129 + break; 1.130 + } 1.131 + else if (st == eNotMe) 1.132 + { 1.133 + mIsActive[i] = false; 1.134 + mActiveNum--; 1.135 + if (mActiveNum <= 0) 1.136 + { 1.137 + mState = eNotMe; 1.138 + break; 1.139 + } 1.140 + } 1.141 + } 1.142 + 1.143 +done: 1.144 + PR_FREEIF(newBuf1); 1.145 + 1.146 + return mState; 1.147 +} 1.148 + 1.149 +float nsSBCSGroupProber::GetConfidence(void) 1.150 +{ 1.151 + uint32_t i; 1.152 + float bestConf = 0.0, cf; 1.153 + 1.154 + switch (mState) 1.155 + { 1.156 + case eFoundIt: 1.157 + return (float)0.99; //sure yes 1.158 + case eNotMe: 1.159 + return (float)0.01; //sure no 1.160 + default: 1.161 + for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 1.162 + { 1.163 + if (!mIsActive[i]) 1.164 + continue; 1.165 + cf = mProbers[i]->GetConfidence(); 1.166 + if (bestConf < cf) 1.167 + { 1.168 + bestConf = cf; 1.169 + mBestGuess = i; 1.170 + } 1.171 + } 1.172 + } 1.173 + return bestConf; 1.174 +} 1.175 + 1.176 +#ifdef DEBUG_chardet 1.177 +void nsSBCSGroupProber::DumpStatus() 1.178 +{ 1.179 + uint32_t i; 1.180 + float cf; 1.181 + 1.182 + cf = GetConfidence(); 1.183 + printf(" SBCS Group Prober --------begin status \r\n"); 1.184 + for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) 1.185 + { 1.186 + if (!mIsActive[i]) 1.187 + printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName()); 1.188 + else 1.189 + mProbers[i]->DumpStatus(); 1.190 + } 1.191 + printf(" SBCS Group found best match [%s] confidence %f.\r\n", 1.192 + mProbers[mBestGuess]->GetCharSetName(), cf); 1.193 +} 1.194 +#endif