extensions/universalchardet/src/base/nsSBCSGroupProber.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include <stdio.h>
michael@0 7 #include "prmem.h"
michael@0 8
michael@0 9 #include "nsSBCharSetProber.h"
michael@0 10 #include "nsSBCSGroupProber.h"
michael@0 11
michael@0 12 #include "nsHebrewProber.h"
michael@0 13
michael@0 14 nsSBCSGroupProber::nsSBCSGroupProber()
michael@0 15 {
michael@0 16 mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
michael@0 17 mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
michael@0 18 mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
michael@0 19 mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
michael@0 20 mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
michael@0 21 mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
michael@0 22 mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
michael@0 23 mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
michael@0 24 mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
michael@0 25 mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
michael@0 26 mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);
michael@0 27
michael@0 28 nsHebrewProber *hebprober = new nsHebrewProber();
michael@0 29 // Notice: Any change in these indexes - 10,11,12 must be reflected
michael@0 30 // in the code below as well.
michael@0 31 mProbers[11] = hebprober;
michael@0 32 mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew
michael@0 33 mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew
michael@0 34 // Tell the Hebrew prober about the logical and visual probers
michael@0 35 if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
michael@0 36 {
michael@0 37 hebprober->SetModelProbers(mProbers[12], mProbers[13]);
michael@0 38 }
michael@0 39 else // One or more is null. avoid any Hebrew probing, null them all
michael@0 40 {
michael@0 41 for (uint32_t i = 11; i <= 13; ++i)
michael@0 42 {
michael@0 43 delete mProbers[i];
michael@0 44 mProbers[i] = 0;
michael@0 45 }
michael@0 46 }
michael@0 47
michael@0 48 // disable latin2 before latin1 is available, otherwise all latin1
michael@0 49 // will be detected as latin2 because of their similarity.
michael@0 50 //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
michael@0 51 //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
michael@0 52
michael@0 53 Reset();
michael@0 54 }
michael@0 55
michael@0 56 nsSBCSGroupProber::~nsSBCSGroupProber()
michael@0 57 {
michael@0 58 for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++)
michael@0 59 {
michael@0 60 delete mProbers[i];
michael@0 61 }
michael@0 62 }
michael@0 63
michael@0 64
michael@0 65 const char* nsSBCSGroupProber::GetCharSetName()
michael@0 66 {
michael@0 67 //if we have no answer yet
michael@0 68 if (mBestGuess == -1)
michael@0 69 {
michael@0 70 GetConfidence();
michael@0 71 //no charset seems positive
michael@0 72 if (mBestGuess == -1)
michael@0 73 //we will use default.
michael@0 74 mBestGuess = 0;
michael@0 75 }
michael@0 76 return mProbers[mBestGuess]->GetCharSetName();
michael@0 77 }
michael@0 78
michael@0 79 void nsSBCSGroupProber::Reset(void)
michael@0 80 {
michael@0 81 mActiveNum = 0;
michael@0 82 for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++)
michael@0 83 {
michael@0 84 if (mProbers[i]) // not null
michael@0 85 {
michael@0 86 mProbers[i]->Reset();
michael@0 87 mIsActive[i] = true;
michael@0 88 ++mActiveNum;
michael@0 89 }
michael@0 90 else
michael@0 91 mIsActive[i] = false;
michael@0 92 }
michael@0 93 mBestGuess = -1;
michael@0 94 mState = eDetecting;
michael@0 95 }
michael@0 96
michael@0 97
michael@0 98 nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen)
michael@0 99 {
michael@0 100 nsProbingState st;
michael@0 101 uint32_t i;
michael@0 102 char *newBuf1 = 0;
michael@0 103 uint32_t newLen1 = 0;
michael@0 104
michael@0 105 //apply filter to original buffer, and we got new buffer back
michael@0 106 //depend on what script it is, we will feed them the new buffer
michael@0 107 //we got after applying proper filter
michael@0 108 //this is done without any consideration to KeepEnglishLetters
michael@0 109 //of each prober since as of now, there are no probers here which
michael@0 110 //recognize languages with English characters.
michael@0 111 if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1))
michael@0 112 goto done;
michael@0 113
michael@0 114 if (newLen1 == 0)
michael@0 115 goto done; // Nothing to see here, move on.
michael@0 116
michael@0 117 for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
michael@0 118 {
michael@0 119 if (!mIsActive[i])
michael@0 120 continue;
michael@0 121 st = mProbers[i]->HandleData(newBuf1, newLen1);
michael@0 122 if (st == eFoundIt)
michael@0 123 {
michael@0 124 mBestGuess = i;
michael@0 125 mState = eFoundIt;
michael@0 126 break;
michael@0 127 }
michael@0 128 else if (st == eNotMe)
michael@0 129 {
michael@0 130 mIsActive[i] = false;
michael@0 131 mActiveNum--;
michael@0 132 if (mActiveNum <= 0)
michael@0 133 {
michael@0 134 mState = eNotMe;
michael@0 135 break;
michael@0 136 }
michael@0 137 }
michael@0 138 }
michael@0 139
michael@0 140 done:
michael@0 141 PR_FREEIF(newBuf1);
michael@0 142
michael@0 143 return mState;
michael@0 144 }
michael@0 145
michael@0 146 float nsSBCSGroupProber::GetConfidence(void)
michael@0 147 {
michael@0 148 uint32_t i;
michael@0 149 float bestConf = 0.0, cf;
michael@0 150
michael@0 151 switch (mState)
michael@0 152 {
michael@0 153 case eFoundIt:
michael@0 154 return (float)0.99; //sure yes
michael@0 155 case eNotMe:
michael@0 156 return (float)0.01; //sure no
michael@0 157 default:
michael@0 158 for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
michael@0 159 {
michael@0 160 if (!mIsActive[i])
michael@0 161 continue;
michael@0 162 cf = mProbers[i]->GetConfidence();
michael@0 163 if (bestConf < cf)
michael@0 164 {
michael@0 165 bestConf = cf;
michael@0 166 mBestGuess = i;
michael@0 167 }
michael@0 168 }
michael@0 169 }
michael@0 170 return bestConf;
michael@0 171 }
michael@0 172
#ifdef DEBUG_chardet
/**
 * Debug-only: prints the status of every prober in the group to stdout,
 * followed by the group's current best match and confidence.
 *
 * Slots holding no prober are skipped: the constructor may clear the Hebrew
 * slots to null, and Reset() marks null slots inactive, so an inactive slot
 * cannot be assumed to have a prober behind it. Likewise the final summary
 * only names a charset when a best guess actually exists, because
 * GetConfidence() leaves mBestGuess at -1 when it selects no prober.
 */
void nsSBCSGroupProber::DumpStatus()
{
  const float cf = GetConfidence();

  printf(" SBCS Group Prober --------begin status \r\n");
  for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++)
  {
    if (!mProbers[i])
      continue; // empty slot: nothing to report and nothing safe to call

    if (!mIsActive[i])
      printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
    else
      mProbers[i]->DumpStatus();
  }

  if (mBestGuess == -1 || !mProbers[mBestGuess])
  {
    // No prober has been selected; avoid indexing the table with -1.
    printf(" SBCS Group found no best match, confidence %f.\r\n", cf);
  }
  else
  {
    printf(" SBCS Group found best match [%s] confidence %f.\r\n",
           mProbers[mBestGuess]->GetCharSetName(), cf);
  }
}
#endif

mercurial