extensions/universalchardet/src/base/nsSBCSGroupProber.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsSBCSGroupProber.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,191 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include <stdio.h>
    1.10 +#include "prmem.h"
    1.11 +
    1.12 +#include "nsSBCharSetProber.h"
    1.13 +#include "nsSBCSGroupProber.h"
    1.14 +
    1.15 +#include "nsHebrewProber.h"
    1.16 +
    1.17 +nsSBCSGroupProber::nsSBCSGroupProber()
    1.18 +{
    1.19 +  mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
    1.20 +  mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
    1.21 +  mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
    1.22 +  mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
    1.23 +  mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
    1.24 +  mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
    1.25 +  mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
    1.26 +  mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
    1.27 +  mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
    1.28 +  mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
    1.29 +  mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);
    1.30 +
    1.31 +  nsHebrewProber *hebprober = new nsHebrewProber();
    1.32 +  // Notice: Any change in these indexes - 10,11,12 must be reflected
    1.33 +  // in the code below as well.
    1.34 +  mProbers[11] = hebprober;
    1.35 +  mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew
    1.36 +  mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew
    1.37 +  // Tell the Hebrew prober about the logical and visual probers
    1.38 +  if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
    1.39 +  {
    1.40 +    hebprober->SetModelProbers(mProbers[12], mProbers[13]);
    1.41 +  }
    1.42 +  else // One or more is null. avoid any Hebrew probing, null them all
    1.43 +  {
    1.44 +    for (uint32_t i = 11; i <= 13; ++i)
    1.45 +    { 
    1.46 +      delete mProbers[i]; 
    1.47 +      mProbers[i] = 0; 
    1.48 +    }
    1.49 +  }
    1.50 +
    1.51 +  // disable latin2 before latin1 is available, otherwise all latin1 
    1.52 +  // will be detected as latin2 because of their similarity.
    1.53 +  //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
    1.54 +  //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
    1.55 +
    1.56 +  Reset();
    1.57 +}
    1.58 +
    1.59 +nsSBCSGroupProber::~nsSBCSGroupProber()
    1.60 +{
    1.61 +  for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++)
    1.62 +  {
    1.63 +    delete mProbers[i];
    1.64 +  }
    1.65 +}
    1.66 +
    1.67 +
    1.68 +const char* nsSBCSGroupProber::GetCharSetName()
    1.69 +{
    1.70 +  //if we have no answer yet
    1.71 +  if (mBestGuess == -1)
    1.72 +  {
    1.73 +    GetConfidence();
    1.74 +    //no charset seems positive
    1.75 +    if (mBestGuess == -1)
    1.76 +      //we will use default.
    1.77 +      mBestGuess = 0;
    1.78 +  }
    1.79 +  return mProbers[mBestGuess]->GetCharSetName();
    1.80 +}
    1.81 +
    1.82 +void  nsSBCSGroupProber::Reset(void)
    1.83 +{
    1.84 +  mActiveNum = 0;
    1.85 +  for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++)
    1.86 +  {
    1.87 +    if (mProbers[i]) // not null
    1.88 +    {
    1.89 +      mProbers[i]->Reset();
    1.90 +      mIsActive[i] = true;
    1.91 +      ++mActiveNum;
    1.92 +    }
    1.93 +    else
    1.94 +      mIsActive[i] = false;
    1.95 +  }
    1.96 +  mBestGuess = -1;
    1.97 +  mState = eDetecting;
    1.98 +}
    1.99 +
   1.100 +
   1.101 +nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen)
   1.102 +{
   1.103 +  nsProbingState st;
   1.104 +  uint32_t i;
   1.105 +  char *newBuf1 = 0;
   1.106 +  uint32_t newLen1 = 0;
   1.107 +
   1.108 +  //apply filter to original buffer, and we got new buffer back
   1.109 +  //depend on what script it is, we will feed them the new buffer 
   1.110 +  //we got after applying proper filter
   1.111 +  //this is done without any consideration to KeepEnglishLetters
   1.112 +  //of each prober since as of now, there are no probers here which
   1.113 +  //recognize languages with English characters.
   1.114 +  if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1))
   1.115 +    goto done;
   1.116 +  
   1.117 +  if (newLen1 == 0)
   1.118 +    goto done; // Nothing to see here, move on.
   1.119 +
   1.120 +  for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
   1.121 +  {
   1.122 +     if (!mIsActive[i])
   1.123 +       continue;
   1.124 +     st = mProbers[i]->HandleData(newBuf1, newLen1);
   1.125 +     if (st == eFoundIt)
   1.126 +     {
   1.127 +       mBestGuess = i;
   1.128 +       mState = eFoundIt;
   1.129 +       break;
   1.130 +     }
   1.131 +     else if (st == eNotMe)
   1.132 +     {
   1.133 +       mIsActive[i] = false;
   1.134 +       mActiveNum--;
   1.135 +       if (mActiveNum <= 0)
   1.136 +       {
   1.137 +         mState = eNotMe;
   1.138 +         break;
   1.139 +       }
   1.140 +     }
   1.141 +  }
   1.142 +
   1.143 +done:
   1.144 +  PR_FREEIF(newBuf1);
   1.145 +
   1.146 +  return mState;
   1.147 +}
   1.148 +
   1.149 +float nsSBCSGroupProber::GetConfidence(void)
   1.150 +{
   1.151 +  uint32_t i;
   1.152 +  float bestConf = 0.0, cf;
   1.153 +
   1.154 +  switch (mState)
   1.155 +  {
   1.156 +  case eFoundIt:
   1.157 +    return (float)0.99; //sure yes
   1.158 +  case eNotMe:
   1.159 +    return (float)0.01;  //sure no
   1.160 +  default:
   1.161 +    for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
   1.162 +    {
   1.163 +      if (!mIsActive[i])
   1.164 +        continue;
   1.165 +      cf = mProbers[i]->GetConfidence();
   1.166 +      if (bestConf < cf)
   1.167 +      {
   1.168 +        bestConf = cf;
   1.169 +        mBestGuess = i;
   1.170 +      }
   1.171 +    }
   1.172 +  }
   1.173 +  return bestConf;
   1.174 +}
   1.175 +
   1.176 +#ifdef DEBUG_chardet
   1.177 +void nsSBCSGroupProber::DumpStatus()
   1.178 +{
   1.179 +  uint32_t i;
   1.180 +  float cf;
   1.181 +  
   1.182 +  cf = GetConfidence();
   1.183 +  printf(" SBCS Group Prober --------begin status \r\n");
   1.184 +  for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
   1.185 +  {
   1.186 +    if (!mIsActive[i])
   1.187 +      printf("  inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
   1.188 +    else
   1.189 +      mProbers[i]->DumpStatus();
   1.190 +  }
   1.191 +  printf(" SBCS Group found best match [%s] confidence %f.\r\n",  
   1.192 +         mProbers[mBestGuess]->GetCharSetName(), cf);
   1.193 +}
   1.194 +#endif

mercurial