extensions/universalchardet/src/base/nsSBCSGroupProber.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include <stdio.h>
     7 #include "prmem.h"
     9 #include "nsSBCharSetProber.h"
    10 #include "nsSBCSGroupProber.h"
    12 #include "nsHebrewProber.h"
    14 nsSBCSGroupProber::nsSBCSGroupProber()
    15 {
    16   mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model);
    17   mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel);
    18   mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model);
    19   mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel);
    20   mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model);
    21   mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model);
    22   mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model);
    23   mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model);
    24   mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
    25   mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
    26   mProbers[10] = new nsSingleByteCharSetProber(&TIS620ThaiModel);
    28   nsHebrewProber *hebprober = new nsHebrewProber();
    29   // Notice: Any change in these indexes - 10,11,12 must be reflected
    30   // in the code below as well.
    31   mProbers[11] = hebprober;
    32   mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, false, hebprober); // Logical Hebrew
    33   mProbers[13] = new nsSingleByteCharSetProber(&Win1255Model, true, hebprober); // Visual Hebrew
    34   // Tell the Hebrew prober about the logical and visual probers
    35   if (mProbers[11] && mProbers[12] && mProbers[13]) // all are not null
    36   {
    37     hebprober->SetModelProbers(mProbers[12], mProbers[13]);
    38   }
    39   else // One or more is null. avoid any Hebrew probing, null them all
    40   {
    41     for (uint32_t i = 11; i <= 13; ++i)
    42     { 
    43       delete mProbers[i]; 
    44       mProbers[i] = 0; 
    45     }
    46   }
    48   // disable latin2 before latin1 is available, otherwise all latin1 
    49   // will be detected as latin2 because of their similarity.
    50   //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel);
    51   //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel);
    53   Reset();
    54 }
    56 nsSBCSGroupProber::~nsSBCSGroupProber()
    57 {
    58   for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++)
    59   {
    60     delete mProbers[i];
    61   }
    62 }
    65 const char* nsSBCSGroupProber::GetCharSetName()
    66 {
    67   //if we have no answer yet
    68   if (mBestGuess == -1)
    69   {
    70     GetConfidence();
    71     //no charset seems positive
    72     if (mBestGuess == -1)
    73       //we will use default.
    74       mBestGuess = 0;
    75   }
    76   return mProbers[mBestGuess]->GetCharSetName();
    77 }
    79 void  nsSBCSGroupProber::Reset(void)
    80 {
    81   mActiveNum = 0;
    82   for (uint32_t i = 0; i < NUM_OF_SBCS_PROBERS; i++)
    83   {
    84     if (mProbers[i]) // not null
    85     {
    86       mProbers[i]->Reset();
    87       mIsActive[i] = true;
    88       ++mActiveNum;
    89     }
    90     else
    91       mIsActive[i] = false;
    92   }
    93   mBestGuess = -1;
    94   mState = eDetecting;
    95 }
    98 nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen)
    99 {
   100   nsProbingState st;
   101   uint32_t i;
   102   char *newBuf1 = 0;
   103   uint32_t newLen1 = 0;
   105   //apply filter to original buffer, and we got new buffer back
   106   //depend on what script it is, we will feed them the new buffer 
   107   //we got after applying proper filter
   108   //this is done without any consideration to KeepEnglishLetters
   109   //of each prober since as of now, there are no probers here which
   110   //recognize languages with English characters.
   111   if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1))
   112     goto done;
   114   if (newLen1 == 0)
   115     goto done; // Nothing to see here, move on.
   117   for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
   118   {
   119      if (!mIsActive[i])
   120        continue;
   121      st = mProbers[i]->HandleData(newBuf1, newLen1);
   122      if (st == eFoundIt)
   123      {
   124        mBestGuess = i;
   125        mState = eFoundIt;
   126        break;
   127      }
   128      else if (st == eNotMe)
   129      {
   130        mIsActive[i] = false;
   131        mActiveNum--;
   132        if (mActiveNum <= 0)
   133        {
   134          mState = eNotMe;
   135          break;
   136        }
   137      }
   138   }
   140 done:
   141   PR_FREEIF(newBuf1);
   143   return mState;
   144 }
   146 float nsSBCSGroupProber::GetConfidence(void)
   147 {
   148   uint32_t i;
   149   float bestConf = 0.0, cf;
   151   switch (mState)
   152   {
   153   case eFoundIt:
   154     return (float)0.99; //sure yes
   155   case eNotMe:
   156     return (float)0.01;  //sure no
   157   default:
   158     for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
   159     {
   160       if (!mIsActive[i])
   161         continue;
   162       cf = mProbers[i]->GetConfidence();
   163       if (bestConf < cf)
   164       {
   165         bestConf = cf;
   166         mBestGuess = i;
   167       }
   168     }
   169   }
   170   return bestConf;
   171 }
   173 #ifdef DEBUG_chardet
   174 void nsSBCSGroupProber::DumpStatus()
   175 {
   176   uint32_t i;
   177   float cf;
   179   cf = GetConfidence();
   180   printf(" SBCS Group Prober --------begin status \r\n");
   181   for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
   182   {
   183     if (!mIsActive[i])
   184       printf("  inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
   185     else
   186       mProbers[i]->DumpStatus();
   187   }
   188   printf(" SBCS Group found best match [%s] confidence %f.\r\n",  
   189          mProbers[mBestGuess]->GetCharSetName(), cf);
   190 }
   191 #endif

mercurial