extensions/universalchardet/src/base/nsMBCSGroupProber.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 #include <stdio.h>
     7 #include "nsMBCSGroupProber.h"
     8 #include "nsUniversalDetector.h"
    10 #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
       // Display names used only by the debug helpers in this file (DumpStatus and
       // GetDetectorState). Entry i is printed alongside mProbers[i], so the order
       // here must stay in step with the slot assignments made in the constructor.
    11 const char *ProberName[] = 
    12 {
    13   "UTF8",
    14   "SJIS",
    15   "EUCJP",
    16   "GB18030",
    17   "EUCKR",
    18   "Big5",
    19   "EUCTW",
    20 };
    22 #endif
    24 nsMBCSGroupProber::nsMBCSGroupProber(uint32_t aLanguageFilter)
    25 {
    26   for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
    27     mProbers[i] = nullptr;
    29   mProbers[0] = new nsUTF8Prober();
    30   if (aLanguageFilter & NS_FILTER_JAPANESE) 
    31   {
    32     mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
    33     mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
    34   }
    35   if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
    36     mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
    37   if (aLanguageFilter & NS_FILTER_KOREAN)
    38     mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
    39   if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) 
    40   {
    41     mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
    42     mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
    43   }
    44   Reset();
    45 }
    47 nsMBCSGroupProber::~nsMBCSGroupProber()
    48 {
    49   for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
    50   {
    51     delete mProbers[i];
    52   }
    53 }
    55 const char* nsMBCSGroupProber::GetCharSetName()
    56 {
    57   if (mBestGuess == -1)
    58   {
    59     GetConfidence();
    60     if (mBestGuess == -1)
    61       mBestGuess = 0;
    62   }
    63   return mProbers[mBestGuess]->GetCharSetName();
    64 }
    66 void  nsMBCSGroupProber::Reset(void)
    67 {
    68   mActiveNum = 0;
    69   for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
    70   {
    71     if (mProbers[i])
    72     {
    73       mProbers[i]->Reset();
    74       mIsActive[i] = true;
    75       ++mActiveNum;
    76     }
    77     else
    78       mIsActive[i] = false;
    79   }
    80   mBestGuess = -1;
    81   mState = eDetecting;
    82   mKeepNext = 0;
    83 }
    85 nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen)
    86 {
    87   nsProbingState st;
    88   uint32_t start = 0;
    89   uint32_t keepNext = mKeepNext;
    91   //do filtering to reduce load to probers
    92   for (uint32_t pos = 0; pos < aLen; ++pos)
    93   {
    94     if (aBuf[pos] & 0x80)
    95     {
    96       if (!keepNext)
    97         start = pos;
    98       keepNext = 2;
    99     }
   100     else if (keepNext)
   101     {
   102       if (--keepNext == 0)
   103       {
   104         for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
   105         {
   106           if (!mIsActive[i])
   107             continue;
   108           st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
   109           if (st == eFoundIt)
   110           {
   111             mBestGuess = i;
   112             mState = eFoundIt;
   113             return mState;
   114           }
   115         }
   116       }
   117     }
   118   }
   120   if (keepNext) {
   121     for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
   122     {
   123       if (!mIsActive[i])
   124         continue;
   125       st = mProbers[i]->HandleData(aBuf + start, aLen - start);
   126       if (st == eFoundIt)
   127       {
   128         mBestGuess = i;
   129         mState = eFoundIt;
   130         return mState;
   131       }
   132     }
   133   }
   134   mKeepNext = keepNext;
   136   return mState;
   137 }
   139 float nsMBCSGroupProber::GetConfidence(void)
   140 {
   141   uint32_t i;
   142   float bestConf = 0.0, cf;
   144   switch (mState)
   145   {
   146   case eFoundIt:
   147     return (float)0.99;
   148   case eNotMe:
   149     return (float)0.01;
   150   default:
   151     for (i = 0; i < NUM_OF_PROBERS; i++)
   152     {
   153       if (!mIsActive[i])
   154         continue;
   155       cf = mProbers[i]->GetConfidence();
   156       if (bestConf < cf)
   157       {
   158         bestConf = cf;
   159         mBestGuess = i;
   160       }
   161     }
   162   }
   163   return bestConf;
   164 }
   #ifdef DEBUG_chardet
   /**
    * Debug helper: prints one line per prober slot to stdout.
    *
    * GetConfidence() is called first (which may update the best guess). Active
    * slots are then printed with their current confidence, inactive slots with
    * a fixed "inactive" message. Names come from ProberName.
    */
   void nsMBCSGroupProber::DumpStatus()
   {
     GetConfidence();

     for (uint32_t slot = 0; slot < NUM_OF_PROBERS; ++slot)
     {
       if (mIsActive[slot])
       {
         const float conf = mProbers[slot]->GetConfidence();
         printf("  MBCS %1.3f: [%s]\r\n", conf, ProberName[slot]);
         continue;
       }
       printf("  MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[slot]);
     }
   }
   #endif
   #ifdef DEBUG_jgmyers
   /**
    * Debug helper: writes one DetectorState entry per prober slot into states,
    * starting at index offset, and advances offset past the entries written.
    *
    * Each entry gets the slot's name from ProberName, its active flag, and its
    * current confidence (0.0 for inactive slots).
    *
    * @param states array receiving the entries.
    * @param offset in/out index of the next free entry in states.
    */
   void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], uint32_t &offset)
   {
     for (uint32_t slot = 0; slot < NUM_OF_PROBERS; ++slot, ++offset)
     {
       nsUniversalDetector::DetectorState &entry = states[offset];
       entry.name = ProberName[slot];
       entry.isActive = mIsActive[slot];
       if (mIsActive[slot])
         entry.confidence = mProbers[slot]->GetConfidence();
       else
         entry.confidence = 0.0;
     }
   }
   #endif /* DEBUG_jgmyers */

mercurial