extensions/universalchardet/src/base/nsMBCSGroupProber.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsMBCSGroupProber.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,196 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +#include <stdio.h>
     1.9 +
    1.10 +#include "nsMBCSGroupProber.h"
    1.11 +#include "nsUniversalDetector.h"
    1.12 +
    1.13 +#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
    1.14 +const char *ProberName[] = 
    1.15 +{
    1.16 +  "UTF8",
    1.17 +  "SJIS",
    1.18 +  "EUCJP",
    1.19 +  "GB18030",
    1.20 +  "EUCKR",
    1.21 +  "Big5",
    1.22 +  "EUCTW",
    1.23 +};
    1.24 +
    1.25 +#endif
    1.26 +
    1.27 +nsMBCSGroupProber::nsMBCSGroupProber(uint32_t aLanguageFilter)
    1.28 +{
    1.29 +  for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
    1.30 +    mProbers[i] = nullptr;
    1.31 +
    1.32 +  mProbers[0] = new nsUTF8Prober();
    1.33 +  if (aLanguageFilter & NS_FILTER_JAPANESE) 
    1.34 +  {
    1.35 +    mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
    1.36 +    mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
    1.37 +  }
    1.38 +  if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
    1.39 +    mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
    1.40 +  if (aLanguageFilter & NS_FILTER_KOREAN)
    1.41 +    mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
    1.42 +  if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) 
    1.43 +  {
    1.44 +    mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
    1.45 +    mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
    1.46 +  }
    1.47 +  Reset();
    1.48 +}
    1.49 +
    1.50 +nsMBCSGroupProber::~nsMBCSGroupProber()
    1.51 +{
    1.52 +  for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
    1.53 +  {
    1.54 +    delete mProbers[i];
    1.55 +  }
    1.56 +}
    1.57 +
    1.58 +const char* nsMBCSGroupProber::GetCharSetName()
    1.59 +{
    1.60 +  if (mBestGuess == -1)
    1.61 +  {
    1.62 +    GetConfidence();
    1.63 +    if (mBestGuess == -1)
    1.64 +      mBestGuess = 0;
    1.65 +  }
    1.66 +  return mProbers[mBestGuess]->GetCharSetName();
    1.67 +}
    1.68 +
    1.69 +void  nsMBCSGroupProber::Reset(void)
    1.70 +{
    1.71 +  mActiveNum = 0;
    1.72 +  for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
    1.73 +  {
    1.74 +    if (mProbers[i])
    1.75 +    {
    1.76 +      mProbers[i]->Reset();
    1.77 +      mIsActive[i] = true;
    1.78 +      ++mActiveNum;
    1.79 +    }
    1.80 +    else
    1.81 +      mIsActive[i] = false;
    1.82 +  }
    1.83 +  mBestGuess = -1;
    1.84 +  mState = eDetecting;
    1.85 +  mKeepNext = 0;
    1.86 +}
    1.87 +
    1.88 +nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, uint32_t aLen)
    1.89 +{
    1.90 +  nsProbingState st;
    1.91 +  uint32_t start = 0;
    1.92 +  uint32_t keepNext = mKeepNext;
    1.93 +
    1.94 +  //do filtering to reduce load to probers
    1.95 +  for (uint32_t pos = 0; pos < aLen; ++pos)
    1.96 +  {
    1.97 +    if (aBuf[pos] & 0x80)
    1.98 +    {
    1.99 +      if (!keepNext)
   1.100 +        start = pos;
   1.101 +      keepNext = 2;
   1.102 +    }
   1.103 +    else if (keepNext)
   1.104 +    {
   1.105 +      if (--keepNext == 0)
   1.106 +      {
   1.107 +        for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
   1.108 +        {
   1.109 +          if (!mIsActive[i])
   1.110 +            continue;
   1.111 +          st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
   1.112 +          if (st == eFoundIt)
   1.113 +          {
   1.114 +            mBestGuess = i;
   1.115 +            mState = eFoundIt;
   1.116 +            return mState;
   1.117 +          }
   1.118 +        }
   1.119 +      }
   1.120 +    }
   1.121 +  }
   1.122 +
   1.123 +  if (keepNext) {
   1.124 +    for (uint32_t i = 0; i < NUM_OF_PROBERS; i++)
   1.125 +    {
   1.126 +      if (!mIsActive[i])
   1.127 +        continue;
   1.128 +      st = mProbers[i]->HandleData(aBuf + start, aLen - start);
   1.129 +      if (st == eFoundIt)
   1.130 +      {
   1.131 +        mBestGuess = i;
   1.132 +        mState = eFoundIt;
   1.133 +        return mState;
   1.134 +      }
   1.135 +    }
   1.136 +  }
   1.137 +  mKeepNext = keepNext;
   1.138 +
   1.139 +  return mState;
   1.140 +}
   1.141 +
   1.142 +float nsMBCSGroupProber::GetConfidence(void)
   1.143 +{
   1.144 +  uint32_t i;
   1.145 +  float bestConf = 0.0, cf;
   1.146 +
   1.147 +  switch (mState)
   1.148 +  {
   1.149 +  case eFoundIt:
   1.150 +    return (float)0.99;
   1.151 +  case eNotMe:
   1.152 +    return (float)0.01;
   1.153 +  default:
   1.154 +    for (i = 0; i < NUM_OF_PROBERS; i++)
   1.155 +    {
   1.156 +      if (!mIsActive[i])
   1.157 +        continue;
   1.158 +      cf = mProbers[i]->GetConfidence();
   1.159 +      if (bestConf < cf)
   1.160 +      {
   1.161 +        bestConf = cf;
   1.162 +        mBestGuess = i;
   1.163 +      }
   1.164 +    }
   1.165 +  }
   1.166 +  return bestConf;
   1.167 +}
   1.168 +
   1.169 +#ifdef DEBUG_chardet
   1.170 +void nsMBCSGroupProber::DumpStatus()
   1.171 +{
   1.172 +  uint32_t i;
   1.173 +  float cf;
   1.174 +  
   1.175 +  GetConfidence();
   1.176 +  for (i = 0; i < NUM_OF_PROBERS; i++)
   1.177 +  {
   1.178 +    if (!mIsActive[i])
   1.179 +      printf("  MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]);
   1.180 +    else
   1.181 +    {
   1.182 +      cf = mProbers[i]->GetConfidence();
   1.183 +      printf("  MBCS %1.3f: [%s]\r\n", cf, ProberName[i]);
   1.184 +    }
   1.185 +  }
   1.186 +}
   1.187 +#endif
   1.188 +
   1.189 +#ifdef DEBUG_jgmyers
   1.190 +void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], uint32_t &offset)
   1.191 +{
   1.192 +  for (uint32_t i = 0; i < NUM_OF_PROBERS; ++i) {
   1.193 +    states[offset].name = ProberName[i];
   1.194 +    states[offset].isActive = mIsActive[i];
   1.195 +    states[offset].confidence = mIsActive[i] ? mProbers[i]->GetConfidence() : 0.0;
   1.196 +    ++offset;
   1.197 +  }
   1.198 +}
   1.199 +#endif /* DEBUG_jgmyers */

mercurial