extensions/universalchardet/src/base/nsUniversalDetector.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsUniversalDetector.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,250 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "nscore.h"
    1.10 +
    1.11 +#include "nsUniversalDetector.h"
    1.12 +
    1.13 +#include "nsMBCSGroupProber.h"
    1.14 +#include "nsSBCSGroupProber.h"
    1.15 +#include "nsEscCharsetProber.h"
    1.16 +#include "nsLatin1Prober.h"
    1.17 +
    1.18 +nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter)
    1.19 +{
    1.20 +  mDone = false;
    1.21 +  mBestGuess = -1;   //illegal value as signal
    1.22 +  mInTag = false;
    1.23 +  mEscCharSetProber = nullptr;
    1.24 +
    1.25 +  mStart = true;
    1.26 +  mDetectedCharset = nullptr;
    1.27 +  mGotData = false;
    1.28 +  mInputState = ePureAscii;
    1.29 +  mLastChar = '\0';
    1.30 +  mLanguageFilter = aLanguageFilter;
    1.31 +
    1.32 +  uint32_t i;
    1.33 +  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    1.34 +    mCharSetProbers[i] = nullptr;
    1.35 +}
    1.36 +
    1.37 +nsUniversalDetector::~nsUniversalDetector() 
    1.38 +{
    1.39 +  for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    1.40 +    delete mCharSetProbers[i];
    1.41 +
    1.42 +  delete mEscCharSetProber;
    1.43 +}
    1.44 +
    1.45 +void 
    1.46 +nsUniversalDetector::Reset()
    1.47 +{
    1.48 +  mDone = false;
    1.49 +  mBestGuess = -1;   //illegal value as signal
    1.50 +  mInTag = false;
    1.51 +
    1.52 +  mStart = true;
    1.53 +  mDetectedCharset = nullptr;
    1.54 +  mGotData = false;
    1.55 +  mInputState = ePureAscii;
    1.56 +  mLastChar = '\0';
    1.57 +
    1.58 +  if (mEscCharSetProber)
    1.59 +    mEscCharSetProber->Reset();
    1.60 +
    1.61 +  uint32_t i;
    1.62 +  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    1.63 +    if (mCharSetProbers[i])
    1.64 +      mCharSetProbers[i]->Reset();
    1.65 +}
    1.66 +
    1.67 +//---------------------------------------------------------------------
    1.68 +#define SHORTCUT_THRESHOLD      (float)0.95
    1.69 +#define MINIMUM_THRESHOLD      (float)0.20
    1.70 +
    1.71 +nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen)
    1.72 +{
    1.73 +  if(mDone) 
    1.74 +    return NS_OK;
    1.75 +
    1.76 +  if (aLen > 0)
    1.77 +    mGotData = true;
    1.78 +
    1.79 +  //If the data starts with BOM, we know it is UTF
    1.80 +  if (mStart)
    1.81 +  {
    1.82 +    mStart = false;
    1.83 +    if (aLen >= 2) {
    1.84 +      switch (aBuf[0]) {
    1.85 +      case '\xEF':
    1.86 +        if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) {
    1.87 +          // EF BB BF  UTF-8 encoded BOM
    1.88 +          mDetectedCharset = "UTF-8";
    1.89 +        }
    1.90 +        break;
    1.91 +      case '\xFE':
    1.92 +        if ('\xFF' == aBuf[1]) {
    1.93 +          // FE FF  UTF-16, big endian BOM
    1.94 +          mDetectedCharset = "UTF-16BE";
    1.95 +        }
    1.96 +        break;
    1.97 +      case '\xFF':
    1.98 +        if ('\xFE' == aBuf[1]) {
    1.99 +          // FF FE  UTF-16, little endian BOM
   1.100 +          mDetectedCharset = "UTF-16LE";
   1.101 +        }
   1.102 +        break;
   1.103 +      }  // switch
   1.104 +    }
   1.105 +
   1.106 +    if (mDetectedCharset)
   1.107 +    {
   1.108 +      mDone = true;
   1.109 +      return NS_OK;
   1.110 +    }
   1.111 +  }
   1.112 +  
   1.113 +  uint32_t i;
   1.114 +  for (i = 0; i < aLen; i++)
   1.115 +  {
   1.116 +    //other than 0xa0, if every othe character is ascii, the page is ascii
   1.117 +    if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP 
   1.118 +    {
   1.119 +      //we got a non-ascii byte (high-byte)
   1.120 +      if (mInputState != eHighbyte)
   1.121 +      {
   1.122 +        //adjust state
   1.123 +        mInputState = eHighbyte;
   1.124 +
   1.125 +        //kill mEscCharSetProber if it is active
   1.126 +        if (mEscCharSetProber) {
   1.127 +          delete mEscCharSetProber;
   1.128 +          mEscCharSetProber = nullptr;
   1.129 +        }
   1.130 +
   1.131 +        //start multibyte and singlebyte charset prober
   1.132 +        if (nullptr == mCharSetProbers[0])
   1.133 +        {
   1.134 +          mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
   1.135 +          if (nullptr == mCharSetProbers[0])
   1.136 +            return NS_ERROR_OUT_OF_MEMORY;
   1.137 +        }
   1.138 +        if (nullptr == mCharSetProbers[1] &&
   1.139 +            (mLanguageFilter & NS_FILTER_NON_CJK))
   1.140 +        {
   1.141 +          mCharSetProbers[1] = new nsSBCSGroupProber;
   1.142 +          if (nullptr == mCharSetProbers[1])
   1.143 +            return NS_ERROR_OUT_OF_MEMORY;
   1.144 +        }
   1.145 +        if (nullptr == mCharSetProbers[2])
   1.146 +        {
   1.147 +          mCharSetProbers[2] = new nsLatin1Prober; 
   1.148 +          if (nullptr == mCharSetProbers[2])
   1.149 +            return NS_ERROR_OUT_OF_MEMORY;
   1.150 +        }
   1.151 +      }
   1.152 +    }
   1.153 +    else
   1.154 +    {
   1.155 +      //ok, just pure ascii so far
   1.156 +      if ( ePureAscii == mInputState &&
   1.157 +        (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
   1.158 +      {
   1.159 +        //found escape character or HZ "~{"
   1.160 +        mInputState = eEscAscii;
   1.161 +      }
   1.162 +      mLastChar = aBuf[i];
   1.163 +    }
   1.164 +  }
   1.165 +
   1.166 +  nsProbingState st;
   1.167 +  switch (mInputState)
   1.168 +  {
   1.169 +  case eEscAscii:
   1.170 +    if (nullptr == mEscCharSetProber) {
   1.171 +      mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
   1.172 +      if (nullptr == mEscCharSetProber)
   1.173 +        return NS_ERROR_OUT_OF_MEMORY;
   1.174 +    }
   1.175 +    st = mEscCharSetProber->HandleData(aBuf, aLen);
   1.176 +    if (st == eFoundIt)
   1.177 +    {
   1.178 +      mDone = true;
   1.179 +      mDetectedCharset = mEscCharSetProber->GetCharSetName();
   1.180 +    }
   1.181 +    break;
   1.182 +  case eHighbyte:
   1.183 +    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
   1.184 +    {
   1.185 +      if (mCharSetProbers[i])
   1.186 +      {
   1.187 +        st = mCharSetProbers[i]->HandleData(aBuf, aLen);
   1.188 +        if (st == eFoundIt) 
   1.189 +        {
   1.190 +          mDone = true;
   1.191 +          mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
   1.192 +          return NS_OK;
   1.193 +        }
   1.194 +      } 
   1.195 +    }
   1.196 +    break;
   1.197 +
   1.198 +  default:  //pure ascii
   1.199 +    ;//do nothing here
   1.200 +  }
   1.201 +  return NS_OK;
   1.202 +}
   1.203 +
   1.204 +
   1.205 +//---------------------------------------------------------------------
   1.206 +void nsUniversalDetector::DataEnd()
   1.207 +{
   1.208 +  if (!mGotData)
   1.209 +  {
   1.210 +    // we haven't got any data yet, return immediately 
   1.211 +    // caller program sometimes call DataEnd before anything has been sent to detector
   1.212 +    return;
   1.213 +  }
   1.214 +
   1.215 +  if (mDetectedCharset)
   1.216 +  {
   1.217 +    mDone = true;
   1.218 +    Report(mDetectedCharset);
   1.219 +    return;
   1.220 +  }
   1.221 +  
   1.222 +  switch (mInputState)
   1.223 +  {
   1.224 +  case eHighbyte:
   1.225 +    {
   1.226 +      float proberConfidence;
   1.227 +      float maxProberConfidence = (float)0.0;
   1.228 +      int32_t maxProber = 0;
   1.229 +
   1.230 +      for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
   1.231 +      {
   1.232 +        if (mCharSetProbers[i])
   1.233 +        {
   1.234 +          proberConfidence = mCharSetProbers[i]->GetConfidence();
   1.235 +          if (proberConfidence > maxProberConfidence)
   1.236 +          {
   1.237 +            maxProberConfidence = proberConfidence;
   1.238 +            maxProber = i;
   1.239 +          }
   1.240 +        }
   1.241 +      }
   1.242 +      //do not report anything because we are not confident of it, that's in fact a negative answer
   1.243 +      if (maxProberConfidence > MINIMUM_THRESHOLD)
   1.244 +        Report(mCharSetProbers[maxProber]->GetCharSetName());
   1.245 +    }
   1.246 +    break;
   1.247 +  case eEscAscii:
   1.248 +    break;
   1.249 +  default:
   1.250 +    ;
   1.251 +  }
   1.252 +  return;
   1.253 +}

mercurial