extensions/universalchardet/src/base/nsEUCJPProber.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsEUCJPProber.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,69 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +// For Japanese encodings, observe these characteristics:
    1.10 +// 1, kana characters (or hankaku?) often have a high frequency of appearance
    1.11 +// 2, kana characters often exist in groups
    1.12 +// 3, certain combinations of kana are never used in the Japanese language
    1.13 +
    1.14 +#include "nsEUCJPProber.h"
    1.15 +#include "nsDebug.h"
    1.16 +
    1.17 +void  nsEUCJPProber::Reset(void)
    1.18 +{
    1.19 +  mCodingSM->Reset(); 
    1.20 +  mState = eDetecting;
    1.21 +  mContextAnalyser.Reset(mIsPreferredLanguage);
    1.22 +  mDistributionAnalyser.Reset(mIsPreferredLanguage);
    1.23 +}
    1.24 +
    1.25 +nsProbingState nsEUCJPProber::HandleData(const char* aBuf, uint32_t aLen)
    1.26 +{
    1.27 +  NS_ASSERTION(aLen, "HandleData called with empty buffer");
    1.28 +  nsSMState codingState;
    1.29 +
    1.30 +  for (uint32_t i = 0; i < aLen; i++)
    1.31 +  {
    1.32 +    codingState = mCodingSM->NextState(aBuf[i]);
    1.33 +    if (codingState == eItsMe)
    1.34 +    {
    1.35 +      mState = eFoundIt;
    1.36 +      break;
    1.37 +    }
    1.38 +    if (codingState == eStart)
    1.39 +    {
    1.40 +      uint32_t charLen = mCodingSM->GetCurrentCharLen();
    1.41 +
    1.42 +      if (i == 0)
    1.43 +      {
    1.44 +        mLastChar[1] = aBuf[0];
    1.45 +        mContextAnalyser.HandleOneChar(mLastChar, charLen);
    1.46 +        mDistributionAnalyser.HandleOneChar(mLastChar, charLen);
    1.47 +      }
    1.48 +      else
    1.49 +      {
    1.50 +        mContextAnalyser.HandleOneChar(aBuf+i-1, charLen);
    1.51 +        mDistributionAnalyser.HandleOneChar(aBuf+i-1, charLen);
    1.52 +      }
    1.53 +    }
    1.54 +  }
    1.55 +
    1.56 +  mLastChar[0] = aBuf[aLen-1];
    1.57 +
    1.58 +  if (mState == eDetecting)
    1.59 +    if (mContextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
    1.60 +      mState = eFoundIt;
    1.61 +
    1.62 +  return mState;
    1.63 +}
    1.64 +
    1.65 +float nsEUCJPProber::GetConfidence(void)
    1.66 +{
    1.67 +  float contxtCf = mContextAnalyser.GetConfidence();
    1.68 +  float distribCf = mDistributionAnalyser.GetConfidence();
    1.69 +
    1.70 +  return (contxtCf > distribCf ? contxtCf : distribCf);
    1.71 +}
    1.72 +

mercurial