extensions/universalchardet/src/base/nsSBCharSetProber.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsSBCharSetProber.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,93 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +#include <stdio.h>
     1.9 +#include "nsSBCharSetProber.h"
    1.10 +
    1.11 +nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, uint32_t aLen)
    1.12 +{
    1.13 +  unsigned char order;
    1.14 +
    1.15 +  for (uint32_t i = 0; i < aLen; i++)
    1.16 +  {
    1.17 +    order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
    1.18 +
    1.19 +    if (order < SYMBOL_CAT_ORDER)
    1.20 +      mTotalChar++;
    1.21 +    if (order < SAMPLE_SIZE)
    1.22 +    {
    1.23 +        mFreqChar++;
    1.24 +
    1.25 +      if (mLastOrder < SAMPLE_SIZE)
    1.26 +      {
    1.27 +        mTotalSeqs++;
    1.28 +        if (!mReversed)
    1.29 +          ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
    1.30 +        else // reverse the order of the letters in the lookup
    1.31 +          ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]);
    1.32 +      }
    1.33 +    }
    1.34 +    mLastOrder = order;
    1.35 +  }
    1.36 +
    1.37 +  if (mState == eDetecting)
    1.38 +    if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
    1.39 +    {
    1.40 +      float cf = GetConfidence();
    1.41 +      if (cf > POSITIVE_SHORTCUT_THRESHOLD)
    1.42 +        mState = eFoundIt;
    1.43 +      else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
    1.44 +        mState = eNotMe;
    1.45 +    }
    1.46 +
    1.47 +  return mState;
    1.48 +}
    1.49 +
    1.50 +void  nsSingleByteCharSetProber::Reset(void)
    1.51 +{
    1.52 +  mState = eDetecting;
    1.53 +  mLastOrder = 255;
    1.54 +  for (uint32_t i = 0; i < NUMBER_OF_SEQ_CAT; i++)
    1.55 +    mSeqCounters[i] = 0;
    1.56 +  mTotalSeqs = 0;
    1.57 +  mTotalChar = 0;
    1.58 +  mFreqChar = 0;
    1.59 +}
    1.60 +
    1.61 +//#define NEGATIVE_APPROACH 1
    1.62 +
    1.63 +float nsSingleByteCharSetProber::GetConfidence(void)
    1.64 +{
    1.65 +#ifdef NEGATIVE_APPROACH
    1.66 +  if (mTotalSeqs > 0)
    1.67 +    if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
    1.68 +      return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
    1.69 +  return (float)0.01;
    1.70 +#else  //POSITIVE_APPROACH
    1.71 +  float r;
    1.72 +
    1.73 +  if (mTotalSeqs > 0) {
    1.74 +    r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
    1.75 +    r = r*mFreqChar/mTotalChar;
    1.76 +    if (r >= (float)1.00)
    1.77 +      r = (float)0.99;
    1.78 +    return r;
    1.79 +  }
    1.80 +  return (float)0.01;
    1.81 +#endif
    1.82 +}
    1.83 +
    1.84 +const char* nsSingleByteCharSetProber::GetCharSetName() 
    1.85 +{
    1.86 +  if (!mNameProber)
    1.87 +    return mModel->charsetName;
    1.88 +  return mNameProber->GetCharSetName();
    1.89 +}
    1.90 +
    1.91 +#ifdef DEBUG_chardet
    1.92 +void nsSingleByteCharSetProber::DumpStatus()
    1.93 +{
    1.94 +  printf("  SBCS: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
    1.95 +}
    1.96 +#endif

mercurial