extensions/universalchardet/src/base/nsSBCharSetProber.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5 #include <stdio.h>
michael@0 6 #include "nsSBCharSetProber.h"
michael@0 7
michael@0 8 nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, uint32_t aLen)
michael@0 9 {
michael@0 10 unsigned char order;
michael@0 11
michael@0 12 for (uint32_t i = 0; i < aLen; i++)
michael@0 13 {
michael@0 14 order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
michael@0 15
michael@0 16 if (order < SYMBOL_CAT_ORDER)
michael@0 17 mTotalChar++;
michael@0 18 if (order < SAMPLE_SIZE)
michael@0 19 {
michael@0 20 mFreqChar++;
michael@0 21
michael@0 22 if (mLastOrder < SAMPLE_SIZE)
michael@0 23 {
michael@0 24 mTotalSeqs++;
michael@0 25 if (!mReversed)
michael@0 26 ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
michael@0 27 else // reverse the order of the letters in the lookup
michael@0 28 ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]);
michael@0 29 }
michael@0 30 }
michael@0 31 mLastOrder = order;
michael@0 32 }
michael@0 33
michael@0 34 if (mState == eDetecting)
michael@0 35 if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
michael@0 36 {
michael@0 37 float cf = GetConfidence();
michael@0 38 if (cf > POSITIVE_SHORTCUT_THRESHOLD)
michael@0 39 mState = eFoundIt;
michael@0 40 else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
michael@0 41 mState = eNotMe;
michael@0 42 }
michael@0 43
michael@0 44 return mState;
michael@0 45 }
michael@0 46
michael@0 47 void nsSingleByteCharSetProber::Reset(void)
michael@0 48 {
michael@0 49 mState = eDetecting;
michael@0 50 mLastOrder = 255;
michael@0 51 for (uint32_t i = 0; i < NUMBER_OF_SEQ_CAT; i++)
michael@0 52 mSeqCounters[i] = 0;
michael@0 53 mTotalSeqs = 0;
michael@0 54 mTotalChar = 0;
michael@0 55 mFreqChar = 0;
michael@0 56 }
michael@0 57
michael@0 58 //#define NEGATIVE_APPROACH 1
michael@0 59
michael@0 60 float nsSingleByteCharSetProber::GetConfidence(void)
michael@0 61 {
michael@0 62 #ifdef NEGATIVE_APPROACH
michael@0 63 if (mTotalSeqs > 0)
michael@0 64 if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
michael@0 65 return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
michael@0 66 return (float)0.01;
michael@0 67 #else //POSITIVE_APPROACH
michael@0 68 float r;
michael@0 69
michael@0 70 if (mTotalSeqs > 0) {
michael@0 71 r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
michael@0 72 r = r*mFreqChar/mTotalChar;
michael@0 73 if (r >= (float)1.00)
michael@0 74 r = (float)0.99;
michael@0 75 return r;
michael@0 76 }
michael@0 77 return (float)0.01;
michael@0 78 #endif
michael@0 79 }
michael@0 80
michael@0 81 const char* nsSingleByteCharSetProber::GetCharSetName()
michael@0 82 {
michael@0 83 if (!mNameProber)
michael@0 84 return mModel->charsetName;
michael@0 85 return mNameProber->GetCharSetName();
michael@0 86 }
michael@0 87
#ifdef DEBUG_chardet
// Debug helper (compiled only when DEBUG_chardet is defined): prints the
// current confidence and charset name to stdout.
void nsSingleByteCharSetProber::DumpStatus()
{
  const float confidence = GetConfidence();
  const char* const charsetName = GetCharSetName();
  printf(" SBCS: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif

mercurial