Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | #include <stdio.h> |
michael@0 | 6 | #include "nsSBCharSetProber.h" |
michael@0 | 7 | |
michael@0 | 8 | nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, uint32_t aLen) |
michael@0 | 9 | { |
michael@0 | 10 | unsigned char order; |
michael@0 | 11 | |
michael@0 | 12 | for (uint32_t i = 0; i < aLen; i++) |
michael@0 | 13 | { |
michael@0 | 14 | order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; |
michael@0 | 15 | |
michael@0 | 16 | if (order < SYMBOL_CAT_ORDER) |
michael@0 | 17 | mTotalChar++; |
michael@0 | 18 | if (order < SAMPLE_SIZE) |
michael@0 | 19 | { |
michael@0 | 20 | mFreqChar++; |
michael@0 | 21 | |
michael@0 | 22 | if (mLastOrder < SAMPLE_SIZE) |
michael@0 | 23 | { |
michael@0 | 24 | mTotalSeqs++; |
michael@0 | 25 | if (!mReversed) |
michael@0 | 26 | ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); |
michael@0 | 27 | else // reverse the order of the letters in the lookup |
michael@0 | 28 | ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]); |
michael@0 | 29 | } |
michael@0 | 30 | } |
michael@0 | 31 | mLastOrder = order; |
michael@0 | 32 | } |
michael@0 | 33 | |
michael@0 | 34 | if (mState == eDetecting) |
michael@0 | 35 | if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) |
michael@0 | 36 | { |
michael@0 | 37 | float cf = GetConfidence(); |
michael@0 | 38 | if (cf > POSITIVE_SHORTCUT_THRESHOLD) |
michael@0 | 39 | mState = eFoundIt; |
michael@0 | 40 | else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) |
michael@0 | 41 | mState = eNotMe; |
michael@0 | 42 | } |
michael@0 | 43 | |
michael@0 | 44 | return mState; |
michael@0 | 45 | } |
michael@0 | 46 | |
michael@0 | 47 | void nsSingleByteCharSetProber::Reset(void) |
michael@0 | 48 | { |
michael@0 | 49 | mState = eDetecting; |
michael@0 | 50 | mLastOrder = 255; |
michael@0 | 51 | for (uint32_t i = 0; i < NUMBER_OF_SEQ_CAT; i++) |
michael@0 | 52 | mSeqCounters[i] = 0; |
michael@0 | 53 | mTotalSeqs = 0; |
michael@0 | 54 | mTotalChar = 0; |
michael@0 | 55 | mFreqChar = 0; |
michael@0 | 56 | } |
michael@0 | 57 | |
michael@0 | 58 | //#define NEGATIVE_APPROACH 1 |
michael@0 | 59 | |
michael@0 | 60 | float nsSingleByteCharSetProber::GetConfidence(void) |
michael@0 | 61 | { |
michael@0 | 62 | #ifdef NEGATIVE_APPROACH |
michael@0 | 63 | if (mTotalSeqs > 0) |
michael@0 | 64 | if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 ) |
michael@0 | 65 | return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar; |
michael@0 | 66 | return (float)0.01; |
michael@0 | 67 | #else //POSITIVE_APPROACH |
michael@0 | 68 | float r; |
michael@0 | 69 | |
michael@0 | 70 | if (mTotalSeqs > 0) { |
michael@0 | 71 | r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; |
michael@0 | 72 | r = r*mFreqChar/mTotalChar; |
michael@0 | 73 | if (r >= (float)1.00) |
michael@0 | 74 | r = (float)0.99; |
michael@0 | 75 | return r; |
michael@0 | 76 | } |
michael@0 | 77 | return (float)0.01; |
michael@0 | 78 | #endif |
michael@0 | 79 | } |
michael@0 | 80 | |
michael@0 | 81 | const char* nsSingleByteCharSetProber::GetCharSetName() |
michael@0 | 82 | { |
michael@0 | 83 | if (!mNameProber) |
michael@0 | 84 | return mModel->charsetName; |
michael@0 | 85 | return mNameProber->GetCharSetName(); |
michael@0 | 86 | } |
michael@0 | 87 | |
#ifdef DEBUG_chardet
// Debug-only helper: prints the current confidence and charset name to
// stdout. Compiled in only when DEBUG_chardet is defined.
void nsSingleByteCharSetProber::DumpStatus()
{
  const float confidence = GetConfidence();
  const char* const charsetName = GetCharSetName();
  printf(" SBCS: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif