extensions/universalchardet/src/base/nsSBCharSetProber.cpp

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:22a0ae4ebe40
1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 #include <stdio.h>
6 #include "nsSBCharSetProber.h"
7
8 nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, uint32_t aLen)
9 {
10 unsigned char order;
11
12 for (uint32_t i = 0; i < aLen; i++)
13 {
14 order = mModel->charToOrderMap[(unsigned char)aBuf[i]];
15
16 if (order < SYMBOL_CAT_ORDER)
17 mTotalChar++;
18 if (order < SAMPLE_SIZE)
19 {
20 mFreqChar++;
21
22 if (mLastOrder < SAMPLE_SIZE)
23 {
24 mTotalSeqs++;
25 if (!mReversed)
26 ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]);
27 else // reverse the order of the letters in the lookup
28 ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]);
29 }
30 }
31 mLastOrder = order;
32 }
33
34 if (mState == eDetecting)
35 if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD)
36 {
37 float cf = GetConfidence();
38 if (cf > POSITIVE_SHORTCUT_THRESHOLD)
39 mState = eFoundIt;
40 else if (cf < NEGATIVE_SHORTCUT_THRESHOLD)
41 mState = eNotMe;
42 }
43
44 return mState;
45 }
46
47 void nsSingleByteCharSetProber::Reset(void)
48 {
49 mState = eDetecting;
50 mLastOrder = 255;
51 for (uint32_t i = 0; i < NUMBER_OF_SEQ_CAT; i++)
52 mSeqCounters[i] = 0;
53 mTotalSeqs = 0;
54 mTotalChar = 0;
55 mFreqChar = 0;
56 }
57
58 //#define NEGATIVE_APPROACH 1
59
60 float nsSingleByteCharSetProber::GetConfidence(void)
61 {
62 #ifdef NEGATIVE_APPROACH
63 if (mTotalSeqs > 0)
64 if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 )
65 return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar;
66 return (float)0.01;
67 #else //POSITIVE_APPROACH
68 float r;
69
70 if (mTotalSeqs > 0) {
71 r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio;
72 r = r*mFreqChar/mTotalChar;
73 if (r >= (float)1.00)
74 r = (float)0.99;
75 return r;
76 }
77 return (float)0.01;
78 #endif
79 }
80
81 const char* nsSingleByteCharSetProber::GetCharSetName()
82 {
83 if (!mNameProber)
84 return mModel->charsetName;
85 return mNameProber->GetCharSetName();
86 }
87
#ifdef DEBUG_chardet
/**
 * Debug-only helper: print this prober's current confidence and charset
 * name to standard output.
 */
void nsSingleByteCharSetProber::DumpStatus()
{
  printf(" SBCS: %1.3f [%s]\r\n",
         GetConfidence(),
         GetCharSetName());
}
#endif

mercurial