|
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
5 #include <stdio.h> |
|
6 #include "nsSBCharSetProber.h" |
|
7 |
|
8 nsProbingState nsSingleByteCharSetProber::HandleData(const char* aBuf, uint32_t aLen) |
|
9 { |
|
10 unsigned char order; |
|
11 |
|
12 for (uint32_t i = 0; i < aLen; i++) |
|
13 { |
|
14 order = mModel->charToOrderMap[(unsigned char)aBuf[i]]; |
|
15 |
|
16 if (order < SYMBOL_CAT_ORDER) |
|
17 mTotalChar++; |
|
18 if (order < SAMPLE_SIZE) |
|
19 { |
|
20 mFreqChar++; |
|
21 |
|
22 if (mLastOrder < SAMPLE_SIZE) |
|
23 { |
|
24 mTotalSeqs++; |
|
25 if (!mReversed) |
|
26 ++(mSeqCounters[mModel->precedenceMatrix[mLastOrder*SAMPLE_SIZE+order]]); |
|
27 else // reverse the order of the letters in the lookup |
|
28 ++(mSeqCounters[mModel->precedenceMatrix[order*SAMPLE_SIZE+mLastOrder]]); |
|
29 } |
|
30 } |
|
31 mLastOrder = order; |
|
32 } |
|
33 |
|
34 if (mState == eDetecting) |
|
35 if (mTotalSeqs > SB_ENOUGH_REL_THRESHOLD) |
|
36 { |
|
37 float cf = GetConfidence(); |
|
38 if (cf > POSITIVE_SHORTCUT_THRESHOLD) |
|
39 mState = eFoundIt; |
|
40 else if (cf < NEGATIVE_SHORTCUT_THRESHOLD) |
|
41 mState = eNotMe; |
|
42 } |
|
43 |
|
44 return mState; |
|
45 } |
|
46 |
|
47 void nsSingleByteCharSetProber::Reset(void) |
|
48 { |
|
49 mState = eDetecting; |
|
50 mLastOrder = 255; |
|
51 for (uint32_t i = 0; i < NUMBER_OF_SEQ_CAT; i++) |
|
52 mSeqCounters[i] = 0; |
|
53 mTotalSeqs = 0; |
|
54 mTotalChar = 0; |
|
55 mFreqChar = 0; |
|
56 } |
|
57 |
|
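// Uncomment to compile GetConfidence() with its NEGATIVE_APPROACH branch
// instead of the default positive-sequence formula.
//#define NEGATIVE_APPROACH 1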
|
59 |
|
60 float nsSingleByteCharSetProber::GetConfidence(void) |
|
61 { |
|
62 #ifdef NEGATIVE_APPROACH |
|
63 if (mTotalSeqs > 0) |
|
64 if (mTotalSeqs > mSeqCounters[NEGATIVE_CAT]*10 ) |
|
65 return ((float)(mTotalSeqs - mSeqCounters[NEGATIVE_CAT]*10))/mTotalSeqs * mFreqChar / mTotalChar; |
|
66 return (float)0.01; |
|
67 #else //POSITIVE_APPROACH |
|
68 float r; |
|
69 |
|
70 if (mTotalSeqs > 0) { |
|
71 r = ((float)1.0) * mSeqCounters[POSITIVE_CAT] / mTotalSeqs / mModel->mTypicalPositiveRatio; |
|
72 r = r*mFreqChar/mTotalChar; |
|
73 if (r >= (float)1.00) |
|
74 r = (float)0.99; |
|
75 return r; |
|
76 } |
|
77 return (float)0.01; |
|
78 #endif |
|
79 } |
|
80 |
|
81 const char* nsSingleByteCharSetProber::GetCharSetName() |
|
82 { |
|
83 if (!mNameProber) |
|
84 return mModel->charsetName; |
|
85 return mNameProber->GetCharSetName(); |
|
86 } |
|
87 |
|
#ifdef DEBUG_chardet
// Debug-only helper: prints this prober's current confidence and charset
// name to stdout.
void nsSingleByteCharSetProber::DumpStatus()
{
  const float confidence = GetConfidence();
  const char* const charsetName = GetCharSetName();

  printf(" SBCS: %1.3f [%s]\r\n", confidence, charsetName);
}
#endif