extensions/universalchardet/src/base/nsSBCharSetProber.h

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:65e8c20731de
1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 #ifndef nsSingleByteCharSetProber_h__
6 #define nsSingleByteCharSetProber_h__
7
8 #include "nsCharSetProber.h"
9
// Tunables for the single-byte charset prober. Most consumers of these
// values live outside this header, so notes about their use are hedged.

// Side length of the square precedence matrix held by SequenceModel.
#define SAMPLE_SIZE 64
// NOTE(review): presumably the sequence count after which the shortcut
// thresholds below are consulted -- confirm against the implementation file.
#define SB_ENOUGH_REL_THRESHOLD 1024
// Confidence cut-offs. The whole expansion is parenthesized so the cast binds
// as one unit wherever the macro is pasted (e.g. as the operand of sizeof).
#define POSITIVE_SHORTCUT_THRESHOLD ((float)0.95)
#define NEGATIVE_SHORTCUT_THRESHOLD ((float)0.05)
// NOTE(review): looks like the character-order value from which entries are
// treated as symbols rather than letters -- confirm against the model tables.
#define SYMBOL_CAT_ORDER 250
// Number of sequence categories; sizes the prober's per-category counters.
#define NUMBER_OF_SEQ_CAT 4
// Highest and lowest sequence category indices.
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1)
#define NEGATIVE_CAT 0
18
// Immutable description of one language/charset pairing consumed by the
// single-byte prober. Member order is significant: the models are expected
// to be aggregate-initialized in exactly this sequence.
typedef struct
{
  // 256-entry table mapping a byte value to that character's order.
  unsigned char const* const charToOrderMap;
  // SAMPLE_SIZE x SAMPLE_SIZE table giving the frequency of a
  // two-character sequence.
  uint8_t const* const precedenceMatrix;
  // Expected ratio freqSeqs / totalSeqs for text in this model.
  float mTypicalPositiveRatio;
  // Whether this script contains English characters (not implemented).
  bool keepEnglishLetter;
  // Name reported for the charset this model describes.
  char const* const charsetName;
} SequenceModel;
27
28
29 class nsSingleByteCharSetProber : public nsCharSetProber{
30 public:
31 nsSingleByteCharSetProber(const SequenceModel *model)
32 :mModel(model), mReversed(false), mNameProber(0) { Reset(); }
33 nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber)
34 :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
35
36 virtual const char* GetCharSetName();
37 virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen);
38 virtual nsProbingState GetState(void) {return mState;}
39 virtual void Reset(void);
40 virtual float GetConfidence(void);
41
42 // This feature is not implemented yet. any current language model
43 // contain this parameter as false. No one is looking at this
44 // parameter or calling this method.
45 // Moreover, the nsSBCSGroupProber which calls the HandleData of this
46 // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
47 // of the English letters.
48 bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
49
50 #ifdef DEBUG_chardet
51 virtual void DumpStatus();
52 #endif
53
54 protected:
55 nsProbingState mState;
56 const SequenceModel* const mModel;
57 const bool mReversed; // true if we need to reverse every pair in the model lookup
58
59 //char order of last character
60 unsigned char mLastOrder;
61
62 uint32_t mTotalSeqs;
63 uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT];
64
65 uint32_t mTotalChar;
66 //characters that fall in our sampling range
67 uint32_t mFreqChar;
68
69 // Optional auxiliary prober for name decision. created and destroyed by the GroupProber
70 nsCharSetProber* mNameProber;
71
72 };
73
74
75 extern const SequenceModel Koi8rModel;
76 extern const SequenceModel Win1251Model;
77 extern const SequenceModel Latin5Model;
78 extern const SequenceModel MacCyrillicModel;
79 extern const SequenceModel Ibm866Model;
80 extern const SequenceModel Ibm855Model;
81 extern const SequenceModel Latin7Model;
82 extern const SequenceModel Win1253Model;
83 extern const SequenceModel Latin5BulgarianModel;
84 extern const SequenceModel Win1251BulgarianModel;
85 extern const SequenceModel Latin2HungarianModel;
86 extern const SequenceModel Win1250HungarianModel;
87 extern const SequenceModel Win1255Model;
88 extern const SequenceModel TIS620ThaiModel;
89
90 #endif /* nsSingleByteCharSetProber_h__ */
91

mercurial