|
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
5 #ifndef nsSingleByteCharSetProber_h__ |
|
6 #define nsSingleByteCharSetProber_h__ |
|
7 |
|
8 #include "nsCharSetProber.h" |
|
9 |
|
// Tuning constants shared by the single-byte charset prober and its models.

// Side length of SequenceModel::precedenceMatrix ([SAMPLE_SIZE][SAMPLE_SIZE]).
#define SAMPLE_SIZE 64

// NOTE(review): presumably the number of observed sequences required before
// the shortcut thresholds below are consulted -- confirm in HandleData().
#define SB_ENOUGH_REL_THRESHOLD 1024

// NOTE(review): presumably the confidence bounds above/below which the prober
// reaches an early positive/negative verdict -- confirm in HandleData().
#define POSITIVE_SHORTCUT_THRESHOLD 0.95f
#define NEGATIVE_SHORTCUT_THRESHOLD 0.05f

// NOTE(review): presumably character orders at or above this value are
// treated as symbols rather than letters -- confirm in HandleData().
#define SYMBOL_CAT_ORDER 250

// Number of sequence categories; sizes nsSingleByteCharSetProber::mSeqCounters.
#define NUMBER_OF_SEQ_CAT 4
// Highest category index.
#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT - 1)
// Lowest category index.
#define NEGATIVE_CAT 0
|
18 |
|
// Model data consumed by nsSingleByteCharSetProber: two lookup tables plus
// the metadata describing one charset model. The pointer members are const,
// so instances are set up with aggregate initialization.
typedef struct
{
  const unsigned char* const charToOrderMap;  // [256] table used to find a char's order
  const uint8_t* const precedenceMatrix;      // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
  float mTypicalPositiveRatio;                // = freqSeqs / totalSeqs
  bool keepEnglishLetter;                     // says if this script contains English characters (not implemented)
  const char* const charsetName;              // NOTE(review): presumably the name reported by GetCharSetName() -- confirm
} SequenceModel;
|
27 |
|
28 |
|
29 class nsSingleByteCharSetProber : public nsCharSetProber{ |
|
30 public: |
|
31 nsSingleByteCharSetProber(const SequenceModel *model) |
|
32 :mModel(model), mReversed(false), mNameProber(0) { Reset(); } |
|
33 nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber) |
|
34 :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } |
|
35 |
|
36 virtual const char* GetCharSetName(); |
|
37 virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); |
|
38 virtual nsProbingState GetState(void) {return mState;} |
|
39 virtual void Reset(void); |
|
40 virtual float GetConfidence(void); |
|
41 |
|
42 // This feature is not implemented yet. any current language model |
|
43 // contain this parameter as false. No one is looking at this |
|
44 // parameter or calling this method. |
|
45 // Moreover, the nsSBCSGroupProber which calls the HandleData of this |
|
46 // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid |
|
47 // of the English letters. |
|
48 bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) |
|
49 |
|
50 #ifdef DEBUG_chardet |
|
51 virtual void DumpStatus(); |
|
52 #endif |
|
53 |
|
54 protected: |
|
55 nsProbingState mState; |
|
56 const SequenceModel* const mModel; |
|
57 const bool mReversed; // true if we need to reverse every pair in the model lookup |
|
58 |
|
59 //char order of last character |
|
60 unsigned char mLastOrder; |
|
61 |
|
62 uint32_t mTotalSeqs; |
|
63 uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT]; |
|
64 |
|
65 uint32_t mTotalChar; |
|
66 //characters that fall in our sampling range |
|
67 uint32_t mFreqChar; |
|
68 |
|
69 // Optional auxiliary prober for name decision. created and destroyed by the GroupProber |
|
70 nsCharSetProber* mNameProber; |
|
71 |
|
72 }; |
|
73 |
|
74 |
|
75 extern const SequenceModel Koi8rModel; |
|
76 extern const SequenceModel Win1251Model; |
|
77 extern const SequenceModel Latin5Model; |
|
78 extern const SequenceModel MacCyrillicModel; |
|
79 extern const SequenceModel Ibm866Model; |
|
80 extern const SequenceModel Ibm855Model; |
|
81 extern const SequenceModel Latin7Model; |
|
82 extern const SequenceModel Win1253Model; |
|
83 extern const SequenceModel Latin5BulgarianModel; |
|
84 extern const SequenceModel Win1251BulgarianModel; |
|
85 extern const SequenceModel Latin2HungarianModel; |
|
86 extern const SequenceModel Win1250HungarianModel; |
|
87 extern const SequenceModel Win1255Model; |
|
88 extern const SequenceModel TIS620ThaiModel; |
|
89 |
|
90 #endif /* nsSingleByteCharSetProber_h__ */ |
|
91 |