Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | #ifndef nsSingleByteCharSetProber_h__ |
michael@0 | 6 | #define nsSingleByteCharSetProber_h__ |
michael@0 | 7 | |
michael@0 | 8 | #include "nsCharSetProber.h" |
michael@0 | 9 | |
michael@0 | 10 | #define SAMPLE_SIZE 64 |
michael@0 | 11 | #define SB_ENOUGH_REL_THRESHOLD 1024 |
michael@0 | 12 | #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 |
michael@0 | 13 | #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 |
michael@0 | 14 | #define SYMBOL_CAT_ORDER 250 |
michael@0 | 15 | #define NUMBER_OF_SEQ_CAT 4 |
michael@0 | 16 | #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) |
michael@0 | 17 | #define NEGATIVE_CAT 0 |
michael@0 | 18 | |
michael@0 | 19 | typedef struct |
michael@0 | 20 | { |
michael@0 | 21 | const unsigned char* const charToOrderMap; // [256] table used to find a char's order |
michael@0 | 22 | const uint8_t* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table used to find a 2-char sequence's frequency |
michael@0 | 23 | float mTypicalPositiveRatio; // = freqSeqs / totalSeqs |
michael@0 | 24 | bool keepEnglishLetter; // true if this script contains English characters (not implemented) |
michael@0 | 25 | const char* const charsetName; |
michael@0 | 26 | } SequenceModel; |
michael@0 | 27 | |
michael@0 | 28 | |
michael@0 | 29 | class nsSingleByteCharSetProber : public nsCharSetProber{ |
michael@0 | 30 | public: |
michael@0 | 31 | nsSingleByteCharSetProber(const SequenceModel *model) |
michael@0 | 32 | :mModel(model), mReversed(false), mNameProber(0) { Reset(); } |
michael@0 | 33 | nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber) |
michael@0 | 34 | :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } |
michael@0 | 35 | |
michael@0 | 36 | virtual const char* GetCharSetName(); |
michael@0 | 37 | virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); |
michael@0 | 38 | virtual nsProbingState GetState(void) {return mState;} |
michael@0 | 39 | virtual void Reset(void); |
michael@0 | 40 | virtual float GetConfidence(void); |
michael@0 | 41 | |
michael@0 | 42 | // This feature is not implemented yet. Every current language model |
michael@0 | 43 | // sets this parameter to false, and nothing reads the parameter |
michael@0 | 44 | // or calls this method. |
michael@0 | 45 | // Moreover, the nsSBCSGroupProber, which calls the HandleData of this |
michael@0 | 46 | // prober, has a hard-coded call to FilterWithoutEnglishLetters which gets rid |
michael@0 | 47 | // of the English letters. |
michael@0 | 48 | bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) |
michael@0 | 49 | |
michael@0 | 50 | #ifdef DEBUG_chardet |
michael@0 | 51 | virtual void DumpStatus(); |
michael@0 | 52 | #endif |
michael@0 | 53 | |
michael@0 | 54 | protected: |
michael@0 | 55 | nsProbingState mState; |
michael@0 | 56 | const SequenceModel* const mModel; |
michael@0 | 57 | const bool mReversed; // true if we need to reverse every pair in the model lookup |
michael@0 | 58 | |
michael@0 | 59 | // char order of the last character |
michael@0 | 60 | unsigned char mLastOrder; |
michael@0 | 61 | |
michael@0 | 62 | uint32_t mTotalSeqs; |
michael@0 | 63 | uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT]; |
michael@0 | 64 | |
michael@0 | 65 | uint32_t mTotalChar; |
michael@0 | 66 | // characters that fall in our sampling range |
michael@0 | 67 | uint32_t mFreqChar; |
michael@0 | 68 | |
michael@0 | 69 | // Optional auxiliary prober for name decision; 0 when built with the one-argument constructor. Created and destroyed by the GroupProber. |
michael@0 | 70 | nsCharSetProber* mNameProber; |
michael@0 | 71 | |
michael@0 | 72 | }; |
michael@0 | 73 | |
michael@0 | 74 | |
michael@0 | 75 | extern const SequenceModel Koi8rModel; |
michael@0 | 76 | extern const SequenceModel Win1251Model; |
michael@0 | 77 | extern const SequenceModel Latin5Model; |
michael@0 | 78 | extern const SequenceModel MacCyrillicModel; |
michael@0 | 79 | extern const SequenceModel Ibm866Model; |
michael@0 | 80 | extern const SequenceModel Ibm855Model; |
michael@0 | 81 | extern const SequenceModel Latin7Model; |
michael@0 | 82 | extern const SequenceModel Win1253Model; |
michael@0 | 83 | extern const SequenceModel Latin5BulgarianModel; |
michael@0 | 84 | extern const SequenceModel Win1251BulgarianModel; |
michael@0 | 85 | extern const SequenceModel Latin2HungarianModel; |
michael@0 | 86 | extern const SequenceModel Win1250HungarianModel; |
michael@0 | 87 | extern const SequenceModel Win1255Model; |
michael@0 | 88 | extern const SequenceModel TIS620ThaiModel; |
michael@0 | 89 | |
michael@0 | 90 | #endif /* nsSingleByteCharSetProber_h__ */ |
michael@0 | 91 |