michael@0: /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: #ifndef nsSingleByteCharSetProber_h__ michael@0: #define nsSingleByteCharSetProber_h__ michael@0: michael@0: #include "nsCharSetProber.h" michael@0: michael@0: #define SAMPLE_SIZE 64 michael@0: #define SB_ENOUGH_REL_THRESHOLD 1024 michael@0: #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 michael@0: #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 michael@0: #define SYMBOL_CAT_ORDER 250 michael@0: #define NUMBER_OF_SEQ_CAT 4 michael@0: #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) michael@0: #define NEGATIVE_CAT 0 michael@0: michael@0: typedef struct michael@0: { michael@0: const unsigned char* const charToOrderMap; // [256] table use to find a char's order michael@0: const uint8_t* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency michael@0: float mTypicalPositiveRatio; // = freqSeqs / totalSeqs michael@0: bool keepEnglishLetter; // says if this script contains English characters (not implemented) michael@0: const char* const charsetName; michael@0: } SequenceModel; michael@0: michael@0: michael@0: class nsSingleByteCharSetProber : public nsCharSetProber{ michael@0: public: michael@0: nsSingleByteCharSetProber(const SequenceModel *model) michael@0: :mModel(model), mReversed(false), mNameProber(0) { Reset(); } michael@0: nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber) michael@0: :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } michael@0: michael@0: virtual const char* GetCharSetName(); michael@0: virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); michael@0: virtual nsProbingState GetState(void) {return mState;} michael@0: virtual void Reset(void); michael@0: virtual float GetConfidence(void); michael@0: michael@0: // This feature is not implemented yet. any current language model michael@0: // contain this parameter as false. No one is looking at this michael@0: // parameter or calling this method. michael@0: // Moreover, the nsSBCSGroupProber which calls the HandleData of this michael@0: // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid michael@0: // of the English letters. michael@0: bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) michael@0: michael@0: #ifdef DEBUG_chardet michael@0: virtual void DumpStatus(); michael@0: #endif michael@0: michael@0: protected: michael@0: nsProbingState mState; michael@0: const SequenceModel* const mModel; michael@0: const bool mReversed; // true if we need to reverse every pair in the model lookup michael@0: michael@0: //char order of last character michael@0: unsigned char mLastOrder; michael@0: michael@0: uint32_t mTotalSeqs; michael@0: uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT]; michael@0: michael@0: uint32_t mTotalChar; michael@0: //characters that fall in our sampling range michael@0: uint32_t mFreqChar; michael@0: michael@0: // Optional auxiliary prober for name decision. created and destroyed by the GroupProber michael@0: nsCharSetProber* mNameProber; michael@0: michael@0: }; michael@0: michael@0: michael@0: extern const SequenceModel Koi8rModel; michael@0: extern const SequenceModel Win1251Model; michael@0: extern const SequenceModel Latin5Model; michael@0: extern const SequenceModel MacCyrillicModel; michael@0: extern const SequenceModel Ibm866Model; michael@0: extern const SequenceModel Ibm855Model; michael@0: extern const SequenceModel Latin7Model; michael@0: extern const SequenceModel Win1253Model; michael@0: extern const SequenceModel Latin5BulgarianModel; michael@0: extern const SequenceModel Win1251BulgarianModel; michael@0: extern const SequenceModel Latin2HungarianModel; michael@0: extern const SequenceModel Win1250HungarianModel; michael@0: extern const SequenceModel Win1255Model; michael@0: extern const SequenceModel TIS620ThaiModel; michael@0: michael@0: #endif /* nsSingleByteCharSetProber_h__ */ michael@0: