1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/nsSBCharSetProber.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,91 @@ 1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 +#ifndef nsSingleByteCharSetProber_h__ 1.9 +#define nsSingleByteCharSetProber_h__ 1.10 + 1.11 +#include "nsCharSetProber.h" 1.12 + 1.13 +#define SAMPLE_SIZE 64 1.14 +#define SB_ENOUGH_REL_THRESHOLD 1024 1.15 +#define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 1.16 +#define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 1.17 +#define SYMBOL_CAT_ORDER 250 1.18 +#define NUMBER_OF_SEQ_CAT 4 1.19 +#define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) 1.20 +#define NEGATIVE_CAT 0 1.21 + 1.22 +typedef struct 1.23 +{ 1.24 + const unsigned char* const charToOrderMap; // [256] table use to find a char's order 1.25 + const uint8_t* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency 1.26 + float mTypicalPositiveRatio; // = freqSeqs / totalSeqs 1.27 + bool keepEnglishLetter; // says if this script contains English characters (not implemented) 1.28 + const char* const charsetName; 1.29 +} SequenceModel; 1.30 + 1.31 + 1.32 +class nsSingleByteCharSetProber : public nsCharSetProber{ 1.33 +public: 1.34 + nsSingleByteCharSetProber(const SequenceModel *model) 1.35 + :mModel(model), mReversed(false), mNameProber(0) { Reset(); } 1.36 + nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber) 1.37 + :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } 1.38 + 1.39 + virtual const char* GetCharSetName(); 1.40 + virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen); 1.41 + virtual nsProbingState GetState(void) {return mState;} 1.42 + virtual void Reset(void); 1.43 + virtual float GetConfidence(void); 1.44 + 1.45 + // This feature is not implemented yet. any current language model 1.46 + // contain this parameter as false. No one is looking at this 1.47 + // parameter or calling this method. 1.48 + // Moreover, the nsSBCSGroupProber which calls the HandleData of this 1.49 + // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid 1.50 + // of the English letters. 1.51 + bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) 1.52 + 1.53 +#ifdef DEBUG_chardet 1.54 + virtual void DumpStatus(); 1.55 +#endif 1.56 + 1.57 +protected: 1.58 + nsProbingState mState; 1.59 + const SequenceModel* const mModel; 1.60 + const bool mReversed; // true if we need to reverse every pair in the model lookup 1.61 + 1.62 + //char order of last character 1.63 + unsigned char mLastOrder; 1.64 + 1.65 + uint32_t mTotalSeqs; 1.66 + uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT]; 1.67 + 1.68 + uint32_t mTotalChar; 1.69 + //characters that fall in our sampling range 1.70 + uint32_t mFreqChar; 1.71 + 1.72 + // Optional auxiliary prober for name decision. created and destroyed by the GroupProber 1.73 + nsCharSetProber* mNameProber; 1.74 + 1.75 +}; 1.76 + 1.77 + 1.78 +extern const SequenceModel Koi8rModel; 1.79 +extern const SequenceModel Win1251Model; 1.80 +extern const SequenceModel Latin5Model; 1.81 +extern const SequenceModel MacCyrillicModel; 1.82 +extern const SequenceModel Ibm866Model; 1.83 +extern const SequenceModel Ibm855Model; 1.84 +extern const SequenceModel Latin7Model; 1.85 +extern const SequenceModel Win1253Model; 1.86 +extern const SequenceModel Latin5BulgarianModel; 1.87 +extern const SequenceModel Win1251BulgarianModel; 1.88 +extern const SequenceModel Latin2HungarianModel; 1.89 +extern const SequenceModel Win1250HungarianModel; 1.90 +extern const SequenceModel Win1255Model; 1.91 +extern const SequenceModel TIS620ThaiModel; 1.92 + 1.93 +#endif /* nsSingleByteCharSetProber_h__ */ 1.94 +