extensions/universalchardet/src/base/nsSBCharSetProber.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5 #ifndef nsSingleByteCharSetProber_h__
michael@0 6 #define nsSingleByteCharSetProber_h__
michael@0 7
michael@0 8 #include "nsCharSetProber.h"
michael@0 9
michael@0 10 #define SAMPLE_SIZE 64 // side length of SequenceModel::precedenceMatrix, documented below as [SAMPLE_SIZE][SAMPLE_SIZE]
michael@0 11 #define SB_ENOUGH_REL_THRESHOLD 1024 // NOTE(review): not referenced in this header; presumably used by the implementation file - confirm there
michael@0 12 #define POSITIVE_SHORTCUT_THRESHOLD (float)0.95 // NOTE(review): not referenced in this header; meaning must be confirmed in the implementation file
michael@0 13 #define NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 // NOTE(review): not referenced in this header; meaning must be confirmed in the implementation file
michael@0 14 #define SYMBOL_CAT_ORDER 250 // NOTE(review): presumably a boundary on charToOrderMap order values - confirm in the implementation file
michael@0 15 #define NUMBER_OF_SEQ_CAT 4 // number of sequence categories; sizes nsSingleByteCharSetProber::mSeqCounters
michael@0 16 #define POSITIVE_CAT (NUMBER_OF_SEQ_CAT-1) // highest sequence category value
michael@0 17 #define NEGATIVE_CAT 0 // lowest sequence category value
michael@0 18
michael@0 19 typedef struct // Description of one single-byte language model; a pointer to one is passed to nsSingleByteCharSetProber's constructors
michael@0 20 {
michael@0 21 const unsigned char* const charToOrderMap; // [256] table used to find a char's order
michael@0 22 const uint8_t* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE] table used to find a 2-char sequence's frequency
michael@0 23 float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
michael@0 24 bool keepEnglishLetter; // says if this script contains English characters (not implemented)
michael@0 25 const char* const charsetName; // name of the charset this model describes; NOTE(review): presumably what GetCharSetName() reports - confirm in the implementation file
michael@0 26 } SequenceModel;
michael@0 27
michael@0 28
michael@0 29 class nsSingleByteCharSetProber : public nsCharSetProber{ // Prober bound to a single SequenceModel for its whole lifetime
michael@0 30 public:
michael@0 31 nsSingleByteCharSetProber(const SequenceModel *model) // Binds the model with no pair reversal and no name prober, then calls Reset()
michael@0 32 :mModel(model), mReversed(false), mNameProber(0) { Reset(); }
michael@0 33 nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber) // Same, but with explicit pair reversal and an auxiliary name prober
michael@0 34 :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
michael@0 35
michael@0 36 virtual const char* GetCharSetName();
michael@0 37 virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen);
michael@0 38 virtual nsProbingState GetState(void) {return mState;} // simply reports the stored probing state
michael@0 39 virtual void Reset(void);
michael@0 40 virtual float GetConfidence(void);
michael@0 41
michael@0 42 // This feature is not implemented yet. Every current language model
michael@0 43 // sets this parameter to false. No one is looking at this
michael@0 44 // parameter or calling this method.
michael@0 45 // Moreover, the nsSBCSGroupProber which calls the HandleData of this
michael@0 46 // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
michael@0 47 // of the English letters.
michael@0 48 bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
michael@0 49
michael@0 50 #ifdef DEBUG_chardet
michael@0 51 virtual void DumpStatus(); // only declared when DEBUG_chardet is defined
michael@0 52 #endif
michael@0 53
michael@0 54 protected:
michael@0 55 nsProbingState mState; // current probing state, returned by GetState()
michael@0 56 const SequenceModel* const mModel; // language model supplied at construction; the pointer is never reseated
michael@0 57 const bool mReversed; // true if we need to reverse every pair in the model lookup
michael@0 58
michael@0 59 // char order of the last character
michael@0 60 unsigned char mLastOrder;
michael@0 61
michael@0 62 uint32_t mTotalSeqs;
michael@0 63 uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT]; // one counter per sequence category
michael@0 64
michael@0 65 uint32_t mTotalChar;
michael@0 66 // characters that fall in our sampling range
michael@0 67 uint32_t mFreqChar;
michael@0 68
michael@0 69 // Optional auxiliary prober for name decision. Created and destroyed by the GroupProber.
michael@0 70 nsCharSetProber* mNameProber;
michael@0 71
michael@0 72 };
michael@0 73
michael@0 74
michael@0 75 extern const SequenceModel Koi8rModel; // SequenceModel instances: declared here only, their definitions are not part of this header
michael@0 76 extern const SequenceModel Win1251Model;
michael@0 77 extern const SequenceModel Latin5Model;
michael@0 78 extern const SequenceModel MacCyrillicModel;
michael@0 79 extern const SequenceModel Ibm866Model;
michael@0 80 extern const SequenceModel Ibm855Model;
michael@0 81 extern const SequenceModel Latin7Model;
michael@0 82 extern const SequenceModel Win1253Model;
michael@0 83 extern const SequenceModel Latin5BulgarianModel;
michael@0 84 extern const SequenceModel Win1251BulgarianModel;
michael@0 85 extern const SequenceModel Latin2HungarianModel;
michael@0 86 extern const SequenceModel Win1250HungarianModel;
michael@0 87 extern const SequenceModel Win1255Model;
michael@0 88 extern const SequenceModel TIS620ThaiModel;
michael@0 89
michael@0 90 #endif /* nsSingleByteCharSetProber_h__ */
michael@0 91

mercurial