extensions/universalchardet/src/base/nsSBCharSetProber.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsSBCharSetProber.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,91 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +#ifndef nsSingleByteCharSetProber_h__
     1.9 +#define nsSingleByteCharSetProber_h__
    1.10 +
    1.11 +#include "nsCharSetProber.h"
    1.12 +
    1.13 +#define SAMPLE_SIZE 64  // dimension of the square precedenceMatrix in SequenceModel
    1.14 +#define SB_ENOUGH_REL_THRESHOLD  1024  // NOTE(review): presumably the sequence count treated as enough for a decision; used outside this header - confirm
    1.15 +#define POSITIVE_SHORTCUT_THRESHOLD  (float)0.95  // NOTE(review): presumably an upper confidence cut-off; used outside this header - confirm
    1.16 +#define NEGATIVE_SHORTCUT_THRESHOLD  (float)0.05  // NOTE(review): presumably a lower confidence cut-off; used outside this header - confirm
    1.17 +#define SYMBOL_CAT_ORDER  250  // NOTE(review): order-value boundary for the symbol category; exact use is not visible here - confirm
    1.18 +#define NUMBER_OF_SEQ_CAT 4  // number of sequence categories; sizes mSeqCounters
    1.19 +#define POSITIVE_CAT   (NUMBER_OF_SEQ_CAT-1)  // highest category index
    1.20 +#define NEGATIVE_CAT   0  // lowest category index
    1.21 +
    1.22 +typedef struct  // model data for one charset; nsSingleByteCharSetProber keeps a pointer to one (mModel)
    1.23 +{
    1.24 +  const unsigned char* const charToOrderMap;    // [256] table used to find a char's order
    1.25 +  const uint8_t* const precedenceMatrix;  // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
    1.26 +  float  mTypicalPositiveRatio;     // = freqSeqs / totalSeqs 
    1.27 +  bool keepEnglishLetter;         // says if this script contains English characters (not implemented)
    1.28 +  const char* const charsetName;  // charset name associated with this model
    1.29 +} SequenceModel;
    1.30 +
    1.31 +// Charset prober (derives from nsCharSetProber) configured with a single SequenceModel.
    1.32 +class nsSingleByteCharSetProber : public nsCharSetProber{
    1.33 +public:
    1.34 +  nsSingleByteCharSetProber(const SequenceModel *model)  // non-reversed lookup, no name prober
    1.35 +    :mModel(model), mReversed(false), mNameProber(0) { Reset(); }  // a virtual call made from a constructor binds to this class's Reset()
    1.36 +  nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber)  // stores all three arguments as given
    1.37 +    :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
    1.38 +
    1.39 +  virtual const char* GetCharSetName();
    1.40 +  virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen);
    1.41 +  virtual nsProbingState GetState(void) {return mState;}
    1.42 +  virtual void      Reset(void);
    1.43 +  virtual float     GetConfidence(void);
    1.44 +  
    1.45 +  // This feature is not implemented yet. Every current language model
    1.46 +  // sets this parameter to false. No one is looking at this
    1.47 +  // parameter or calling this method.
    1.48 +  // Moreover, the nsSBCSGroupProber, which calls the HandleData of this
    1.49 +  // prober, has a hard-coded call to FilterWithoutEnglishLetters, which gets rid
    1.50 +  // of the English letters.
    1.51 +  bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
    1.52 +
    1.53 +#ifdef DEBUG_chardet
    1.54 +  virtual void  DumpStatus();
    1.55 +#endif
    1.56 +
    1.57 +protected:
    1.58 +  nsProbingState mState;  // value returned by GetState()
    1.59 +  const SequenceModel* const mModel;  // model supplied at construction; the pointer is never reseated
    1.60 +  const bool mReversed; // true if we need to reverse every pair in the model lookup
    1.61 +
    1.62 +  // char order of the last character
    1.63 +  unsigned char mLastOrder;
    1.64 +
    1.65 +  uint32_t mTotalSeqs;
    1.66 +  uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT];  // one counter per sequence category
    1.67 +
    1.68 +  uint32_t mTotalChar;
    1.69 +  // characters that fall in our sampling range
    1.70 +  uint32_t mFreqChar;
    1.71 +  
    1.72 +  // Optional auxiliary prober for name decision. Created and destroyed by the GroupProber.
    1.73 +  nsCharSetProber* mNameProber; 
    1.74 +
    1.75 +};
    1.76 +
    1.77 +// Sequence models declared for use with the prober; their definitions are not in this header.
    1.78 +extern const SequenceModel Koi8rModel;
    1.79 +extern const SequenceModel Win1251Model;
    1.80 +extern const SequenceModel Latin5Model;
    1.81 +extern const SequenceModel MacCyrillicModel;
    1.82 +extern const SequenceModel Ibm866Model;
    1.83 +extern const SequenceModel Ibm855Model;
    1.84 +extern const SequenceModel Latin7Model;
    1.85 +extern const SequenceModel Win1253Model;
    1.86 +extern const SequenceModel Latin5BulgarianModel;
    1.87 +extern const SequenceModel Win1251BulgarianModel;
    1.88 +extern const SequenceModel Latin2HungarianModel;
    1.89 +extern const SequenceModel Win1250HungarianModel;
    1.90 +extern const SequenceModel Win1255Model;
    1.91 +extern const SequenceModel TIS620ThaiModel;
    1.92 +
    1.93 +#endif /* nsSingleByteCharSetProber_h__ */
    1.94 +

mercurial