extensions/universalchardet/src/base/nsSBCharSetProber.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 #ifndef nsSingleByteCharSetProber_h__
     6 #define nsSingleByteCharSetProber_h__
     8 #include "nsCharSetProber.h"
    10 #define SAMPLE_SIZE 64  // side length of the [SAMPLE_SIZE][SAMPLE_SIZE] precedence matrix in SequenceModel
       // NOTE(review): the next four constants are not referenced anywhere in this
       // header; presumably they are consumed by the prober implementation when it
       // decides how much data is enough and when to short-cut -- confirm there.
    11 #define SB_ENOUGH_REL_THRESHOLD  1024
    12 #define POSITIVE_SHORTCUT_THRESHOLD  (float)0.95
    13 #define NEGATIVE_SHORTCUT_THRESHOLD  (float)0.05
    14 #define SYMBOL_CAT_ORDER  250
       // Number of sequence categories; sizes the mSeqCounters array of the prober class.
    15 #define NUMBER_OF_SEQ_CAT 4
       // Highest (3) and lowest (0) category values.
       // NOTE(review): presumably used as indices into mSeqCounters -- confirm in the implementation.
    16 #define POSITIVE_CAT   (NUMBER_OF_SEQ_CAT-1)
    17 #define NEGATIVE_CAT   0
    19 typedef struct  // Language-model data for one single-byte charset; read by nsSingleByteCharSetProber through its mModel pointer.
    20 {
       // Both table pointers and the name pointer are const pointers to const data,
       // so they can only be set when a model object is initialized.
    21   const unsigned char* const charToOrderMap;    // [256] table used to find a char's order
    22   const uint8_t* const precedenceMatrix;  // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
    23   float  mTypicalPositiveRatio;     // = freqSeqs / totalSeqs
    24   bool keepEnglishLetter;         // says if this script contains English characters (not implemented)
    25   const char* const charsetName;  // name of the charset this model describes; NOTE(review): presumably what GetCharSetName() reports -- confirm in the implementation
    26 } SequenceModel;
    29 class nsSingleByteCharSetProber : public nsCharSetProber{
    30 public:
         // Creates a prober for |model| with pair reversal off and no name prober.
         // Marked explicit so that a bare SequenceModel pointer is never converted
         // into a prober implicitly.
         // Reset() runs inside the constructor, where the call binds to this
         // class's own Reset() rather than to any override in a derived class.
    31   explicit nsSingleByteCharSetProber(const SequenceModel *model)
    32     :mModel(model), mReversed(false), mNameProber(0) { Reset(); }
         // Same as above, but with the pair-reversal flag and the optional
         // auxiliary name prober supplied by the caller. |nameProber| is stored,
         // not owned (see the mNameProber member).
    33   nsSingleByteCharSetProber(const SequenceModel *model, bool reversed, nsCharSetProber* nameProber)
    34     :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
         // Prober interface; all but GetState() are defined out of line.
    36   virtual const char* GetCharSetName();
    37   virtual nsProbingState HandleData(const char* aBuf, uint32_t aLen);
         // Returns the current value of mState.
    38   virtual nsProbingState GetState(void) {return mState;}
    39   virtual void      Reset(void);
    40   virtual float     GetConfidence(void);
    42   // This feature is not implemented yet. Every current language model
    43   // sets this parameter to false, and no one reads the parameter or
    44   // calls this method.
    45   // Moreover, nsSBCSGroupProber, which calls HandleData on this prober,
    46   // has a hard-coded call to FilterWithoutEnglishLetters that gets rid
    47   // of the English letters.
    48   bool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
    50 #ifdef DEBUG_chardet
    51   virtual void  DumpStatus();
    52 #endif
    54 protected:
         // state reported by GetState()
    55   nsProbingState mState;
         // language model in use; neither the pointer nor the model can be changed after construction
    56   const SequenceModel* const mModel;
    57   const bool mReversed; // true if we need to reverse every pair in the model lookup
    59   //char order of last character
    60   unsigned char mLastOrder;
    62   uint32_t mTotalSeqs;
         // one counter per sequence category
    63   uint32_t mSeqCounters[NUMBER_OF_SEQ_CAT];
    65   uint32_t mTotalChar;
    66   //characters that fall in our sampling range
    67   uint32_t mFreqChar;
    69   // Optional auxiliary prober for name decision. created and destroyed by the GroupProber
    70   nsCharSetProber* mNameProber;
    72 };
    75 extern const SequenceModel Koi8rModel;  // first of the language models declared here; none of them is defined in this header
    76 extern const SequenceModel Win1251Model;
    77 extern const SequenceModel Latin5Model;
    78 extern const SequenceModel MacCyrillicModel;
    79 extern const SequenceModel Ibm866Model;
    80 extern const SequenceModel Ibm855Model;
    81 extern const SequenceModel Latin7Model;
    82 extern const SequenceModel Win1253Model;
    83 extern const SequenceModel Latin5BulgarianModel;
    84 extern const SequenceModel Win1251BulgarianModel;
    85 extern const SequenceModel Latin2HungarianModel;
    86 extern const SequenceModel Win1250HungarianModel;
    87 extern const SequenceModel Win1255Model;
    88 extern const SequenceModel TIS620ThaiModel;
    90 #endif /* nsSingleByteCharSetProber_h__ */

mercurial