extensions/universalchardet/src/base/CharDistribution.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #ifndef CharDistribution_h__
     7 #define CharDistribution_h__
     9 #include "nscore.h"
    11 #define ENOUGH_DATA_THRESHOLD 1024  // GotEnoughData() reports true once mTotalChars exceeds this value
    13 #define MINIMUM_DATA_THRESHOLD  4  // Reset() stores this in mDataThreshold unless the language is preferred (then 0)
    // Collects frequency statistics over the two-byte characters fed to it.
    // GetOrder() turns a character into an index ("order") into a frequency
    // table; the table, its size and the typical distribution ratio are
    // presumably installed by subclass constructors (not visible in this header).
    class CharDistributionAnalysis
    {
    public:
      // The table-related members start out empty so that HandleOneChar() never
      // reads an indeterminate size or pointer before a table has been assigned.
      CharDistributionAnalysis()
        : mCharToFreqOrder(0)
        , mTableSize(0)
        , mTypicalDistributionRatio(0.0f)
      {
        Reset(false);
      }

      // Feed a block of data and do distribution analysis.
      // The body is empty in this class.
      void HandleData(const char* aBuf, uint32_t aLen) {}

      // Feed one character whose byte length is already known.
      //   aStr     - points at the first byte of the character
      //   aCharLen - length of the character in bytes
      // Characters that are not exactly two bytes long, or for which GetOrder()
      // returns a negative value, are ignored.
      void HandleOneChar(const char* aStr, uint32_t aCharLen)
      {
        // Only two-byte characters take part in the distribution analysis.
        if (aCharLen != 2)
          return;

        const int32_t order = GetOrder(aStr);
        if (order < 0)
          return;

        mTotalChars++;

        // An order beyond the table still counts towards the total, but only a
        // table entry below 512 marks the character as a frequent one.
        if ((uint32_t)order < mTableSize && mCharToFreqOrder[order] < 512)
          mFreqChars++;
      }

      // Return confidence based on the data seen so far (defined outside this header).
      float GetConfidence(void);

      // Reset the analyser and clear the counters.
      //   aIsPreferredLanguage - when true the data threshold is 0, otherwise
      //                          MINIMUM_DATA_THRESHOLD.
      void      Reset(bool aIsPreferredLanguage)
      {
        mDone = false;
        mTotalChars = 0;
        mFreqChars = 0;
        mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
      }

      // It is not necessary to receive all data to draw a conclusion. For charset
      // detection a certain amount of data is enough: true once more than
      // ENOUGH_DATA_THRESHOLD characters have been counted.
      bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}

    protected:
      // Characters are not handled by their original encoded bytes; the bytes are
      // converted to a number, here called order. This allows multiple encodings
      // of a language to share one frequency table. This default returns -1,
      // which HandleOneChar() treats as "ignore this character".
      virtual int32_t GetOrder(const char* str) {return -1;}

      // If this flag is set to true, detection is done and a conclusion has been made.
      bool     mDone;

      // The number of characters whose frequency order is less than 512.
      uint32_t mFreqChars;

      // Total characters encountered.
      uint32_t mTotalChars;

      // Number of hi-byte characters needed to trigger detection.
      uint32_t mDataThreshold;

      // Mapping table to get frequency order from char order (as returned by GetOrder()).
      const int16_t  *mCharToFreqOrder;

      // Number of entries in the table above.
      uint32_t mTableSize;

      // A constant that varies from language to language and is used in
      // calculating confidence. The original author's paper is cited for details.
      float    mTypicalDistributionRatio;
    };
    89 class EUCTWDistributionAnalysis: public CharDistributionAnalysis
    90 {
    91 public:
    92   EUCTWDistributionAnalysis();
    93 protected:
    95   //for euc-TW encoding, we are interested 
    96   //  first  byte range: 0xc4 -- 0xfe
    97   //  second byte range: 0xa1 -- 0xfe
    98   //no validation needed here. State machine has done that
    99   int32_t GetOrder(const char* str) 
   100   { if ((unsigned char)*str >= (unsigned char)0xc4)  
   101       return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
   102     else
   103       return -1;
   104   }
   105 };
   108 class EUCKRDistributionAnalysis : public CharDistributionAnalysis
   109 {
   110 public:
   111   EUCKRDistributionAnalysis();
   112 protected:
   113   //for euc-KR encoding, we are interested 
   114   //  first  byte range: 0xb0 -- 0xfe
   115   //  second byte range: 0xa1 -- 0xfe
   116   //no validation needed here. State machine has done that
   117   int32_t GetOrder(const char* str) 
   118   { if ((unsigned char)*str >= (unsigned char)0xb0)  
   119       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
   120     else
   121       return -1;
   122   }
   123 };
   125 class GB2312DistributionAnalysis : public CharDistributionAnalysis
   126 {
   127 public:
   128   GB2312DistributionAnalysis();
   129 protected:
   130   //for GB2312 encoding, we are interested 
   131   //  first  byte range: 0xb0 -- 0xfe
   132   //  second byte range: 0xa1 -- 0xfe
   133   //no validation needed here. State machine has done that
   134   int32_t GetOrder(const char* str) 
   135   { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)  
   136       return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
   137     else
   138       return -1;
   139   }
   140 };
   143 class Big5DistributionAnalysis : public CharDistributionAnalysis
   144 {
   145 public:
   146   Big5DistributionAnalysis();
   147 protected:
   148   //for big5 encoding, we are interested 
   149   //  first  byte range: 0xa4 -- 0xfe
   150   //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
   151   //no validation needed here. State machine has done that
   152   int32_t GetOrder(const char* str) 
   153   { if ((unsigned char)*str >= (unsigned char)0xa4)  
   154       if ((unsigned char)str[1] >= (unsigned char)0xa1)
   155         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;
   156       else
   157         return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
   158     else
   159       return -1;
   160   }
   161 };
   163 class SJISDistributionAnalysis : public CharDistributionAnalysis
   164 {
   165 public:
   166   SJISDistributionAnalysis();
   167 protected:
   168   //for sjis encoding, we are interested 
   169   //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
   170   //  second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
   171   //no validation needed here. State machine has done that
   172   int32_t GetOrder(const char* str) 
   173   { 
   174     int32_t order;
   175     if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)  
   176       order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
   177     else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)  
   178       order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);
   179     else
   180       return -1;
   181     order += (unsigned char)*(str+1) - 0x40;
   182     if ((unsigned char)str[1] > (unsigned char)0x7f)
   183       order--;
   184     return order;
   185   }
   186 };
   188 class EUCJPDistributionAnalysis : public CharDistributionAnalysis
   189 {
   190 public:
   191   EUCJPDistributionAnalysis();
   192 protected:
   193   //for euc-JP encoding, we are interested 
   194   //  first  byte range: 0xa0 -- 0xfe
   195   //  second byte range: 0xa1 -- 0xfe
   196   //no validation needed here. State machine has done that
   197   int32_t GetOrder(const char* str) 
   198   { if ((unsigned char)*str >= (unsigned char)0xa0)  
   199       return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
   200     else
   201       return -1;
   202   }
   203 };
   205 #endif //CharDistribution_h__

mercurial