extensions/universalchardet/src/base/CharDistribution.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/CharDistribution.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,206 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#ifndef CharDistribution_h__
    1.10 +#define CharDistribution_h__
    1.11 +
    1.12 +#include "nscore.h"
    1.13 +
    1.14 +#define ENOUGH_DATA_THRESHOLD 1024  //GotEnoughData() becomes true once more characters than this have been counted
    1.15 + 
    1.16 +#define MINIMUM_DATA_THRESHOLD  4  //value Reset() stores in mDataThreshold when the language is not preferred
    1.17 +
    1.18 +class CharDistributionAnalysis  //Base class: counts 2-byte characters and how many of them are "frequent"
    1.19 +{
    1.20 +public:
    1.21 +  CharDistributionAnalysis() {Reset(false);}  //clears the counters only; the table members below are not assigned here
    1.22 +
    1.23 +  //Feed a block of data for distribution analysis. The body is empty here, so this is a no-op.
    1.24 +  void HandleData(const char* aBuf, uint32_t aLen) {}
    1.25 +  
    1.26 +  //Feed one character of known byte length: aStr points at its first byte, aCharLen is its length.
    1.27 +  void HandleOneChar(const char* aStr, uint32_t aCharLen)
    1.28 +  {
    1.29 +    int32_t order;
    1.30 +
    1.31 +    //Only 2-byte characters take part in the distribution analysis; any other length gets order -1.
    1.32 +    order = (aCharLen == 2) ? GetOrder(aStr) : -1;
    1.33 +
    1.34 +    if (order >= 0)
    1.35 +    {
    1.36 +      mTotalChars++;
    1.37 +      //order is non-negative; only look it up if it also indexes inside the frequency table
    1.38 +      if ((uint32_t)order < mTableSize)
    1.39 +      {
    1.40 +        if (512 > mCharToFreqOrder[order])  //a frequency order below 512 counts as a frequent character
    1.41 +          mFreqChars++;
    1.42 +      }
    1.43 +    }
    1.44 +  }
    1.45 +
    1.46 +  //Return the confidence based on the data seen so far (declared here, defined elsewhere).
    1.47 +  float GetConfidence(void);
    1.48 +
    1.49 +  //Reset the analyser: clear the done flag and both counters, and pick the data threshold.
    1.50 +  void      Reset(bool aIsPreferredLanguage) 
    1.51 +  {
    1.52 +    mDone = false;
    1.53 +    mTotalChars = 0;
    1.54 +    mFreqChars = 0;
    1.55 +    mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
    1.56 +  }
    1.57 +
    1.58 +  //It is not necessary to receive all of the data to draw a conclusion. For charset detection,
    1.59 +  //a certain amount of data is enough: more than ENOUGH_DATA_THRESHOLD counted characters.
    1.60 +  bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
    1.61 +
    1.62 +protected:
    1.63 +  //We do not handle a character based on its original encoded bytes, but
    1.64 +  //convert those bytes to a number, here called its order.
    1.65 +  //This allows multiple encodings of a language to share one frequency table. The base version returns -1.
    1.66 +  virtual int32_t GetOrder(const char* str) {return -1;}
    1.67 +  
    1.68 +  //If this flag is set to true, detection is done and a conclusion has been made
    1.69 +  bool     mDone;
    1.70 +
    1.71 +  //The number of characters whose frequency order is less than 512
    1.72 +  uint32_t mFreqChars;
    1.73 +
    1.74 +  //Total characters encountered (2-byte characters for which GetOrder() returned a non-negative order).
    1.75 +  uint32_t mTotalChars;
    1.76 +
    1.77 +  //Number of hi-byte characters needed to trigger detection (Reset() sets 0 or MINIMUM_DATA_THRESHOLD)
    1.78 +  uint32_t mDataThreshold;
    1.79 +
    1.80 +  //Mapping table from char order (as returned by GetOrder()) to frequency order
    1.81 +  const int16_t  *mCharToFreqOrder;
    1.82 +
    1.83 +  //Size of the above table; HandleOneChar() uses it as the upper bound for table indices
    1.84 +  uint32_t mTableSize;
    1.85 +
    1.86 +  //This is a constant value that varies from language to language; it is used in
    1.87 +  //calculating confidence. See the original author's paper for further detail.
    1.88 +  float    mTypicalDistributionRatio;
    1.89 +};
    1.90 +
    1.91 +
    1.92 +class EUCTWDistributionAnalysis: public CharDistributionAnalysis
    1.93 +{
    1.94 +public:
    1.95 +  EUCTWDistributionAnalysis();
    1.96 +protected:
    1.97 +  //Map a 2-byte EUC-TW character to its order; returns -1 when the first byte is below 0xc4.
    1.98 +  //For the EUC-TW encoding, we are interested in
    1.99 +  //  first  byte range: 0xc4 -- 0xfe
   1.100 +  //  second byte range: 0xa1 -- 0xfe
   1.101 +  //No validation is needed here (the second byte is not checked); the state machine has done that.
   1.102 +  int32_t GetOrder(const char* str) 
   1.103 +  { if ((unsigned char)*str >= (unsigned char)0xc4)  
   1.104 +      return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;  //94 second-byte values (0xa1..0xfe) per first byte
   1.105 +    else
   1.106 +      return -1;
   1.107 +  }
   1.108 +};
   1.109 +
   1.110 +
   1.111 +class EUCKRDistributionAnalysis : public CharDistributionAnalysis
   1.112 +{
   1.113 +public:
   1.114 +  EUCKRDistributionAnalysis();
   1.115 +protected:
   1.116 +  //Map a 2-byte EUC-KR character to its order (-1 if the first byte is below 0xb0). We are interested in
   1.117 +  //  first  byte range: 0xb0 -- 0xfe
   1.118 +  //  second byte range: 0xa1 -- 0xfe
   1.119 +  //No validation is needed here (the second byte is not checked); the state machine has done that.
   1.120 +  int32_t GetOrder(const char* str) 
   1.121 +  { if ((unsigned char)*str >= (unsigned char)0xb0)  
   1.122 +      return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;  //94 second-byte values (0xa1..0xfe) per first byte
   1.123 +    else
   1.124 +      return -1;
   1.125 +  }
   1.126 +};
   1.127 +
   1.128 +class GB2312DistributionAnalysis : public CharDistributionAnalysis
   1.129 +{
   1.130 +public:
   1.131 +  GB2312DistributionAnalysis();
   1.132 +protected:
   1.133 +  //Map a 2-byte GB2312 character to its order, or -1 if either byte is below its range. We are interested in
   1.134 +  //  first  byte range: 0xb0 -- 0xfe
   1.135 +  //  second byte range: 0xa1 -- 0xfe
   1.136 +  //The state machine is said to validate the input; even so, the lower bound of both bytes is checked below.
   1.137 +  int32_t GetOrder(const char* str) 
   1.138 +  { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1)  
   1.139 +      return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;  //94 second-byte values (0xa1..0xfe) per first byte
   1.140 +    else
   1.141 +      return -1;
   1.142 +  }
   1.143 +};
   1.144 +
   1.145 +
   1.146 +class Big5DistributionAnalysis : public CharDistributionAnalysis
   1.147 +{
   1.148 +public:
   1.149 +  Big5DistributionAnalysis();
   1.150 +protected:
   1.151 +  //Map a 2-byte Big5 character to its order (-1 if the first byte is below 0xa4). We are interested in
   1.152 +  //  first  byte range: 0xa4 -- 0xfe
   1.153 +  //  second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe  (63 + 94 = 157 values per first byte)
   1.154 +  //No validation is needed here; the state machine has done that.
   1.155 +  int32_t GetOrder(const char* str) 
   1.156 +  { if ((unsigned char)*str >= (unsigned char)0xa4)  
   1.157 +      if ((unsigned char)str[1] >= (unsigned char)0xa1)
   1.158 +        return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63;  //upper second-byte range follows the 63 lower values
   1.159 +      else  //binds to the inner if: second byte below 0xa1
   1.160 +        return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
   1.161 +    else  //binds to the outer if: first byte below 0xa4
   1.162 +      return -1;
   1.163 +  }
   1.164 +};
   1.165 +
   1.166 +class SJISDistributionAnalysis : public CharDistributionAnalysis
   1.167 +{
   1.168 +public:
   1.169 +  SJISDistributionAnalysis();
   1.170 +protected:
   1.171 +  //Map a 2-byte Shift_JIS character to its order (-1 if the first byte is outside the ranges tested below).
   1.172 +  //  first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe  (NOTE: the code below only accepts 0xe0 -- 0xef)
   1.173 +  //  second byte range: 0x40 -- 0x7e,  0x81 -- 0xfe
   1.174 +  //No validation is needed here; the state machine has done that.
   1.175 +  int32_t GetOrder(const char* str) 
   1.176 +  { 
   1.177 +    int32_t order;
   1.178 +    if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f)  
   1.179 +      order = 188 * ((unsigned char)str[0]-(unsigned char)0x81);
   1.180 +    else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef)  
   1.181 +      order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31);  //31 = number of first bytes in 0x81..0x9f
   1.182 +    else
   1.183 +      return -1;
   1.184 +    order += (unsigned char)*(str+1) - 0x40;
   1.185 +    if ((unsigned char)str[1] > (unsigned char)0x7f)
   1.186 +      order--;  //close the gap left by 0x7f, which is not in the second-byte range
   1.187 +    return order;
   1.188 +  }
   1.189 +};
   1.190 +
   1.191 +class EUCJPDistributionAnalysis : public CharDistributionAnalysis
   1.192 +{
   1.193 +public:
   1.194 +  EUCJPDistributionAnalysis();
   1.195 +protected:
   1.196 +  //Map a 2-byte EUC-JP character to its order (-1 if the first byte is below 0xa0). We are interested in
   1.197 +  //  first  byte range: 0xa0 -- 0xfe
   1.198 +  //  second byte range: 0xa1 -- 0xfe
   1.199 +  //No validation is needed here (the second byte is not checked); the state machine has done that.
   1.200 +  int32_t GetOrder(const char* str) 
   1.201 +  { if ((unsigned char)*str >= (unsigned char)0xa0)  
   1.202 +      return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;  //offset is from 0xa1, so a 0xa0 first byte gives a negative order for second bytes up to 0xfe
   1.203 +    else
   1.204 +      return -1;
   1.205 +  }
   1.206 +};
   1.207 +
   1.208 +#endif //CharDistribution_h__
   1.209 +

mercurial