michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #ifndef CharDistribution_h__ michael@0: #define CharDistribution_h__ michael@0: michael@0: #include "nscore.h" michael@0: michael@0: #define ENOUGH_DATA_THRESHOLD 1024 michael@0: michael@0: #define MINIMUM_DATA_THRESHOLD 4 michael@0: michael@0: class CharDistributionAnalysis michael@0: { michael@0: public: michael@0: CharDistributionAnalysis() {Reset(false);} michael@0: michael@0: //feed a block of data and do distribution analysis michael@0: void HandleData(const char* aBuf, uint32_t aLen) {} michael@0: michael@0: //Feed a character with known length michael@0: void HandleOneChar(const char* aStr, uint32_t aCharLen) michael@0: { michael@0: int32_t order; michael@0: michael@0: //we only care about 2-bytes character in our distribution analysis michael@0: order = (aCharLen == 2) ? GetOrder(aStr) : -1; michael@0: michael@0: if (order >= 0) michael@0: { michael@0: mTotalChars++; michael@0: //order is valid michael@0: if ((uint32_t)order < mTableSize) michael@0: { michael@0: if (512 > mCharToFreqOrder[order]) michael@0: mFreqChars++; michael@0: } michael@0: } michael@0: } michael@0: michael@0: //return confidence base on existing data michael@0: float GetConfidence(void); michael@0: michael@0: //Reset analyser, clear any state michael@0: void Reset(bool aIsPreferredLanguage) michael@0: { michael@0: mDone = false; michael@0: mTotalChars = 0; michael@0: mFreqChars = 0; michael@0: mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD; michael@0: } michael@0: michael@0: //It is not necessary to receive all data to draw conclusion. For charset detection, michael@0: // certain amount of data is enough michael@0: bool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;} michael@0: michael@0: protected: michael@0: //we do not handle character base on its original encoding string, but michael@0: //convert this encoding string to a number, here called order. michael@0: //This allow multiple encoding of a language to share one frequency table michael@0: virtual int32_t GetOrder(const char* str) {return -1;} michael@0: michael@0: //If this flag is set to true, detection is done and conclusion has been made michael@0: bool mDone; michael@0: michael@0: //The number of characters whose frequency order is less than 512 michael@0: uint32_t mFreqChars; michael@0: michael@0: //Total character encounted. michael@0: uint32_t mTotalChars; michael@0: michael@0: //Number of hi-byte characters needed to trigger detection michael@0: uint32_t mDataThreshold; michael@0: michael@0: //Mapping table to get frequency order from char order (get from GetOrder()) michael@0: const int16_t *mCharToFreqOrder; michael@0: michael@0: //Size of above table michael@0: uint32_t mTableSize; michael@0: michael@0: //This is a constant value varies from language to language, it is used in michael@0: //calculating confidence. See my paper for further detail. michael@0: float mTypicalDistributionRatio; michael@0: }; michael@0: michael@0: michael@0: class EUCTWDistributionAnalysis: public CharDistributionAnalysis michael@0: { michael@0: public: michael@0: EUCTWDistributionAnalysis(); michael@0: protected: michael@0: michael@0: //for euc-TW encoding, we are interested michael@0: // first byte range: 0xc4 -- 0xfe michael@0: // second byte range: 0xa1 -- 0xfe michael@0: //no validation needed here. State machine has done that michael@0: int32_t GetOrder(const char* str) michael@0: { if ((unsigned char)*str >= (unsigned char)0xc4) michael@0: return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; michael@0: else michael@0: return -1; michael@0: } michael@0: }; michael@0: michael@0: michael@0: class EUCKRDistributionAnalysis : public CharDistributionAnalysis michael@0: { michael@0: public: michael@0: EUCKRDistributionAnalysis(); michael@0: protected: michael@0: //for euc-KR encoding, we are interested michael@0: // first byte range: 0xb0 -- 0xfe michael@0: // second byte range: 0xa1 -- 0xfe michael@0: //no validation needed here. State machine has done that michael@0: int32_t GetOrder(const char* str) michael@0: { if ((unsigned char)*str >= (unsigned char)0xb0) michael@0: return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; michael@0: else michael@0: return -1; michael@0: } michael@0: }; michael@0: michael@0: class GB2312DistributionAnalysis : public CharDistributionAnalysis michael@0: { michael@0: public: michael@0: GB2312DistributionAnalysis(); michael@0: protected: michael@0: //for GB2312 encoding, we are interested michael@0: // first byte range: 0xb0 -- 0xfe michael@0: // second byte range: 0xa1 -- 0xfe michael@0: //no validation needed here. State machine has done that michael@0: int32_t GetOrder(const char* str) michael@0: { if ((unsigned char)*str >= (unsigned char)0xb0 && (unsigned char)str[1] >= (unsigned char)0xa1) michael@0: return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; michael@0: else michael@0: return -1; michael@0: } michael@0: }; michael@0: michael@0: michael@0: class Big5DistributionAnalysis : public CharDistributionAnalysis michael@0: { michael@0: public: michael@0: Big5DistributionAnalysis(); michael@0: protected: michael@0: //for big5 encoding, we are interested michael@0: // first byte range: 0xa4 -- 0xfe michael@0: // second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe michael@0: //no validation needed here. State machine has done that michael@0: int32_t GetOrder(const char* str) michael@0: { if ((unsigned char)*str >= (unsigned char)0xa4) michael@0: if ((unsigned char)str[1] >= (unsigned char)0xa1) michael@0: return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0xa1 +63; michael@0: else michael@0: return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; michael@0: else michael@0: return -1; michael@0: } michael@0: }; michael@0: michael@0: class SJISDistributionAnalysis : public CharDistributionAnalysis michael@0: { michael@0: public: michael@0: SJISDistributionAnalysis(); michael@0: protected: michael@0: //for sjis encoding, we are interested michael@0: // first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe michael@0: // second byte range: 0x40 -- 0x7e, 0x81 -- oxfe michael@0: //no validation needed here. State machine has done that michael@0: int32_t GetOrder(const char* str) michael@0: { michael@0: int32_t order; michael@0: if ((unsigned char)*str >= (unsigned char)0x81 && (unsigned char)*str <= (unsigned char)0x9f) michael@0: order = 188 * ((unsigned char)str[0]-(unsigned char)0x81); michael@0: else if ((unsigned char)*str >= (unsigned char)0xe0 && (unsigned char)*str <= (unsigned char)0xef) michael@0: order = 188 * ((unsigned char)str[0]-(unsigned char)0xe0 + 31); michael@0: else michael@0: return -1; michael@0: order += (unsigned char)*(str+1) - 0x40; michael@0: if ((unsigned char)str[1] > (unsigned char)0x7f) michael@0: order--; michael@0: return order; michael@0: } michael@0: }; michael@0: michael@0: class EUCJPDistributionAnalysis : public CharDistributionAnalysis michael@0: { michael@0: public: michael@0: EUCJPDistributionAnalysis(); michael@0: protected: michael@0: //for euc-JP encoding, we are interested michael@0: // first byte range: 0xa0 -- 0xfe michael@0: // second byte range: 0xa1 -- 0xfe michael@0: //no validation needed here. State machine has done that michael@0: int32_t GetOrder(const char* str) michael@0: { if ((unsigned char)*str >= (unsigned char)0xa0) michael@0: return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; michael@0: else michael@0: return -1; michael@0: } michael@0: }; michael@0: michael@0: #endif //CharDistribution_h__ michael@0: