1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/extensions/universalchardet/src/base/JpCntx.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,107 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#ifndef __JPCNTX_H__ 1.10 +#define __JPCNTX_H__ 1.11 + 1.12 +#define NUM_OF_CATEGORY 6 1.13 + 1.14 +#include "nscore.h" 1.15 + 1.16 +#define ENOUGH_REL_THRESHOLD 100 1.17 +#define MAX_REL_THRESHOLD 1000 1.18 + 1.19 +//hiragana frequency category table 1.20 +extern const uint8_t jp2CharContext[83][83]; 1.21 + 1.22 +class JapaneseContextAnalysis 1.23 +{ 1.24 +public: 1.25 + JapaneseContextAnalysis() {Reset(false);} 1.26 + 1.27 + void HandleData(const char* aBuf, uint32_t aLen); 1.28 + 1.29 + void HandleOneChar(const char* aStr, uint32_t aCharLen) 1.30 + { 1.31 + int32_t order; 1.32 + 1.33 + //if we received enough data, stop here 1.34 + if (mTotalRel > MAX_REL_THRESHOLD) mDone = true; 1.35 + if (mDone) return; 1.36 + 1.37 + //Only 2-bytes characters are of our interest 1.38 + order = (aCharLen == 2) ? GetOrder(aStr) : -1; 1.39 + if (order != -1 && mLastCharOrder != -1) 1.40 + { 1.41 + mTotalRel++; 1.42 + //count this sequence to its category counter 1.43 + mRelSample[jp2CharContext[mLastCharOrder][order]]++; 1.44 + } 1.45 + mLastCharOrder = order; 1.46 + } 1.47 + 1.48 + float GetConfidence(void); 1.49 + void Reset(bool aIsPreferredLanguage); 1.50 + bool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} 1.51 + 1.52 +protected: 1.53 + virtual int32_t GetOrder(const char* str, uint32_t *charLen) = 0; 1.54 + virtual int32_t GetOrder(const char* str) = 0; 1.55 + 1.56 + //category counters, each integer counts sequences in its category 1.57 + uint32_t mRelSample[NUM_OF_CATEGORY]; 1.58 + 1.59 + //total sequence received 1.60 + uint32_t mTotalRel; 1.61 + 1.62 + //Number of sequences needed to trigger detection 1.63 + uint32_t mDataThreshold; 1.64 + 1.65 + //The order of previous char 1.66 + int32_t mLastCharOrder; 1.67 + 1.68 + //if last byte in current buffer is not the last byte of a character, we 1.69 + //need to know how many byte to skip in next buffer. 1.70 + uint32_t mNeedToSkipCharNum; 1.71 + 1.72 + //If this flag is set to true, detection is done and conclusion has been made 1.73 + bool mDone; 1.74 +}; 1.75 + 1.76 + 1.77 +class SJISContextAnalysis : public JapaneseContextAnalysis 1.78 +{ 1.79 + //SJISContextAnalysis(){}; 1.80 +protected: 1.81 + int32_t GetOrder(const char* str, uint32_t *charLen); 1.82 + 1.83 + int32_t GetOrder(const char* str) 1.84 + { 1.85 + //We only interested in Hiragana, so first byte is '\202' 1.86 + if (*str == '\202' && 1.87 + (unsigned char)*(str+1) >= (unsigned char)0x9f && 1.88 + (unsigned char)*(str+1) <= (unsigned char)0xf1) 1.89 + return (unsigned char)*(str+1) - (unsigned char)0x9f; 1.90 + return -1; 1.91 + } 1.92 +}; 1.93 + 1.94 +class EUCJPContextAnalysis : public JapaneseContextAnalysis 1.95 +{ 1.96 +protected: 1.97 + int32_t GetOrder(const char* str, uint32_t *charLen); 1.98 + int32_t GetOrder(const char* str) 1.99 + //We only interested in Hiragana, so first byte is '\244' 1.100 + { 1.101 + if (*str == '\244' && 1.102 + (unsigned char)*(str+1) >= (unsigned char)0xa1 && 1.103 + (unsigned char)*(str+1) <= (unsigned char)0xf3) 1.104 + return (unsigned char)*(str+1) - (unsigned char)0xa1; 1.105 + return -1; 1.106 + } 1.107 +}; 1.108 + 1.109 +#endif /* __JPCNTX_H__ */ 1.110 +