extensions/universalchardet/src/base/JpCntx.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #ifndef __JPCNTX_H__
michael@0 7 #define __JPCNTX_H__
michael@0 8
michael@0 9 #define NUM_OF_CATEGORY 6
michael@0 10
michael@0 11 #include "nscore.h"
michael@0 12
michael@0 13 #define ENOUGH_REL_THRESHOLD 100
michael@0 14 #define MAX_REL_THRESHOLD 1000
michael@0 15
michael@0 16 // Frequency-category table for pairs of consecutive hiragana characters, indexed as [previous char order][current char order].
michael@0 17 extern const uint8_t jp2CharContext[83][83];
michael@0 18
michael@0 19 class JapaneseContextAnalysis
michael@0 20 {
michael@0 21 public:
michael@0 22 JapaneseContextAnalysis() {Reset(false);}
michael@0 23
michael@0 24 void HandleData(const char* aBuf, uint32_t aLen);
michael@0 25
michael@0 26 void HandleOneChar(const char* aStr, uint32_t aCharLen)
michael@0 27 {
michael@0 28 int32_t order;
michael@0 29
michael@0 30 //if we received enough data, stop here
michael@0 31 if (mTotalRel > MAX_REL_THRESHOLD) mDone = true;
michael@0 32 if (mDone) return;
michael@0 33
michael@0 34 //Only 2-bytes characters are of our interest
michael@0 35 order = (aCharLen == 2) ? GetOrder(aStr) : -1;
michael@0 36 if (order != -1 && mLastCharOrder != -1)
michael@0 37 {
michael@0 38 mTotalRel++;
michael@0 39 //count this sequence to its category counter
michael@0 40 mRelSample[jp2CharContext[mLastCharOrder][order]]++;
michael@0 41 }
michael@0 42 mLastCharOrder = order;
michael@0 43 }
michael@0 44
michael@0 45 float GetConfidence(void);
michael@0 46 void Reset(bool aIsPreferredLanguage);
michael@0 47 bool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
michael@0 48
michael@0 49 protected:
michael@0 50 virtual int32_t GetOrder(const char* str, uint32_t *charLen) = 0;
michael@0 51 virtual int32_t GetOrder(const char* str) = 0;
michael@0 52
michael@0 53 //category counters, each integer counts sequences in its category
michael@0 54 uint32_t mRelSample[NUM_OF_CATEGORY];
michael@0 55
michael@0 56 //total sequence received
michael@0 57 uint32_t mTotalRel;
michael@0 58
michael@0 59 //Number of sequences needed to trigger detection
michael@0 60 uint32_t mDataThreshold;
michael@0 61
michael@0 62 //The order of previous char
michael@0 63 int32_t mLastCharOrder;
michael@0 64
michael@0 65 //if last byte in current buffer is not the last byte of a character, we
michael@0 66 //need to know how many byte to skip in next buffer.
michael@0 67 uint32_t mNeedToSkipCharNum;
michael@0 68
michael@0 69 //If this flag is set to true, detection is done and conclusion has been made
michael@0 70 bool mDone;
michael@0 71 };
michael@0 72
michael@0 73
michael@0 74 class SJISContextAnalysis : public JapaneseContextAnalysis
michael@0 75 {
michael@0 76 //SJISContextAnalysis(){};
michael@0 77 protected:
michael@0 78 int32_t GetOrder(const char* str, uint32_t *charLen);
michael@0 79
michael@0 80 int32_t GetOrder(const char* str)
michael@0 81 {
michael@0 82 //We only interested in Hiragana, so first byte is '\202'
michael@0 83 if (*str == '\202' &&
michael@0 84 (unsigned char)*(str+1) >= (unsigned char)0x9f &&
michael@0 85 (unsigned char)*(str+1) <= (unsigned char)0xf1)
michael@0 86 return (unsigned char)*(str+1) - (unsigned char)0x9f;
michael@0 87 return -1;
michael@0 88 }
michael@0 89 };
michael@0 90
michael@0 91 class EUCJPContextAnalysis : public JapaneseContextAnalysis
michael@0 92 {
michael@0 93 protected:
michael@0 94 int32_t GetOrder(const char* str, uint32_t *charLen);
michael@0 95 int32_t GetOrder(const char* str)
michael@0 96 //We only interested in Hiragana, so first byte is '\244'
michael@0 97 {
michael@0 98 if (*str == '\244' &&
michael@0 99 (unsigned char)*(str+1) >= (unsigned char)0xa1 &&
michael@0 100 (unsigned char)*(str+1) <= (unsigned char)0xf3)
michael@0 101 return (unsigned char)*(str+1) - (unsigned char)0xa1;
michael@0 102 return -1;
michael@0 103 }
michael@0 104 };
michael@0 105
michael@0 106 #endif /* __JPCNTX_H__ */
michael@0 107

mercurial