extensions/universalchardet/src/base/JpCntx.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #ifndef __JPCNTX_H__
     7 #define __JPCNTX_H__
     9 #define NUM_OF_CATEGORY 6  // length of the mRelSample counter array declared below
    11 #include "nscore.h" 
       // mTotalRel must exceed this value for GotEnoughData() to return true.
    13 #define ENOUGH_REL_THRESHOLD  100
       // Once mTotalRel exceeds this value, HandleOneChar() sets mDone and stops counting.
    14 #define MAX_REL_THRESHOLD     1000
    16 // Hiragana frequency category table, indexed as [previous order][current order].
       // HandleOneChar() uses each entry as an index into mRelSample. The inline
       // GetOrder(const char*) overrides in this file return orders 0..82, which
       // matches the 83x83 shape. The table itself is defined outside this header.
    17 extern const uint8_t jp2CharContext[83][83];
    19 class JapaneseContextAnalysis
    20 {
    21 public:
    22   JapaneseContextAnalysis() {Reset(false);}
    24   void HandleData(const char* aBuf, uint32_t aLen);
    26   void HandleOneChar(const char* aStr, uint32_t aCharLen)
    27   {
    28     int32_t order;
    30     //if we received enough data, stop here   
    31     if (mTotalRel > MAX_REL_THRESHOLD)   mDone = true;
    32     if (mDone)       return;
    34     //Only 2-bytes characters are of our interest
    35     order = (aCharLen == 2) ? GetOrder(aStr) : -1;
    36     if (order != -1 && mLastCharOrder != -1)
    37     {
    38       mTotalRel++;
    39       //count this sequence to its category counter
    40       mRelSample[jp2CharContext[mLastCharOrder][order]]++;
    41     }
    42     mLastCharOrder = order;
    43   }
    45   float GetConfidence(void);
    46   void      Reset(bool aIsPreferredLanguage);
    47   bool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
    49 protected:
    50   virtual int32_t GetOrder(const char* str, uint32_t *charLen) = 0;
    51   virtual int32_t GetOrder(const char* str) = 0;
    53   //category counters, each integer counts sequences in its category
    54   uint32_t mRelSample[NUM_OF_CATEGORY];
    56   //total sequence received
    57   uint32_t mTotalRel;
    59   //Number of sequences needed to trigger detection
    60   uint32_t mDataThreshold;
    62   //The order of previous char
    63   int32_t  mLastCharOrder;
    65   //if last byte in current buffer is not the last byte of a character, we
    66   //need to know how many byte to skip in next buffer.
    67   uint32_t mNeedToSkipCharNum;
    69   //If this flag is set to true, detection is done and conclusion has been made
    70   bool     mDone;
    71 };
    74 class SJISContextAnalysis : public JapaneseContextAnalysis
    75 {
    76   //SJISContextAnalysis(){};
    77 protected:
    78   int32_t GetOrder(const char* str, uint32_t *charLen);
    80   int32_t GetOrder(const char* str)
    81   {
    82     //We only interested in Hiragana, so first byte is '\202'
    83     if (*str == '\202' && 
    84           (unsigned char)*(str+1) >= (unsigned char)0x9f && 
    85           (unsigned char)*(str+1) <= (unsigned char)0xf1)
    86       return (unsigned char)*(str+1) - (unsigned char)0x9f;
    87     return -1;
    88   }
    89 };
    91 class EUCJPContextAnalysis : public JapaneseContextAnalysis
    92 {
    93 protected:
    94   int32_t GetOrder(const char* str, uint32_t *charLen);
    95   int32_t GetOrder(const char* str)
    96     //We only interested in Hiragana, so first byte is '\244'
    97   {
    98     if (*str == '\244' &&
    99           (unsigned char)*(str+1) >= (unsigned char)0xa1 &&
   100           (unsigned char)*(str+1) <= (unsigned char)0xf3)
   101       return (unsigned char)*(str+1) - (unsigned char)0xa1;
   102     return -1;
   103   }
   104 };
   106 #endif /* __JPCNTX_H__ */

mercurial