extensions/universalchardet/src/base/JpCntx.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/JpCntx.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,107 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#ifndef __JPCNTX_H__
    1.10 +#define __JPCNTX_H__
    1.11 +
    1.12 +#define NUM_OF_CATEGORY 6  // number of counters in JapaneseContextAnalysis::mRelSample
    1.13 +
    1.14 +#include "nscore.h" 
    1.15 +
    1.16 +#define ENOUGH_REL_THRESHOLD  100   // GotEnoughData() returns true once mTotalRel exceeds this
    1.17 +#define MAX_REL_THRESHOLD     1000  // HandleOneChar() sets mDone once mTotalRel exceeds this
    1.18 +
    1.19 +//Hiragana frequency category table, indexed as [order of previous char][order of current char]; each entry is used as an index into mRelSample.
    1.20 +extern const uint8_t jp2CharContext[83][83];
    1.21 +
    1.22 +class JapaneseContextAnalysis  // Abstract base: counts adjacent 2-byte character pairs per frequency category; subclasses supply GetOrder().
    1.23 +{
    1.24 +public:
    1.25 +  JapaneseContextAnalysis() {Reset(false);}  // Reset() is defined outside this header; presumably it initializes the members below -- confirm.
    1.26 +  //Processes a buffer of aLen bytes starting at aBuf (defined outside this header).
    1.27 +  void HandleData(const char* aBuf, uint32_t aLen);
    1.28 +  //Processes one character of aCharLen bytes starting at aStr and updates the pair counters.
    1.29 +  void HandleOneChar(const char* aStr, uint32_t aCharLen)
    1.30 +  {
    1.31 +    int32_t order;
    1.32 +
    1.33 +    //If we have already received enough data, mark detection as done and stop here.
    1.34 +    if (mTotalRel > MAX_REL_THRESHOLD)   mDone = true;
    1.35 +    if (mDone)       return;
    1.36 +     
    1.37 +    //Only 2-byte characters are of interest; anything else gets order -1.
    1.38 +    order = (aCharLen == 2) ? GetOrder(aStr) : -1;
    1.39 +    if (order != -1 && mLastCharOrder != -1)  // both the previous and the current char have a valid order
    1.40 +    {
    1.41 +      mTotalRel++;
    1.42 +      //Count this (previous, current) sequence in its category counter.
    1.43 +      mRelSample[jp2CharContext[mLastCharOrder][order]]++;
    1.44 +    }
    1.45 +    mLastCharOrder = order;  // remembered even when -1, so the next char will not be paired with this one
    1.46 +  }
    1.47 +  //GetConfidence() and Reset() are defined outside this header.
    1.48 +  float GetConfidence(void);
    1.49 +  void      Reset(bool aIsPreferredLanguage);
    1.50 +  bool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}  // true once more than ENOUGH_REL_THRESHOLD sequences were counted
    1.51 +
    1.52 +protected:
    1.53 +  virtual int32_t GetOrder(const char* str, uint32_t *charLen) = 0;  // NOTE(review): presumably also reports the char length via charLen -- confirm in the subclass definitions
    1.54 +  virtual int32_t GetOrder(const char* str) = 0;  // order of the 2-byte char at str, or -1 when it has none
    1.55 +
    1.56 +  //Category counters; each integer counts the sequences seen in its category.
    1.57 +  uint32_t mRelSample[NUM_OF_CATEGORY];
    1.58 +
    1.59 +  //Total number of sequences received
    1.60 +  uint32_t mTotalRel;
    1.61 +
    1.62 +  //Number of sequences needed to trigger detection (not referenced in this header)
    1.63 +  uint32_t mDataThreshold;
    1.64 +  
    1.65 +  //The order of the previous char, or -1 if it had none
    1.66 +  int32_t  mLastCharOrder;
    1.67 +
    1.68 +  //If the last byte in the current buffer is not the last byte of a character, we
    1.69 +  //need to know how many bytes to skip in the next buffer.
    1.70 +  uint32_t mNeedToSkipCharNum;
    1.71 +
    1.72 +  //If this flag is set to true, detection is done and a conclusion has been made
    1.73 +  bool     mDone;
    1.74 +};
    1.75 +
    1.76 +
    1.77 +class SJISContextAnalysis : public JapaneseContextAnalysis  // Supplies the GetOrder() implementations for this encoding.
    1.78 +{
    1.79 +  //SJISContextAnalysis(){};
    1.80 +protected:
    1.81 +  int32_t GetOrder(const char* str, uint32_t *charLen);  // defined outside this header
    1.82 +  //Returns the order (0..82) of the 2-byte character at str, or -1 if it is not in the accepted range.
    1.83 +  int32_t GetOrder(const char* str)
    1.84 +  {
    1.85 +    //We are only interested in Hiragana, so the first byte must be '\202' and the second byte must lie in 0x9f..0xf1.
    1.86 +    if (*str == '\202' && 
    1.87 +          (unsigned char)*(str+1) >= (unsigned char)0x9f && 
    1.88 +          (unsigned char)*(str+1) <= (unsigned char)0xf1)
    1.89 +      return (unsigned char)*(str+1) - (unsigned char)0x9f;  // offset of the second byte from 0x9f
    1.90 +    return -1;
    1.91 +  }
    1.92 +};
    1.93 +
    1.94 +class EUCJPContextAnalysis : public JapaneseContextAnalysis  // Supplies the GetOrder() implementations for this encoding.
    1.95 +{
    1.96 +protected:
    1.97 +  int32_t GetOrder(const char* str, uint32_t *charLen);  // defined outside this header
    1.98 +  int32_t GetOrder(const char* str)  // returns the order (0..82) of the 2-byte character at str, or -1 if it is not in the accepted range
    1.99 +    //We are only interested in Hiragana, so the first byte must be '\244' and the second byte must lie in 0xa1..0xf3.
   1.100 +  {
   1.101 +    if (*str == '\244' &&
   1.102 +          (unsigned char)*(str+1) >= (unsigned char)0xa1 &&
   1.103 +          (unsigned char)*(str+1) <= (unsigned char)0xf3)
   1.104 +      return (unsigned char)*(str+1) - (unsigned char)0xa1;  // offset of the second byte from 0xa1
   1.105 +    return -1;
   1.106 +  }
   1.107 +};
   1.108 +
   1.109 +#endif /* __JPCNTX_H__ */
   1.110 +

mercurial