extensions/universalchardet/src/base/nsLatin1Prober.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/extensions/universalchardet/src/base/nsLatin1Prober.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,149 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "nsLatin1Prober.h"
    1.10 +#include "prmem.h"
    1.11 +#include <stdio.h>
    1.12 +
    1.13 +#define UDF    0        // undefined
    1.14 +#define OTH    1        //other
    1.15 +#define ASC    2        // ascii capital letter
    1.16 +#define ASS    3        // ascii small letter
    1.17 +#define ACV    4        // accent capital vowel
    1.18 +#define ACO    5        // accent capital other
    1.19 +#define ASV    6        // accent small vowel
    1.20 +#define ASO    7        // accent small other
    1.21 +#define CLASS_NUM   8    // total classes
    1.22 +
    1.23 +static const unsigned char Latin1_CharToClass[] = 
    1.24 +{
    1.25 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
    1.26 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
    1.27 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
    1.28 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
    1.29 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
    1.30 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
    1.31 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
    1.32 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
    1.33 +  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
    1.34 +  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
    1.35 +  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
    1.36 +  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
    1.37 +  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
    1.38 +  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
    1.39 +  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
    1.40 +  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
    1.41 +  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
    1.42 +  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
    1.43 +  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
    1.44 +  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
    1.45 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
    1.46 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
    1.47 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
    1.48 +  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
    1.49 +  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
    1.50 +  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
    1.51 +  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
    1.52 +  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
    1.53 +  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
    1.54 +  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
    1.55 +  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
    1.56 +  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
    1.57 +};
    1.58 +
    1.59 +
    1.60 +/* 0 : illegal 
    1.61 +   1 : very unlikely 
    1.62 +   2 : normal 
    1.63 +   3 : very likely
    1.64 +*/
    1.65 +static const unsigned char Latin1ClassModel[] = 
    1.66 +{
    1.67 +/*      UDF OTH ASC ASS ACV ACO ASV ASO  */
    1.68 +/*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
    1.69 +/*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
    1.70 +/*ASC*/  0,  3,  3,  3,  3,  3,  3,  3, 
    1.71 +/*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
    1.72 +/*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
    1.73 +/*ACO*/  0,  3,  3,  3,  3,  3,  3,  3, 
    1.74 +/*ASV*/  0,  3,  1,  3,  1,  1,  1,  3, 
    1.75 +/*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
    1.76 +};
    1.77 +
    1.78 +void  nsLatin1Prober::Reset(void)
    1.79 +{
    1.80 +  mState = eDetecting;
    1.81 +  mLastCharClass = OTH;
    1.82 +  for (int i = 0; i < FREQ_CAT_NUM; i++)
    1.83 +    mFreqCounter[i] = 0;
    1.84 +}
    1.85 +
    1.86 +
    1.87 +nsProbingState nsLatin1Prober::HandleData(const char* aBuf, uint32_t aLen)
    1.88 +{
    1.89 +  char *newBuf1 = 0;
    1.90 +  uint32_t newLen1 = 0;
    1.91 +
    1.92 +  if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
    1.93 +    newBuf1 = (char*)aBuf;
    1.94 +    newLen1 = aLen;
    1.95 +  }
    1.96 +  
    1.97 +  unsigned char charClass;
    1.98 +  unsigned char freq;
    1.99 +  for (uint32_t i = 0; i < newLen1; i++)
   1.100 +  {
   1.101 +    charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
   1.102 +    freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
   1.103 +    if (freq == 0) {
   1.104 +      mState = eNotMe;
   1.105 +      break;
   1.106 +    }
   1.107 +    mFreqCounter[freq]++;
   1.108 +    mLastCharClass = charClass;
   1.109 +  }
   1.110 +
   1.111 +  if (newBuf1 != aBuf)
   1.112 +    PR_FREEIF(newBuf1);
   1.113 +
   1.114 +  return mState;
   1.115 +}
   1.116 +
   1.117 +float nsLatin1Prober::GetConfidence(void)
   1.118 +{
   1.119 +  if (mState == eNotMe)
   1.120 +    return 0.01f;
   1.121 +  
   1.122 +  float confidence;
   1.123 +  uint32_t total = 0;
   1.124 +  for (int32_t i = 0; i < FREQ_CAT_NUM; i++)
   1.125 +    total += mFreqCounter[i];
   1.126 +
   1.127 +  if(!total)
   1.128 +    confidence = 0.0f;
   1.129 +  else
   1.130 +  {
   1.131 +    confidence = mFreqCounter[3]*1.0f / total;
   1.132 +    confidence -= mFreqCounter[1]*20.0f/total;
   1.133 +  }
   1.134 +
   1.135 +  if (confidence < 0.0f)
   1.136 +    confidence = 0.0f;
   1.137 +  
   1.138 +  // lower the confidence of latin1 so that other more accurate detector 
   1.139 +  // can take priority.
   1.140 +  confidence *= 0.50f;
   1.141 +
   1.142 +  return confidence;
   1.143 +}
   1.144 +
   1.145 +#ifdef DEBUG_chardet
   1.146 +void  nsLatin1Prober::DumpStatus()
   1.147 +{
   1.148 +  printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
   1.149 +}
   1.150 +#endif
   1.151 +
   1.152 +

mercurial