extensions/universalchardet/src/base/nsLatin1Prober.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "nsLatin1Prober.h"
     7 #include "prmem.h"
     8 #include <stdio.h>
    10 #define UDF    0        // undefined
    11 #define OTH    1        //other
    12 #define ASC    2        // ascii capital letter
    13 #define ASS    3        // ascii small letter
    14 #define ACV    4        // accent capital vowel
    15 #define ACO    5        // accent capital other
    16 #define ASV    6        // accent small vowel
    17 #define ASO    7        // accent small other
    18 #define CLASS_NUM   8    // total classes
    20 static const unsigned char Latin1_CharToClass[] = 
    21 {
    22   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
    23   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
    24   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
    25   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
    26   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
    27   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
    28   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
    29   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
    30   OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
    31   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
    32   ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
    33   ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
    34   OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
    35   ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
    36   ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
    37   ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
    38   OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
    39   OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
    40   UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
    41   OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
    42   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
    43   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
    44   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
    45   OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
    46   ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
    47   ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
    48   ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
    49   ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
    50   ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
    51   ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
    52   ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
    53   ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
    54 };
    57 /* 0 : illegal 
    58    1 : very unlikely 
    59    2 : normal 
    60    3 : very likely
    61 */
    62 static const unsigned char Latin1ClassModel[] = 
    63 {
    64 /*      UDF OTH ASC ASS ACV ACO ASV ASO  */
    65 /*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
    66 /*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
    67 /*ASC*/  0,  3,  3,  3,  3,  3,  3,  3, 
    68 /*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
    69 /*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
    70 /*ACO*/  0,  3,  3,  3,  3,  3,  3,  3, 
    71 /*ASV*/  0,  3,  1,  3,  1,  1,  1,  3, 
    72 /*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
    73 };
    75 void  nsLatin1Prober::Reset(void)
    76 {
    77   mState = eDetecting;
    78   mLastCharClass = OTH;
    79   for (int i = 0; i < FREQ_CAT_NUM; i++)
    80     mFreqCounter[i] = 0;
    81 }
    84 nsProbingState nsLatin1Prober::HandleData(const char* aBuf, uint32_t aLen)
    85 {
    86   char *newBuf1 = 0;
    87   uint32_t newLen1 = 0;
    89   if (!FilterWithEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) {
    90     newBuf1 = (char*)aBuf;
    91     newLen1 = aLen;
    92   }
    94   unsigned char charClass;
    95   unsigned char freq;
    96   for (uint32_t i = 0; i < newLen1; i++)
    97   {
    98     charClass = Latin1_CharToClass[(unsigned char)newBuf1[i]];
    99     freq = Latin1ClassModel[mLastCharClass*CLASS_NUM + charClass];
   100     if (freq == 0) {
   101       mState = eNotMe;
   102       break;
   103     }
   104     mFreqCounter[freq]++;
   105     mLastCharClass = charClass;
   106   }
   108   if (newBuf1 != aBuf)
   109     PR_FREEIF(newBuf1);
   111   return mState;
   112 }
   114 float nsLatin1Prober::GetConfidence(void)
   115 {
   116   if (mState == eNotMe)
   117     return 0.01f;
   119   float confidence;
   120   uint32_t total = 0;
   121   for (int32_t i = 0; i < FREQ_CAT_NUM; i++)
   122     total += mFreqCounter[i];
   124   if(!total)
   125     confidence = 0.0f;
   126   else
   127   {
   128     confidence = mFreqCounter[3]*1.0f / total;
   129     confidence -= mFreqCounter[1]*20.0f/total;
   130   }
   132   if (confidence < 0.0f)
   133     confidence = 0.0f;
   135   // lower the confidence of latin1 so that other more accurate detector 
   136   // can take priority.
   137   confidence *= 0.50f;
   139   return confidence;
   140 }
   142 #ifdef DEBUG_chardet
   143 void  nsLatin1Prober::DumpStatus()
   144 {
   145   printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName());
   146 }
   147 #endif

mercurial