extensions/universalchardet/src/base/nsUniversalDetector.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "nscore.h"
     8 #include "nsUniversalDetector.h"
    10 #include "nsMBCSGroupProber.h"
    11 #include "nsSBCSGroupProber.h"
    12 #include "nsEscCharsetProber.h"
    13 #include "nsLatin1Prober.h"
    15 nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter)
    16 {
    17   mDone = false;
    18   mBestGuess = -1;   //illegal value as signal
    19   mInTag = false;
    20   mEscCharSetProber = nullptr;
    22   mStart = true;
    23   mDetectedCharset = nullptr;
    24   mGotData = false;
    25   mInputState = ePureAscii;
    26   mLastChar = '\0';
    27   mLanguageFilter = aLanguageFilter;
    29   uint32_t i;
    30   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    31     mCharSetProbers[i] = nullptr;
    32 }
    34 nsUniversalDetector::~nsUniversalDetector() 
    35 {
    36   for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    37     delete mCharSetProbers[i];
    39   delete mEscCharSetProber;
    40 }
    42 void 
    43 nsUniversalDetector::Reset()
    44 {
    45   mDone = false;
    46   mBestGuess = -1;   //illegal value as signal
    47   mInTag = false;
    49   mStart = true;
    50   mDetectedCharset = nullptr;
    51   mGotData = false;
    52   mInputState = ePureAscii;
    53   mLastChar = '\0';
    55   if (mEscCharSetProber)
    56     mEscCharSetProber->Reset();
    58   uint32_t i;
    59   for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    60     if (mCharSetProbers[i])
    61       mCharSetProbers[i]->Reset();
    62 }
    64 //---------------------------------------------------------------------
    65 #define SHORTCUT_THRESHOLD      (float)0.95
    66 #define MINIMUM_THRESHOLD      (float)0.20
    68 nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen)
    69 {
    70   if(mDone) 
    71     return NS_OK;
    73   if (aLen > 0)
    74     mGotData = true;
    76   //If the data starts with BOM, we know it is UTF
    77   if (mStart)
    78   {
    79     mStart = false;
    80     if (aLen >= 2) {
    81       switch (aBuf[0]) {
    82       case '\xEF':
    83         if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) {
    84           // EF BB BF  UTF-8 encoded BOM
    85           mDetectedCharset = "UTF-8";
    86         }
    87         break;
    88       case '\xFE':
    89         if ('\xFF' == aBuf[1]) {
    90           // FE FF  UTF-16, big endian BOM
    91           mDetectedCharset = "UTF-16BE";
    92         }
    93         break;
    94       case '\xFF':
    95         if ('\xFE' == aBuf[1]) {
    96           // FF FE  UTF-16, little endian BOM
    97           mDetectedCharset = "UTF-16LE";
    98         }
    99         break;
   100       }  // switch
   101     }
   103     if (mDetectedCharset)
   104     {
   105       mDone = true;
   106       return NS_OK;
   107     }
   108   }
   110   uint32_t i;
   111   for (i = 0; i < aLen; i++)
   112   {
   113     //other than 0xa0, if every othe character is ascii, the page is ascii
   114     if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')  //Since many Ascii only page contains NBSP 
   115     {
   116       //we got a non-ascii byte (high-byte)
   117       if (mInputState != eHighbyte)
   118       {
   119         //adjust state
   120         mInputState = eHighbyte;
   122         //kill mEscCharSetProber if it is active
   123         if (mEscCharSetProber) {
   124           delete mEscCharSetProber;
   125           mEscCharSetProber = nullptr;
   126         }
   128         //start multibyte and singlebyte charset prober
   129         if (nullptr == mCharSetProbers[0])
   130         {
   131           mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
   132           if (nullptr == mCharSetProbers[0])
   133             return NS_ERROR_OUT_OF_MEMORY;
   134         }
   135         if (nullptr == mCharSetProbers[1] &&
   136             (mLanguageFilter & NS_FILTER_NON_CJK))
   137         {
   138           mCharSetProbers[1] = new nsSBCSGroupProber;
   139           if (nullptr == mCharSetProbers[1])
   140             return NS_ERROR_OUT_OF_MEMORY;
   141         }
   142         if (nullptr == mCharSetProbers[2])
   143         {
   144           mCharSetProbers[2] = new nsLatin1Prober; 
   145           if (nullptr == mCharSetProbers[2])
   146             return NS_ERROR_OUT_OF_MEMORY;
   147         }
   148       }
   149     }
   150     else
   151     {
   152       //ok, just pure ascii so far
   153       if ( ePureAscii == mInputState &&
   154         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
   155       {
   156         //found escape character or HZ "~{"
   157         mInputState = eEscAscii;
   158       }
   159       mLastChar = aBuf[i];
   160     }
   161   }
   163   nsProbingState st;
   164   switch (mInputState)
   165   {
   166   case eEscAscii:
   167     if (nullptr == mEscCharSetProber) {
   168       mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
   169       if (nullptr == mEscCharSetProber)
   170         return NS_ERROR_OUT_OF_MEMORY;
   171     }
   172     st = mEscCharSetProber->HandleData(aBuf, aLen);
   173     if (st == eFoundIt)
   174     {
   175       mDone = true;
   176       mDetectedCharset = mEscCharSetProber->GetCharSetName();
   177     }
   178     break;
   179   case eHighbyte:
   180     for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
   181     {
   182       if (mCharSetProbers[i])
   183       {
   184         st = mCharSetProbers[i]->HandleData(aBuf, aLen);
   185         if (st == eFoundIt) 
   186         {
   187           mDone = true;
   188           mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
   189           return NS_OK;
   190         }
   191       } 
   192     }
   193     break;
   195   default:  //pure ascii
   196     ;//do nothing here
   197   }
   198   return NS_OK;
   199 }
   202 //---------------------------------------------------------------------
   203 void nsUniversalDetector::DataEnd()
   204 {
   205   if (!mGotData)
   206   {
   207     // we haven't got any data yet, return immediately 
   208     // caller program sometimes call DataEnd before anything has been sent to detector
   209     return;
   210   }
   212   if (mDetectedCharset)
   213   {
   214     mDone = true;
   215     Report(mDetectedCharset);
   216     return;
   217   }
   219   switch (mInputState)
   220   {
   221   case eHighbyte:
   222     {
   223       float proberConfidence;
   224       float maxProberConfidence = (float)0.0;
   225       int32_t maxProber = 0;
   227       for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
   228       {
   229         if (mCharSetProbers[i])
   230         {
   231           proberConfidence = mCharSetProbers[i]->GetConfidence();
   232           if (proberConfidence > maxProberConfidence)
   233           {
   234             maxProberConfidence = proberConfidence;
   235             maxProber = i;
   236           }
   237         }
   238       }
   239       //do not report anything because we are not confident of it, that's in fact a negative answer
   240       if (maxProberConfidence > MINIMUM_THRESHOLD)
   241         Report(mCharSetProbers[maxProber]->GetCharSetName());
   242     }
   243     break;
   244   case eEscAscii:
   245     break;
   246   default:
   247     ;
   248   }
   249   return;
   250 }

mercurial