extensions/universalchardet/src/base/nsUniversalDetector.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "nscore.h"
michael@0 7
michael@0 8 #include "nsUniversalDetector.h"
michael@0 9
michael@0 10 #include "nsMBCSGroupProber.h"
michael@0 11 #include "nsSBCSGroupProber.h"
michael@0 12 #include "nsEscCharsetProber.h"
michael@0 13 #include "nsLatin1Prober.h"
michael@0 14
michael@0 15 nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter)
michael@0 16 {
michael@0 17 mDone = false;
michael@0 18 mBestGuess = -1; //illegal value as signal
michael@0 19 mInTag = false;
michael@0 20 mEscCharSetProber = nullptr;
michael@0 21
michael@0 22 mStart = true;
michael@0 23 mDetectedCharset = nullptr;
michael@0 24 mGotData = false;
michael@0 25 mInputState = ePureAscii;
michael@0 26 mLastChar = '\0';
michael@0 27 mLanguageFilter = aLanguageFilter;
michael@0 28
michael@0 29 uint32_t i;
michael@0 30 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
michael@0 31 mCharSetProbers[i] = nullptr;
michael@0 32 }
michael@0 33
michael@0 34 nsUniversalDetector::~nsUniversalDetector()
michael@0 35 {
michael@0 36 for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
michael@0 37 delete mCharSetProbers[i];
michael@0 38
michael@0 39 delete mEscCharSetProber;
michael@0 40 }
michael@0 41
michael@0 42 void
michael@0 43 nsUniversalDetector::Reset()
michael@0 44 {
michael@0 45 mDone = false;
michael@0 46 mBestGuess = -1; //illegal value as signal
michael@0 47 mInTag = false;
michael@0 48
michael@0 49 mStart = true;
michael@0 50 mDetectedCharset = nullptr;
michael@0 51 mGotData = false;
michael@0 52 mInputState = ePureAscii;
michael@0 53 mLastChar = '\0';
michael@0 54
michael@0 55 if (mEscCharSetProber)
michael@0 56 mEscCharSetProber->Reset();
michael@0 57
michael@0 58 uint32_t i;
michael@0 59 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
michael@0 60 if (mCharSetProbers[i])
michael@0 61 mCharSetProbers[i]->Reset();
michael@0 62 }
michael@0 63
//---------------------------------------------------------------------
// Confidence cut-offs, both of type float.
// MINIMUM_THRESHOLD: DataEnd() reports a charset only when the best prober's
// confidence is strictly greater than this value.
// SHORTCUT_THRESHOLD: not referenced by the code visible in this file;
// presumably consumed by the probers -- TODO confirm.
#define SHORTCUT_THRESHOLD static_cast<float>(0.95)
#define MINIMUM_THRESHOLD  static_cast<float>(0.20)
michael@0 67
michael@0 68 nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen)
michael@0 69 {
michael@0 70 if(mDone)
michael@0 71 return NS_OK;
michael@0 72
michael@0 73 if (aLen > 0)
michael@0 74 mGotData = true;
michael@0 75
michael@0 76 //If the data starts with BOM, we know it is UTF
michael@0 77 if (mStart)
michael@0 78 {
michael@0 79 mStart = false;
michael@0 80 if (aLen >= 2) {
michael@0 81 switch (aBuf[0]) {
michael@0 82 case '\xEF':
michael@0 83 if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) {
michael@0 84 // EF BB BF UTF-8 encoded BOM
michael@0 85 mDetectedCharset = "UTF-8";
michael@0 86 }
michael@0 87 break;
michael@0 88 case '\xFE':
michael@0 89 if ('\xFF' == aBuf[1]) {
michael@0 90 // FE FF UTF-16, big endian BOM
michael@0 91 mDetectedCharset = "UTF-16BE";
michael@0 92 }
michael@0 93 break;
michael@0 94 case '\xFF':
michael@0 95 if ('\xFE' == aBuf[1]) {
michael@0 96 // FF FE UTF-16, little endian BOM
michael@0 97 mDetectedCharset = "UTF-16LE";
michael@0 98 }
michael@0 99 break;
michael@0 100 } // switch
michael@0 101 }
michael@0 102
michael@0 103 if (mDetectedCharset)
michael@0 104 {
michael@0 105 mDone = true;
michael@0 106 return NS_OK;
michael@0 107 }
michael@0 108 }
michael@0 109
michael@0 110 uint32_t i;
michael@0 111 for (i = 0; i < aLen; i++)
michael@0 112 {
michael@0 113 //other than 0xa0, if every othe character is ascii, the page is ascii
michael@0 114 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
michael@0 115 {
michael@0 116 //we got a non-ascii byte (high-byte)
michael@0 117 if (mInputState != eHighbyte)
michael@0 118 {
michael@0 119 //adjust state
michael@0 120 mInputState = eHighbyte;
michael@0 121
michael@0 122 //kill mEscCharSetProber if it is active
michael@0 123 if (mEscCharSetProber) {
michael@0 124 delete mEscCharSetProber;
michael@0 125 mEscCharSetProber = nullptr;
michael@0 126 }
michael@0 127
michael@0 128 //start multibyte and singlebyte charset prober
michael@0 129 if (nullptr == mCharSetProbers[0])
michael@0 130 {
michael@0 131 mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
michael@0 132 if (nullptr == mCharSetProbers[0])
michael@0 133 return NS_ERROR_OUT_OF_MEMORY;
michael@0 134 }
michael@0 135 if (nullptr == mCharSetProbers[1] &&
michael@0 136 (mLanguageFilter & NS_FILTER_NON_CJK))
michael@0 137 {
michael@0 138 mCharSetProbers[1] = new nsSBCSGroupProber;
michael@0 139 if (nullptr == mCharSetProbers[1])
michael@0 140 return NS_ERROR_OUT_OF_MEMORY;
michael@0 141 }
michael@0 142 if (nullptr == mCharSetProbers[2])
michael@0 143 {
michael@0 144 mCharSetProbers[2] = new nsLatin1Prober;
michael@0 145 if (nullptr == mCharSetProbers[2])
michael@0 146 return NS_ERROR_OUT_OF_MEMORY;
michael@0 147 }
michael@0 148 }
michael@0 149 }
michael@0 150 else
michael@0 151 {
michael@0 152 //ok, just pure ascii so far
michael@0 153 if ( ePureAscii == mInputState &&
michael@0 154 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
michael@0 155 {
michael@0 156 //found escape character or HZ "~{"
michael@0 157 mInputState = eEscAscii;
michael@0 158 }
michael@0 159 mLastChar = aBuf[i];
michael@0 160 }
michael@0 161 }
michael@0 162
michael@0 163 nsProbingState st;
michael@0 164 switch (mInputState)
michael@0 165 {
michael@0 166 case eEscAscii:
michael@0 167 if (nullptr == mEscCharSetProber) {
michael@0 168 mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
michael@0 169 if (nullptr == mEscCharSetProber)
michael@0 170 return NS_ERROR_OUT_OF_MEMORY;
michael@0 171 }
michael@0 172 st = mEscCharSetProber->HandleData(aBuf, aLen);
michael@0 173 if (st == eFoundIt)
michael@0 174 {
michael@0 175 mDone = true;
michael@0 176 mDetectedCharset = mEscCharSetProber->GetCharSetName();
michael@0 177 }
michael@0 178 break;
michael@0 179 case eHighbyte:
michael@0 180 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
michael@0 181 {
michael@0 182 if (mCharSetProbers[i])
michael@0 183 {
michael@0 184 st = mCharSetProbers[i]->HandleData(aBuf, aLen);
michael@0 185 if (st == eFoundIt)
michael@0 186 {
michael@0 187 mDone = true;
michael@0 188 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
michael@0 189 return NS_OK;
michael@0 190 }
michael@0 191 }
michael@0 192 }
michael@0 193 break;
michael@0 194
michael@0 195 default: //pure ascii
michael@0 196 ;//do nothing here
michael@0 197 }
michael@0 198 return NS_OK;
michael@0 199 }
michael@0 200
michael@0 201
michael@0 202 //---------------------------------------------------------------------
michael@0 203 void nsUniversalDetector::DataEnd()
michael@0 204 {
michael@0 205 if (!mGotData)
michael@0 206 {
michael@0 207 // we haven't got any data yet, return immediately
michael@0 208 // caller program sometimes call DataEnd before anything has been sent to detector
michael@0 209 return;
michael@0 210 }
michael@0 211
michael@0 212 if (mDetectedCharset)
michael@0 213 {
michael@0 214 mDone = true;
michael@0 215 Report(mDetectedCharset);
michael@0 216 return;
michael@0 217 }
michael@0 218
michael@0 219 switch (mInputState)
michael@0 220 {
michael@0 221 case eHighbyte:
michael@0 222 {
michael@0 223 float proberConfidence;
michael@0 224 float maxProberConfidence = (float)0.0;
michael@0 225 int32_t maxProber = 0;
michael@0 226
michael@0 227 for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
michael@0 228 {
michael@0 229 if (mCharSetProbers[i])
michael@0 230 {
michael@0 231 proberConfidence = mCharSetProbers[i]->GetConfidence();
michael@0 232 if (proberConfidence > maxProberConfidence)
michael@0 233 {
michael@0 234 maxProberConfidence = proberConfidence;
michael@0 235 maxProber = i;
michael@0 236 }
michael@0 237 }
michael@0 238 }
michael@0 239 //do not report anything because we are not confident of it, that's in fact a negative answer
michael@0 240 if (maxProberConfidence > MINIMUM_THRESHOLD)
michael@0 241 Report(mCharSetProbers[maxProber]->GetCharSetName());
michael@0 242 }
michael@0 243 break;
michael@0 244 case eEscAscii:
michael@0 245 break;
michael@0 246 default:
michael@0 247 ;
michael@0 248 }
michael@0 249 return;
michael@0 250 }

mercurial