michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nscore.h" michael@0: michael@0: #include "nsUniversalDetector.h" michael@0: michael@0: #include "nsMBCSGroupProber.h" michael@0: #include "nsSBCSGroupProber.h" michael@0: #include "nsEscCharsetProber.h" michael@0: #include "nsLatin1Prober.h" michael@0: michael@0: nsUniversalDetector::nsUniversalDetector(uint32_t aLanguageFilter) michael@0: { michael@0: mDone = false; michael@0: mBestGuess = -1; //illegal value as signal michael@0: mInTag = false; michael@0: mEscCharSetProber = nullptr; michael@0: michael@0: mStart = true; michael@0: mDetectedCharset = nullptr; michael@0: mGotData = false; michael@0: mInputState = ePureAscii; michael@0: mLastChar = '\0'; michael@0: mLanguageFilter = aLanguageFilter; michael@0: michael@0: uint32_t i; michael@0: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) michael@0: mCharSetProbers[i] = nullptr; michael@0: } michael@0: michael@0: nsUniversalDetector::~nsUniversalDetector() michael@0: { michael@0: for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) michael@0: delete mCharSetProbers[i]; michael@0: michael@0: delete mEscCharSetProber; michael@0: } michael@0: michael@0: void michael@0: nsUniversalDetector::Reset() michael@0: { michael@0: mDone = false; michael@0: mBestGuess = -1; //illegal value as signal michael@0: mInTag = false; michael@0: michael@0: mStart = true; michael@0: mDetectedCharset = nullptr; michael@0: mGotData = false; michael@0: mInputState = ePureAscii; michael@0: mLastChar = '\0'; michael@0: michael@0: if (mEscCharSetProber) michael@0: mEscCharSetProber->Reset(); michael@0: michael@0: uint32_t i; michael@0: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) michael@0: if (mCharSetProbers[i]) michael@0: mCharSetProbers[i]->Reset(); michael@0: } michael@0: michael@0: //--------------------------------------------------------------------- michael@0: #define SHORTCUT_THRESHOLD (float)0.95 michael@0: #define MINIMUM_THRESHOLD (float)0.20 michael@0: michael@0: nsresult nsUniversalDetector::HandleData(const char* aBuf, uint32_t aLen) michael@0: { michael@0: if(mDone) michael@0: return NS_OK; michael@0: michael@0: if (aLen > 0) michael@0: mGotData = true; michael@0: michael@0: //If the data starts with BOM, we know it is UTF michael@0: if (mStart) michael@0: { michael@0: mStart = false; michael@0: if (aLen >= 2) { michael@0: switch (aBuf[0]) { michael@0: case '\xEF': michael@0: if ((aLen > 2) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2])) { michael@0: // EF BB BF UTF-8 encoded BOM michael@0: mDetectedCharset = "UTF-8"; michael@0: } michael@0: break; michael@0: case '\xFE': michael@0: if ('\xFF' == aBuf[1]) { michael@0: // FE FF UTF-16, big endian BOM michael@0: mDetectedCharset = "UTF-16BE"; michael@0: } michael@0: break; michael@0: case '\xFF': michael@0: if ('\xFE' == aBuf[1]) { michael@0: // FF FE UTF-16, little endian BOM michael@0: mDetectedCharset = "UTF-16LE"; michael@0: } michael@0: break; michael@0: } // switch michael@0: } michael@0: michael@0: if (mDetectedCharset) michael@0: { michael@0: mDone = true; michael@0: return NS_OK; michael@0: } michael@0: } michael@0: michael@0: uint32_t i; michael@0: for (i = 0; i < aLen; i++) michael@0: { michael@0: //other than 0xa0, if every othe character is ascii, the page is ascii michael@0: if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP michael@0: { michael@0: //we got a non-ascii byte (high-byte) michael@0: if (mInputState != eHighbyte) michael@0: { michael@0: //adjust state michael@0: mInputState = eHighbyte; michael@0: michael@0: //kill mEscCharSetProber if it is active michael@0: if (mEscCharSetProber) { michael@0: delete mEscCharSetProber; michael@0: mEscCharSetProber = nullptr; michael@0: } michael@0: michael@0: //start multibyte and singlebyte charset prober michael@0: if (nullptr == mCharSetProbers[0]) michael@0: { michael@0: mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter); michael@0: if (nullptr == mCharSetProbers[0]) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: } michael@0: if (nullptr == mCharSetProbers[1] && michael@0: (mLanguageFilter & NS_FILTER_NON_CJK)) michael@0: { michael@0: mCharSetProbers[1] = new nsSBCSGroupProber; michael@0: if (nullptr == mCharSetProbers[1]) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: } michael@0: if (nullptr == mCharSetProbers[2]) michael@0: { michael@0: mCharSetProbers[2] = new nsLatin1Prober; michael@0: if (nullptr == mCharSetProbers[2]) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: } michael@0: } michael@0: } michael@0: else michael@0: { michael@0: //ok, just pure ascii so far michael@0: if ( ePureAscii == mInputState && michael@0: (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) michael@0: { michael@0: //found escape character or HZ "~{" michael@0: mInputState = eEscAscii; michael@0: } michael@0: mLastChar = aBuf[i]; michael@0: } michael@0: } michael@0: michael@0: nsProbingState st; michael@0: switch (mInputState) michael@0: { michael@0: case eEscAscii: michael@0: if (nullptr == mEscCharSetProber) { michael@0: mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter); michael@0: if (nullptr == mEscCharSetProber) michael@0: return NS_ERROR_OUT_OF_MEMORY; michael@0: } michael@0: st = mEscCharSetProber->HandleData(aBuf, aLen); michael@0: if (st == eFoundIt) michael@0: { michael@0: mDone = true; michael@0: mDetectedCharset = mEscCharSetProber->GetCharSetName(); michael@0: } michael@0: break; michael@0: case eHighbyte: michael@0: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) michael@0: { michael@0: if (mCharSetProbers[i]) michael@0: { michael@0: st = mCharSetProbers[i]->HandleData(aBuf, aLen); michael@0: if (st == eFoundIt) michael@0: { michael@0: mDone = true; michael@0: mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); michael@0: return NS_OK; michael@0: } michael@0: } michael@0: } michael@0: break; michael@0: michael@0: default: //pure ascii michael@0: ;//do nothing here michael@0: } michael@0: return NS_OK; michael@0: } michael@0: michael@0: michael@0: //--------------------------------------------------------------------- michael@0: void nsUniversalDetector::DataEnd() michael@0: { michael@0: if (!mGotData) michael@0: { michael@0: // we haven't got any data yet, return immediately michael@0: // caller program sometimes call DataEnd before anything has been sent to detector michael@0: return; michael@0: } michael@0: michael@0: if (mDetectedCharset) michael@0: { michael@0: mDone = true; michael@0: Report(mDetectedCharset); michael@0: return; michael@0: } michael@0: michael@0: switch (mInputState) michael@0: { michael@0: case eHighbyte: michael@0: { michael@0: float proberConfidence; michael@0: float maxProberConfidence = (float)0.0; michael@0: int32_t maxProber = 0; michael@0: michael@0: for (int32_t i = 0; i < NUM_OF_CHARSET_PROBERS; i++) michael@0: { michael@0: if (mCharSetProbers[i]) michael@0: { michael@0: proberConfidence = mCharSetProbers[i]->GetConfidence(); michael@0: if (proberConfidence > maxProberConfidence) michael@0: { michael@0: maxProberConfidence = proberConfidence; michael@0: maxProber = i; michael@0: } michael@0: } michael@0: } michael@0: //do not report anything because we are not confident of it, that's in fact a negative answer michael@0: if (maxProberConfidence > MINIMUM_THRESHOLD) michael@0: Report(mCharSetProbers[maxProber]->GetCharSetName()); michael@0: } michael@0: break; michael@0: case eEscAscii: michael@0: break; michael@0: default: michael@0: ; michael@0: } michael@0: return; michael@0: }