michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2005-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: #include "csrutf8.h" michael@0: #include "csmatch.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: CharsetRecog_UTF8::~CharsetRecog_UTF8() michael@0: { michael@0: // nothing to do michael@0: } michael@0: michael@0: const char *CharsetRecog_UTF8::getName() const michael@0: { michael@0: return "UTF-8"; michael@0: } michael@0: michael@0: UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { michael@0: bool hasBOM = FALSE; michael@0: int32_t numValid = 0; michael@0: int32_t numInvalid = 0; michael@0: const uint8_t *inputBytes = input->fRawInput; michael@0: int32_t i; michael@0: int32_t trailBytes = 0; michael@0: int32_t confidence; michael@0: michael@0: if (input->fRawLength >= 3 && michael@0: inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) { michael@0: hasBOM = TRUE; michael@0: } michael@0: michael@0: // Scan for multi-byte sequences michael@0: for (i=0; i < input->fRawLength; i += 1) { michael@0: int32_t b = inputBytes[i]; michael@0: michael@0: if ((b & 0x80) == 0) { michael@0: continue; // ASCII michael@0: } michael@0: michael@0: // Hi bit on char found. Figure out how long the sequence should be michael@0: if ((b & 0x0E0) == 0x0C0) { michael@0: trailBytes = 1; michael@0: } else if ((b & 0x0F0) == 0x0E0) { michael@0: trailBytes = 2; michael@0: } else if ((b & 0x0F8) == 0xF0) { michael@0: trailBytes = 3; michael@0: } else { michael@0: numInvalid += 1; michael@0: michael@0: if (numInvalid > 5) { michael@0: break; michael@0: } michael@0: michael@0: trailBytes = 0; michael@0: } michael@0: michael@0: // Verify that we've got the right number of trail bytes in the sequence michael@0: for (;;) { michael@0: i += 1; michael@0: michael@0: if (i >= input->fRawLength) { michael@0: break; michael@0: } michael@0: michael@0: b = inputBytes[i]; michael@0: michael@0: if ((b & 0xC0) != 0x080) { michael@0: numInvalid += 1; michael@0: break; michael@0: } michael@0: michael@0: if (--trailBytes == 0) { michael@0: numValid += 1; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: // Cook up some sort of confidence score, based on presense of a BOM michael@0: // and the existence of valid and/or invalid multi-byte sequences. michael@0: confidence = 0; michael@0: if (hasBOM && numInvalid == 0) { michael@0: confidence = 100; michael@0: } else if (hasBOM && numValid > numInvalid*10) { michael@0: confidence = 80; michael@0: } else if (numValid > 3 && numInvalid == 0) { michael@0: confidence = 100; michael@0: } else if (numValid > 0 && numInvalid == 0) { michael@0: confidence = 80; michael@0: } else if (numValid == 0 && numInvalid == 0) { michael@0: // Plain ASCII. michael@0: confidence = 10; michael@0: } else if (numValid > numInvalid*10) { michael@0: // Probably corruput utf-8 data. Valid sequences aren't likely by chance. michael@0: confidence = 25; michael@0: } michael@0: michael@0: results->set(input, this, confidence); michael@0: return (confidence > 0); michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: #endif