1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/csrutf8.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,113 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2012, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + */ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_CONVERSION 1.14 + 1.15 +#include "csrutf8.h" 1.16 +#include "csmatch.h" 1.17 + 1.18 +U_NAMESPACE_BEGIN 1.19 + 1.20 +CharsetRecog_UTF8::~CharsetRecog_UTF8() 1.21 +{ 1.22 + // nothing to do 1.23 +} 1.24 + 1.25 +const char *CharsetRecog_UTF8::getName() const 1.26 +{ 1.27 + return "UTF-8"; 1.28 +} 1.29 + 1.30 +UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { 1.31 + bool hasBOM = FALSE; 1.32 + int32_t numValid = 0; 1.33 + int32_t numInvalid = 0; 1.34 + const uint8_t *inputBytes = input->fRawInput; 1.35 + int32_t i; 1.36 + int32_t trailBytes = 0; 1.37 + int32_t confidence; 1.38 + 1.39 + if (input->fRawLength >= 3 && 1.40 + inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) { 1.41 + hasBOM = TRUE; 1.42 + } 1.43 + 1.44 + // Scan for multi-byte sequences 1.45 + for (i=0; i < input->fRawLength; i += 1) { 1.46 + int32_t b = inputBytes[i]; 1.47 + 1.48 + if ((b & 0x80) == 0) { 1.49 + continue; // ASCII 1.50 + } 1.51 + 1.52 + // Hi bit on char found. Figure out how long the sequence should be 1.53 + if ((b & 0x0E0) == 0x0C0) { 1.54 + trailBytes = 1; 1.55 + } else if ((b & 0x0F0) == 0x0E0) { 1.56 + trailBytes = 2; 1.57 + } else if ((b & 0x0F8) == 0xF0) { 1.58 + trailBytes = 3; 1.59 + } else { 1.60 + numInvalid += 1; 1.61 + 1.62 + if (numInvalid > 5) { 1.63 + break; 1.64 + } 1.65 + 1.66 + trailBytes = 0; 1.67 + } 1.68 + 1.69 + // Verify that we've got the right number of trail bytes in the sequence 1.70 + for (;;) { 1.71 + i += 1; 1.72 + 1.73 + if (i >= input->fRawLength) { 1.74 + break; 1.75 + } 1.76 + 1.77 + b = inputBytes[i]; 1.78 + 1.79 + if ((b & 0xC0) != 0x080) { 1.80 + numInvalid += 1; 1.81 + break; 1.82 + } 1.83 + 1.84 + if (--trailBytes == 0) { 1.85 + numValid += 1; 1.86 + break; 1.87 + } 1.88 + } 1.89 + 1.90 + } 1.91 + 1.92 + // Cook up some sort of confidence score, based on presense of a BOM 1.93 + // and the existence of valid and/or invalid multi-byte sequences. 1.94 + confidence = 0; 1.95 + if (hasBOM && numInvalid == 0) { 1.96 + confidence = 100; 1.97 + } else if (hasBOM && numValid > numInvalid*10) { 1.98 + confidence = 80; 1.99 + } else if (numValid > 3 && numInvalid == 0) { 1.100 + confidence = 100; 1.101 + } else if (numValid > 0 && numInvalid == 0) { 1.102 + confidence = 80; 1.103 + } else if (numValid == 0 && numInvalid == 0) { 1.104 + // Plain ASCII. 1.105 + confidence = 10; 1.106 + } else if (numValid > numInvalid*10) { 1.107 + // Probably corruput utf-8 data. Valid sequences aren't likely by chance. 1.108 + confidence = 25; 1.109 + } 1.110 + 1.111 + results->set(input, this, confidence); 1.112 + return (confidence > 0); 1.113 +} 1.114 + 1.115 +U_NAMESPACE_END 1.116 +#endif