1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/csrucode.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,154 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2013, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + */ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_CONVERSION 1.14 + 1.15 +#include "csrucode.h" 1.16 +#include "csmatch.h" 1.17 + 1.18 +U_NAMESPACE_BEGIN 1.19 + 1.20 +CharsetRecog_Unicode::~CharsetRecog_Unicode() 1.21 +{ 1.22 + // nothing to do 1.23 +} 1.24 + 1.25 +CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() 1.26 +{ 1.27 + // nothing to do 1.28 +} 1.29 + 1.30 +const char *CharsetRecog_UTF_16_BE::getName() const 1.31 +{ 1.32 + return "UTF-16BE"; 1.33 +} 1.34 + 1.35 +UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const 1.36 +{ 1.37 + const uint8_t *input = textIn->fRawInput; 1.38 + int32_t confidence = 0; 1.39 + int32_t length = textIn->fRawLength; 1.40 + 1.41 + if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) { 1.42 + confidence = 100; 1.43 + } 1.44 + 1.45 + // TODO: Do some statastics to check for unsigned UTF-16BE 1.46 + results->set(textIn, this, confidence); 1.47 + return (confidence > 0); 1.48 +} 1.49 + 1.50 +CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() 1.51 +{ 1.52 + // nothing to do 1.53 +} 1.54 + 1.55 +const char *CharsetRecog_UTF_16_LE::getName() const 1.56 +{ 1.57 + return "UTF-16LE"; 1.58 +} 1.59 + 1.60 +UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const 1.61 +{ 1.62 + const uint8_t *input = textIn->fRawInput; 1.63 + int32_t confidence = 0; 1.64 + int32_t length = textIn->fRawLength; 1.65 + 1.66 + if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { 1.67 + confidence = 100; 1.68 + } 1.69 + 1.70 + // TODO: Do some statastics to check for unsigned UTF-16LE 1.71 + results->set(textIn, this, confidence); 1.72 + return (confidence > 0); 1.73 +} 1.74 + 1.75 +CharsetRecog_UTF_32::~CharsetRecog_UTF_32() 1.76 +{ 1.77 + // nothing to do 1.78 +} 1.79 + 1.80 +UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const 1.81 +{ 1.82 + const uint8_t *input = textIn->fRawInput; 1.83 + int32_t limit = (textIn->fRawLength / 4) * 4; 1.84 + int32_t numValid = 0; 1.85 + int32_t numInvalid = 0; 1.86 + bool hasBOM = FALSE; 1.87 + int32_t confidence = 0; 1.88 + 1.89 + if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) { 1.90 + hasBOM = TRUE; 1.91 + } 1.92 + 1.93 + for(int32_t i = 0; i < limit; i += 4) { 1.94 + int32_t ch = getChar(input, i); 1.95 + 1.96 + if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { 1.97 + numInvalid += 1; 1.98 + } else { 1.99 + numValid += 1; 1.100 + } 1.101 + } 1.102 + 1.103 + 1.104 + // Cook up some sort of confidence score, based on presense of a BOM 1.105 + // and the existence of valid and/or invalid multi-byte sequences. 1.106 + if (hasBOM && numInvalid==0) { 1.107 + confidence = 100; 1.108 + } else if (hasBOM && numValid > numInvalid*10) { 1.109 + confidence = 80; 1.110 + } else if (numValid > 3 && numInvalid == 0) { 1.111 + confidence = 100; 1.112 + } else if (numValid > 0 && numInvalid == 0) { 1.113 + confidence = 80; 1.114 + } else if (numValid > numInvalid*10) { 1.115 + // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance. 1.116 + confidence = 25; 1.117 + } 1.118 + 1.119 + results->set(textIn, this, confidence); 1.120 + return (confidence > 0); 1.121 +} 1.122 + 1.123 +CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() 1.124 +{ 1.125 + // nothing to do 1.126 +} 1.127 + 1.128 +const char *CharsetRecog_UTF_32_BE::getName() const 1.129 +{ 1.130 + return "UTF-32BE"; 1.131 +} 1.132 + 1.133 +int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const 1.134 +{ 1.135 + return input[index + 0] << 24 | input[index + 1] << 16 | 1.136 + input[index + 2] << 8 | input[index + 3]; 1.137 +} 1.138 + 1.139 +CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() 1.140 +{ 1.141 + // nothing to do 1.142 +} 1.143 + 1.144 +const char *CharsetRecog_UTF_32_LE::getName() const 1.145 +{ 1.146 + return "UTF-32LE"; 1.147 +} 1.148 + 1.149 +int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const 1.150 +{ 1.151 + return input[index + 3] << 24 | input[index + 2] << 16 | 1.152 + input[index + 1] << 8 | input[index + 0]; 1.153 +} 1.154 + 1.155 +U_NAMESPACE_END 1.156 +#endif 1.157 +