1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/inputext.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,164 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 2005-2009, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + */ 1.10 + 1.11 +#include "unicode/utypes.h" 1.12 + 1.13 +#if !UCONFIG_NO_CONVERSION 1.14 + 1.15 +#include "inputext.h" 1.16 + 1.17 +#include "cmemory.h" 1.18 +#include "cstring.h" 1.19 + 1.20 +#include <string.h> 1.21 + 1.22 +U_NAMESPACE_BEGIN 1.23 + 1.24 +#define BUFFER_SIZE 8192 1.25 + 1.26 +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 1.27 + 1.28 +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 1.29 +#define DELETE_ARRAY(array) uprv_free((void *) (array)) 1.30 + 1.31 +InputText::InputText(UErrorCode &status) 1.32 + : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been 1.33 + // removed if appropriate. 1.34 + fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. 1.35 + // Value is percent, not absolute. 1.36 + fDeclaredEncoding(0), 1.37 + fRawInput(0), 1.38 + fRawLength(0) 1.39 +{ 1.40 + if (fInputBytes == NULL || fByteStats == NULL) { 1.41 + status = U_MEMORY_ALLOCATION_ERROR; 1.42 + } 1.43 +} 1.44 + 1.45 +InputText::~InputText() 1.46 +{ 1.47 + DELETE_ARRAY(fDeclaredEncoding); 1.48 + DELETE_ARRAY(fByteStats); 1.49 + DELETE_ARRAY(fInputBytes); 1.50 +} 1.51 + 1.52 +void InputText::setText(const char *in, int32_t len) 1.53 +{ 1.54 + fInputLen = 0; 1.55 + fC1Bytes = FALSE; 1.56 + fRawInput = (const uint8_t *) in; 1.57 + fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; 1.58 +} 1.59 + 1.60 +void InputText::setDeclaredEncoding(const char* encoding, int32_t len) 1.61 +{ 1.62 + if(encoding) { 1.63 + if (len == -1) { 1.64 + len = (int32_t)uprv_strlen(encoding); 1.65 + } 1.66 + 1.67 + len += 1; // to make place for the \0 at the end. 1.68 + uprv_free(fDeclaredEncoding); 1.69 + fDeclaredEncoding = NEW_ARRAY(char, len); 1.70 + uprv_strncpy(fDeclaredEncoding, encoding, len); 1.71 + } 1.72 +} 1.73 + 1.74 +UBool InputText::isSet() const 1.75 +{ 1.76 + return fRawInput != NULL; 1.77 +} 1.78 + 1.79 +/** 1.80 +* MungeInput - after getting a set of raw input data to be analyzed, preprocess 1.81 +* it by removing what appears to be html markup. 1.82 +* 1.83 +* @internal 1.84 +*/ 1.85 +void InputText::MungeInput(UBool fStripTags) { 1.86 + int srci = 0; 1.87 + int dsti = 0; 1.88 + uint8_t b; 1.89 + bool inMarkup = FALSE; 1.90 + int32_t openTags = 0; 1.91 + int32_t badTags = 0; 1.92 + 1.93 + // 1.94 + // html / xml markup stripping. 1.95 + // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 1.96 + // discard everything within < brackets > 1.97 + // Count how many total '<' and illegal (nested) '<' occur, so we can make some 1.98 + // guess as to whether the input was actually marked up at all. 1.99 + // TODO: Think about how this interacts with EBCDIC charsets that are detected. 1.100 + if (fStripTags) { 1.101 + for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { 1.102 + b = fRawInput[srci]; 1.103 + 1.104 + if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ 1.105 + if (inMarkup) { 1.106 + badTags += 1; 1.107 + } 1.108 + 1.109 + inMarkup = TRUE; 1.110 + openTags += 1; 1.111 + } 1.112 + 1.113 + if (! inMarkup) { 1.114 + fInputBytes[dsti++] = b; 1.115 + } 1.116 + 1.117 + if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ 1.118 + inMarkup = FALSE; 1.119 + } 1.120 + } 1.121 + 1.122 + fInputLen = dsti; 1.123 + } 1.124 + 1.125 + // 1.126 + // If it looks like this input wasn't marked up, or if it looks like it's 1.127 + // essentially nothing but markup abandon the markup stripping. 1.128 + // Detection will have to work on the unstripped input. 1.129 + // 1.130 + if (openTags<5 || openTags/5 < badTags || 1.131 + (fInputLen < 100 && fRawLength>600)) 1.132 + { 1.133 + int32_t limit = fRawLength; 1.134 + 1.135 + if (limit > BUFFER_SIZE) { 1.136 + limit = BUFFER_SIZE; 1.137 + } 1.138 + 1.139 + for (srci=0; srci<limit; srci++) { 1.140 + fInputBytes[srci] = fRawInput[srci]; 1.141 + } 1.142 + 1.143 + fInputLen = srci; 1.144 + } 1.145 + 1.146 + // 1.147 + // Tally up the byte occurence statistics. 1.148 + // These are available for use by the various detectors. 1.149 + // 1.150 + 1.151 + uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); 1.152 + 1.153 + for (srci = 0; srci < fInputLen; srci += 1) { 1.154 + fByteStats[fInputBytes[srci]] += 1; 1.155 + } 1.156 + 1.157 + for (int32_t i = 0x80; i <= 0x9F; i += 1) { 1.158 + if (fByteStats[i] != 0) { 1.159 + fC1Bytes = TRUE; 1.160 + break; 1.161 + } 1.162 + } 1.163 +} 1.164 + 1.165 +U_NAMESPACE_END 1.166 +#endif 1.167 +