Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2005-2009, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | */ |
michael@0 | 7 | |
michael@0 | 8 | #include "unicode/utypes.h" |
michael@0 | 9 | |
michael@0 | 10 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 11 | |
michael@0 | 12 | #include "inputext.h" |
michael@0 | 13 | |
michael@0 | 14 | #include "cmemory.h" |
michael@0 | 15 | #include "cstring.h" |
michael@0 | 16 | |
michael@0 | 17 | #include <string.h> |
michael@0 | 18 | |
michael@0 | 19 | U_NAMESPACE_BEGIN |
michael@0 | 20 | |
// Capacity, in bytes, of the fInputBytes working buffer. MungeInput() never
// examines more than this many bytes of input.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that any array-valued expression can be passed safely.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Typed wrappers around uprv_malloc()/uprv_free(). NEW_ARRAY does not
// initialize the storage and evaluates to NULL when the allocation fails.
// The whole expansion is parenthesized so it behaves as a single expression
// wherever it is used.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
michael@0 | 27 | |
michael@0 | 28 | InputText::InputText(UErrorCode &status) |
michael@0 | 29 | : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been |
michael@0 | 30 | // removed if appropriate. |
michael@0 | 31 | fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. |
michael@0 | 32 | // Value is percent, not absolute. |
michael@0 | 33 | fDeclaredEncoding(0), |
michael@0 | 34 | fRawInput(0), |
michael@0 | 35 | fRawLength(0) |
michael@0 | 36 | { |
michael@0 | 37 | if (fInputBytes == NULL || fByteStats == NULL) { |
michael@0 | 38 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 39 | } |
michael@0 | 40 | } |
michael@0 | 41 | |
michael@0 | 42 | InputText::~InputText() |
michael@0 | 43 | { |
michael@0 | 44 | DELETE_ARRAY(fDeclaredEncoding); |
michael@0 | 45 | DELETE_ARRAY(fByteStats); |
michael@0 | 46 | DELETE_ARRAY(fInputBytes); |
michael@0 | 47 | } |
michael@0 | 48 | |
michael@0 | 49 | void InputText::setText(const char *in, int32_t len) |
michael@0 | 50 | { |
michael@0 | 51 | fInputLen = 0; |
michael@0 | 52 | fC1Bytes = FALSE; |
michael@0 | 53 | fRawInput = (const uint8_t *) in; |
michael@0 | 54 | fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; |
michael@0 | 55 | } |
michael@0 | 56 | |
michael@0 | 57 | void InputText::setDeclaredEncoding(const char* encoding, int32_t len) |
michael@0 | 58 | { |
michael@0 | 59 | if(encoding) { |
michael@0 | 60 | if (len == -1) { |
michael@0 | 61 | len = (int32_t)uprv_strlen(encoding); |
michael@0 | 62 | } |
michael@0 | 63 | |
michael@0 | 64 | len += 1; // to make place for the \0 at the end. |
michael@0 | 65 | uprv_free(fDeclaredEncoding); |
michael@0 | 66 | fDeclaredEncoding = NEW_ARRAY(char, len); |
michael@0 | 67 | uprv_strncpy(fDeclaredEncoding, encoding, len); |
michael@0 | 68 | } |
michael@0 | 69 | } |
michael@0 | 70 | |
michael@0 | 71 | UBool InputText::isSet() const |
michael@0 | 72 | { |
michael@0 | 73 | return fRawInput != NULL; |
michael@0 | 74 | } |
michael@0 | 75 | |
michael@0 | 76 | /** |
michael@0 | 77 | * MungeInput - after getting a set of raw input data to be analyzed, preprocess |
michael@0 | 78 | * it by removing what appears to be html markup. |
michael@0 | 79 | * |
michael@0 | 80 | * @internal |
michael@0 | 81 | */ |
michael@0 | 82 | void InputText::MungeInput(UBool fStripTags) { |
michael@0 | 83 | int srci = 0; |
michael@0 | 84 | int dsti = 0; |
michael@0 | 85 | uint8_t b; |
michael@0 | 86 | bool inMarkup = FALSE; |
michael@0 | 87 | int32_t openTags = 0; |
michael@0 | 88 | int32_t badTags = 0; |
michael@0 | 89 | |
michael@0 | 90 | // |
michael@0 | 91 | // html / xml markup stripping. |
michael@0 | 92 | // quick and dirty, not 100% accurate, but hopefully good enough, statistically. |
michael@0 | 93 | // discard everything within < brackets > |
michael@0 | 94 | // Count how many total '<' and illegal (nested) '<' occur, so we can make some |
michael@0 | 95 | // guess as to whether the input was actually marked up at all. |
michael@0 | 96 | // TODO: Think about how this interacts with EBCDIC charsets that are detected. |
michael@0 | 97 | if (fStripTags) { |
michael@0 | 98 | for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { |
michael@0 | 99 | b = fRawInput[srci]; |
michael@0 | 100 | |
michael@0 | 101 | if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ |
michael@0 | 102 | if (inMarkup) { |
michael@0 | 103 | badTags += 1; |
michael@0 | 104 | } |
michael@0 | 105 | |
michael@0 | 106 | inMarkup = TRUE; |
michael@0 | 107 | openTags += 1; |
michael@0 | 108 | } |
michael@0 | 109 | |
michael@0 | 110 | if (! inMarkup) { |
michael@0 | 111 | fInputBytes[dsti++] = b; |
michael@0 | 112 | } |
michael@0 | 113 | |
michael@0 | 114 | if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ |
michael@0 | 115 | inMarkup = FALSE; |
michael@0 | 116 | } |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | fInputLen = dsti; |
michael@0 | 120 | } |
michael@0 | 121 | |
michael@0 | 122 | // |
michael@0 | 123 | // If it looks like this input wasn't marked up, or if it looks like it's |
michael@0 | 124 | // essentially nothing but markup abandon the markup stripping. |
michael@0 | 125 | // Detection will have to work on the unstripped input. |
michael@0 | 126 | // |
michael@0 | 127 | if (openTags<5 || openTags/5 < badTags || |
michael@0 | 128 | (fInputLen < 100 && fRawLength>600)) |
michael@0 | 129 | { |
michael@0 | 130 | int32_t limit = fRawLength; |
michael@0 | 131 | |
michael@0 | 132 | if (limit > BUFFER_SIZE) { |
michael@0 | 133 | limit = BUFFER_SIZE; |
michael@0 | 134 | } |
michael@0 | 135 | |
michael@0 | 136 | for (srci=0; srci<limit; srci++) { |
michael@0 | 137 | fInputBytes[srci] = fRawInput[srci]; |
michael@0 | 138 | } |
michael@0 | 139 | |
michael@0 | 140 | fInputLen = srci; |
michael@0 | 141 | } |
michael@0 | 142 | |
michael@0 | 143 | // |
michael@0 | 144 | // Tally up the byte occurence statistics. |
michael@0 | 145 | // These are available for use by the various detectors. |
michael@0 | 146 | // |
michael@0 | 147 | |
michael@0 | 148 | uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); |
michael@0 | 149 | |
michael@0 | 150 | for (srci = 0; srci < fInputLen; srci += 1) { |
michael@0 | 151 | fByteStats[fInputBytes[srci]] += 1; |
michael@0 | 152 | } |
michael@0 | 153 | |
michael@0 | 154 | for (int32_t i = 0x80; i <= 0x9F; i += 1) { |
michael@0 | 155 | if (fByteStats[i] != 0) { |
michael@0 | 156 | fC1Bytes = TRUE; |
michael@0 | 157 | break; |
michael@0 | 158 | } |
michael@0 | 159 | } |
michael@0 | 160 | } |
michael@0 | 161 | |
michael@0 | 162 | U_NAMESPACE_END |
michael@0 | 163 | #endif |
michael@0 | 164 |