intl/icu/source/i18n/inputext.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2005-2009, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_CONVERSION
michael@0 11
michael@0 12 #include "inputext.h"
michael@0 13
michael@0 14 #include "cmemory.h"
michael@0 15 #include "cstring.h"
michael@0 16
michael@0 17 #include <string.h>
michael@0 18
michael@0 19 U_NAMESPACE_BEGIN
michael@0 20
// Capacity, in bytes, of the buffer that holds the text to be analysed.
#define BUFFER_SIZE 8192

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expression arguments are measured as a whole.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate / release an uninitialized array of `count` elements of `type`
// through the uprv_ allocator. NEW_ARRAY yields NULL when allocation fails.
// Both expansions are wrapped in parentheses so they compose safely inside
// larger expressions.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) (uprv_free((void *) (array)))
michael@0 27
michael@0 28 InputText::InputText(UErrorCode &status)
michael@0 29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
michael@0 30 // removed if appropriate.
michael@0 31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
michael@0 32 // Value is percent, not absolute.
michael@0 33 fDeclaredEncoding(0),
michael@0 34 fRawInput(0),
michael@0 35 fRawLength(0)
michael@0 36 {
michael@0 37 if (fInputBytes == NULL || fByteStats == NULL) {
michael@0 38 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 39 }
michael@0 40 }
michael@0 41
michael@0 42 InputText::~InputText()
michael@0 43 {
michael@0 44 DELETE_ARRAY(fDeclaredEncoding);
michael@0 45 DELETE_ARRAY(fByteStats);
michael@0 46 DELETE_ARRAY(fInputBytes);
michael@0 47 }
michael@0 48
michael@0 49 void InputText::setText(const char *in, int32_t len)
michael@0 50 {
michael@0 51 fInputLen = 0;
michael@0 52 fC1Bytes = FALSE;
michael@0 53 fRawInput = (const uint8_t *) in;
michael@0 54 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
michael@0 55 }
michael@0 56
michael@0 57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
michael@0 58 {
michael@0 59 if(encoding) {
michael@0 60 if (len == -1) {
michael@0 61 len = (int32_t)uprv_strlen(encoding);
michael@0 62 }
michael@0 63
michael@0 64 len += 1; // to make place for the \0 at the end.
michael@0 65 uprv_free(fDeclaredEncoding);
michael@0 66 fDeclaredEncoding = NEW_ARRAY(char, len);
michael@0 67 uprv_strncpy(fDeclaredEncoding, encoding, len);
michael@0 68 }
michael@0 69 }
michael@0 70
michael@0 71 UBool InputText::isSet() const
michael@0 72 {
michael@0 73 return fRawInput != NULL;
michael@0 74 }
michael@0 75
michael@0 76 /**
michael@0 77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
michael@0 78 * it by removing what appears to be html markup.
michael@0 79 *
michael@0 80 * @internal
michael@0 81 */
michael@0 82 void InputText::MungeInput(UBool fStripTags) {
michael@0 83 int srci = 0;
michael@0 84 int dsti = 0;
michael@0 85 uint8_t b;
michael@0 86 bool inMarkup = FALSE;
michael@0 87 int32_t openTags = 0;
michael@0 88 int32_t badTags = 0;
michael@0 89
michael@0 90 //
michael@0 91 // html / xml markup stripping.
michael@0 92 // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
michael@0 93 // discard everything within < brackets >
michael@0 94 // Count how many total '<' and illegal (nested) '<' occur, so we can make some
michael@0 95 // guess as to whether the input was actually marked up at all.
michael@0 96 // TODO: Think about how this interacts with EBCDIC charsets that are detected.
michael@0 97 if (fStripTags) {
michael@0 98 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
michael@0 99 b = fRawInput[srci];
michael@0 100
michael@0 101 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
michael@0 102 if (inMarkup) {
michael@0 103 badTags += 1;
michael@0 104 }
michael@0 105
michael@0 106 inMarkup = TRUE;
michael@0 107 openTags += 1;
michael@0 108 }
michael@0 109
michael@0 110 if (! inMarkup) {
michael@0 111 fInputBytes[dsti++] = b;
michael@0 112 }
michael@0 113
michael@0 114 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
michael@0 115 inMarkup = FALSE;
michael@0 116 }
michael@0 117 }
michael@0 118
michael@0 119 fInputLen = dsti;
michael@0 120 }
michael@0 121
michael@0 122 //
michael@0 123 // If it looks like this input wasn't marked up, or if it looks like it's
michael@0 124 // essentially nothing but markup abandon the markup stripping.
michael@0 125 // Detection will have to work on the unstripped input.
michael@0 126 //
michael@0 127 if (openTags<5 || openTags/5 < badTags ||
michael@0 128 (fInputLen < 100 && fRawLength>600))
michael@0 129 {
michael@0 130 int32_t limit = fRawLength;
michael@0 131
michael@0 132 if (limit > BUFFER_SIZE) {
michael@0 133 limit = BUFFER_SIZE;
michael@0 134 }
michael@0 135
michael@0 136 for (srci=0; srci<limit; srci++) {
michael@0 137 fInputBytes[srci] = fRawInput[srci];
michael@0 138 }
michael@0 139
michael@0 140 fInputLen = srci;
michael@0 141 }
michael@0 142
michael@0 143 //
michael@0 144 // Tally up the byte occurence statistics.
michael@0 145 // These are available for use by the various detectors.
michael@0 146 //
michael@0 147
michael@0 148 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
michael@0 149
michael@0 150 for (srci = 0; srci < fInputLen; srci += 1) {
michael@0 151 fByteStats[fInputBytes[srci]] += 1;
michael@0 152 }
michael@0 153
michael@0 154 for (int32_t i = 0x80; i <= 0x9F; i += 1) {
michael@0 155 if (fByteStats[i] != 0) {
michael@0 156 fC1Bytes = TRUE;
michael@0 157 break;
michael@0 158 }
michael@0 159 }
michael@0 160 }
michael@0 161
michael@0 162 U_NAMESPACE_END
michael@0 163 #endif
michael@0 164

mercurial