michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2005-2009, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: #include "inputext.h" michael@0: michael@0: #include "cmemory.h" michael@0: #include "cstring.h" michael@0: michael@0: #include michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: #define BUFFER_SIZE 8192 michael@0: michael@0: #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) michael@0: michael@0: #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) michael@0: #define DELETE_ARRAY(array) uprv_free((void *) (array)) michael@0: michael@0: InputText::InputText(UErrorCode &status) michael@0: : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been michael@0: // removed if appropriate. michael@0: fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. michael@0: // Value is percent, not absolute. michael@0: fDeclaredEncoding(0), michael@0: fRawInput(0), michael@0: fRawLength(0) michael@0: { michael@0: if (fInputBytes == NULL || fByteStats == NULL) { michael@0: status = U_MEMORY_ALLOCATION_ERROR; michael@0: } michael@0: } michael@0: michael@0: InputText::~InputText() michael@0: { michael@0: DELETE_ARRAY(fDeclaredEncoding); michael@0: DELETE_ARRAY(fByteStats); michael@0: DELETE_ARRAY(fInputBytes); michael@0: } michael@0: michael@0: void InputText::setText(const char *in, int32_t len) michael@0: { michael@0: fInputLen = 0; michael@0: fC1Bytes = FALSE; michael@0: fRawInput = (const uint8_t *) in; michael@0: fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; michael@0: } michael@0: michael@0: void InputText::setDeclaredEncoding(const char* encoding, int32_t len) michael@0: { michael@0: if(encoding) { michael@0: if (len == -1) { michael@0: len = (int32_t)uprv_strlen(encoding); michael@0: } michael@0: michael@0: len += 1; // to make place for the \0 at the end. michael@0: uprv_free(fDeclaredEncoding); michael@0: fDeclaredEncoding = NEW_ARRAY(char, len); michael@0: uprv_strncpy(fDeclaredEncoding, encoding, len); michael@0: } michael@0: } michael@0: michael@0: UBool InputText::isSet() const michael@0: { michael@0: return fRawInput != NULL; michael@0: } michael@0: michael@0: /** michael@0: * MungeInput - after getting a set of raw input data to be analyzed, preprocess michael@0: * it by removing what appears to be html markup. michael@0: * michael@0: * @internal michael@0: */ michael@0: void InputText::MungeInput(UBool fStripTags) { michael@0: int srci = 0; michael@0: int dsti = 0; michael@0: uint8_t b; michael@0: bool inMarkup = FALSE; michael@0: int32_t openTags = 0; michael@0: int32_t badTags = 0; michael@0: michael@0: // michael@0: // html / xml markup stripping. michael@0: // quick and dirty, not 100% accurate, but hopefully good enough, statistically. michael@0: // discard everything within < brackets > michael@0: // Count how many total '<' and illegal (nested) '<' occur, so we can make some michael@0: // guess as to whether the input was actually marked up at all. michael@0: // TODO: Think about how this interacts with EBCDIC charsets that are detected. michael@0: if (fStripTags) { michael@0: for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { michael@0: b = fRawInput[srci]; michael@0: michael@0: if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ michael@0: if (inMarkup) { michael@0: badTags += 1; michael@0: } michael@0: michael@0: inMarkup = TRUE; michael@0: openTags += 1; michael@0: } michael@0: michael@0: if (! inMarkup) { michael@0: fInputBytes[dsti++] = b; michael@0: } michael@0: michael@0: if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ michael@0: inMarkup = FALSE; michael@0: } michael@0: } michael@0: michael@0: fInputLen = dsti; michael@0: } michael@0: michael@0: // michael@0: // If it looks like this input wasn't marked up, or if it looks like it's michael@0: // essentially nothing but markup abandon the markup stripping. michael@0: // Detection will have to work on the unstripped input. michael@0: // michael@0: if (openTags<5 || openTags/5 < badTags || michael@0: (fInputLen < 100 && fRawLength>600)) michael@0: { michael@0: int32_t limit = fRawLength; michael@0: michael@0: if (limit > BUFFER_SIZE) { michael@0: limit = BUFFER_SIZE; michael@0: } michael@0: michael@0: for (srci=0; srci