intl/icu/source/i18n/inputext.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/inputext.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,164 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 2005-2009, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + */
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#if !UCONFIG_NO_CONVERSION
    1.14 +
    1.15 +#include "inputext.h"
    1.16 +
    1.17 +#include "cmemory.h"
    1.18 +#include "cstring.h"
    1.19 +
    1.20 +#include <string.h>
    1.21 +
    1.22 +U_NAMESPACE_BEGIN
    1.23 +
    1.24 +#define BUFFER_SIZE 8192
    1.25 +
    1.26 +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
    1.27 +
    1.28 +#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
    1.29 +#define DELETE_ARRAY(array) uprv_free((void *) (array))
    1.30 +
    1.31 +InputText::InputText(UErrorCode &status)
    1.32 +    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
    1.33 +                                                 //   removed if appropriate.
    1.34 +      fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
    1.35 +                                                 //   Value is percent, not absolute.
    1.36 +      fDeclaredEncoding(0),
    1.37 +      fRawInput(0),
    1.38 +      fRawLength(0)
    1.39 +{
    1.40 +    if (fInputBytes == NULL || fByteStats == NULL) {
    1.41 +        status = U_MEMORY_ALLOCATION_ERROR;
    1.42 +    }
    1.43 +}
    1.44 +
    1.45 +InputText::~InputText()
    1.46 +{
    1.47 +    DELETE_ARRAY(fDeclaredEncoding);
    1.48 +    DELETE_ARRAY(fByteStats);
    1.49 +    DELETE_ARRAY(fInputBytes);
    1.50 +}
    1.51 +
    1.52 +void InputText::setText(const char *in, int32_t len)
    1.53 +{
    1.54 +    fInputLen  = 0;
    1.55 +    fC1Bytes   = FALSE;
    1.56 +    fRawInput  = (const uint8_t *) in;
    1.57 +    fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
    1.58 +}
    1.59 +
    1.60 +void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
    1.61 +{
    1.62 +    if(encoding) {
    1.63 +        if (len == -1) {
    1.64 +            len = (int32_t)uprv_strlen(encoding);
    1.65 +        }
    1.66 +
    1.67 +        len += 1;     // to make place for the \0 at the end.
    1.68 +        uprv_free(fDeclaredEncoding);
    1.69 +        fDeclaredEncoding = NEW_ARRAY(char, len);
    1.70 +        uprv_strncpy(fDeclaredEncoding, encoding, len);
    1.71 +    }
    1.72 +}
    1.73 +
    1.74 +UBool InputText::isSet() const 
    1.75 +{
    1.76 +    return fRawInput != NULL;
    1.77 +}
    1.78 +
    1.79 +/**
    1.80 +*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
    1.81 +*               it by removing what appears to be html markup.
    1.82 +* 
    1.83 +* @internal
    1.84 +*/
    1.85 +void InputText::MungeInput(UBool fStripTags) {
    1.86 +    int     srci = 0;
    1.87 +    int     dsti = 0;
    1.88 +    uint8_t b;
    1.89 +    bool    inMarkup = FALSE;
    1.90 +    int32_t openTags = 0;
    1.91 +    int32_t badTags  = 0;
    1.92 +
    1.93 +    //
    1.94 +    //  html / xml markup stripping.
    1.95 +    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
    1.96 +    //     discard everything within < brackets >
    1.97 +    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
    1.98 +    //     guess as to whether the input was actually marked up at all.
    1.99 +    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
   1.100 +    if (fStripTags) {
   1.101 +        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
   1.102 +            b = fRawInput[srci];
   1.103 +
   1.104 +            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
   1.105 +                if (inMarkup) {
   1.106 +                    badTags += 1;
   1.107 +                }
   1.108 +
   1.109 +                inMarkup = TRUE;
   1.110 +                openTags += 1;
   1.111 +            }
   1.112 +
   1.113 +            if (! inMarkup) {
   1.114 +                fInputBytes[dsti++] = b;
   1.115 +            }
   1.116 +
   1.117 +            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
   1.118 +                inMarkup = FALSE;
   1.119 +            }
   1.120 +        }
   1.121 +
   1.122 +        fInputLen = dsti;
   1.123 +    }
   1.124 +
   1.125 +    //
   1.126 +    //  If it looks like this input wasn't marked up, or if it looks like it's
   1.127 +    //    essentially nothing but markup abandon the markup stripping.
   1.128 +    //    Detection will have to work on the unstripped input.
   1.129 +    //
   1.130 +    if (openTags<5 || openTags/5 < badTags || 
   1.131 +        (fInputLen < 100 && fRawLength>600))
   1.132 +    {
   1.133 +        int32_t limit = fRawLength;
   1.134 +
   1.135 +        if (limit > BUFFER_SIZE) {
   1.136 +            limit = BUFFER_SIZE;
   1.137 +        }
   1.138 +
   1.139 +        for (srci=0; srci<limit; srci++) {
   1.140 +            fInputBytes[srci] = fRawInput[srci];
   1.141 +        }
   1.142 +
   1.143 +        fInputLen = srci;
   1.144 +    }
   1.145 +
   1.146 +    //
   1.147 +    // Tally up the byte occurence statistics.
   1.148 +    // These are available for use by the various detectors.
   1.149 +    //
   1.150 +
   1.151 +    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
   1.152 +
   1.153 +    for (srci = 0; srci < fInputLen; srci += 1) {
   1.154 +        fByteStats[fInputBytes[srci]] += 1;
   1.155 +    }
   1.156 +
   1.157 +    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
   1.158 +        if (fByteStats[i] != 0) {
   1.159 +            fC1Bytes = TRUE;
   1.160 +            break;
   1.161 +        }
   1.162 +    }
   1.163 +}
   1.164 +
   1.165 +U_NAMESPACE_END
   1.166 +#endif
   1.167 +

mercurial