intl/icu/source/i18n/inputext.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2  **********************************************************************
     3  *   Copyright (C) 2005-2009, International Business Machines
     4  *   Corporation and others.  All Rights Reserved.
     5  **********************************************************************
     6  */
     8 #include "unicode/utypes.h"
    10 #if !UCONFIG_NO_CONVERSION
    12 #include "inputext.h"
    14 #include "cmemory.h"
    15 #include "cstring.h"
    17 #include <string.h>
    19 U_NAMESPACE_BEGIN
    // Size, in bytes, of the fInputBytes working buffer allocated by the
    // constructor; MungeInput() never stores more than this many bytes in it.
    21 #define BUFFER_SIZE 8192
    // Element count of a true array (not a pointer). Not used in the visible
    // part of this file.
    23 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
    // Allocate room for `count` objects of `type` through uprv_malloc and cast
    // the result to `type *`. The constructor checks the result against NULL,
    // so failure is presumably reported that way.
    25 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
    // Release a block obtained from NEW_ARRAY by handing it to uprv_free.
    26 #define DELETE_ARRAY(array) uprv_free((void *) (array))
    28 InputText::InputText(UErrorCode &status)
    29     : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
    30                                                  //   removed if appropriate.
    31       fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
    32                                                  //   Value is percent, not absolute.
    33       fDeclaredEncoding(0),
    34       fRawInput(0),
    35       fRawLength(0)
    36 {
    37     if (fInputBytes == NULL || fByteStats == NULL) {
    38         status = U_MEMORY_ALLOCATION_ERROR;
    39     }
    40 }
    42 InputText::~InputText()
    43 {
    44     DELETE_ARRAY(fDeclaredEncoding);
    45     DELETE_ARRAY(fByteStats);
    46     DELETE_ARRAY(fInputBytes);
    47 }
    49 void InputText::setText(const char *in, int32_t len)
    50 {
    51     fInputLen  = 0;
    52     fC1Bytes   = FALSE;
    53     fRawInput  = (const uint8_t *) in;
    54     fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
    55 }
    57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
    58 {
    59     if(encoding) {
    60         if (len == -1) {
    61             len = (int32_t)uprv_strlen(encoding);
    62         }
    64         len += 1;     // to make place for the \0 at the end.
    65         uprv_free(fDeclaredEncoding);
    66         fDeclaredEncoding = NEW_ARRAY(char, len);
    67         uprv_strncpy(fDeclaredEncoding, encoding, len);
    68     }
    69 }
    71 UBool InputText::isSet() const 
    72 {
    73     return fRawInput != NULL;
    74 }
    76 /**
    77 *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
    78 *               it by removing what appears to be html markup.
    79 * 
    80 * @internal
    81 */
    82 void InputText::MungeInput(UBool fStripTags) {
    83     int     srci = 0;
    84     int     dsti = 0;
    85     uint8_t b;
    86     bool    inMarkup = FALSE;
    87     int32_t openTags = 0;
    88     int32_t badTags  = 0;
    90     //
    91     //  html / xml markup stripping.
    92     //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
    93     //     discard everything within < brackets >
    94     //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
    95     //     guess as to whether the input was actually marked up at all.
    96     // TODO: Think about how this interacts with EBCDIC charsets that are detected.
    97     if (fStripTags) {
    98         for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
    99             b = fRawInput[srci];
   101             if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
   102                 if (inMarkup) {
   103                     badTags += 1;
   104                 }
   106                 inMarkup = TRUE;
   107                 openTags += 1;
   108             }
   110             if (! inMarkup) {
   111                 fInputBytes[dsti++] = b;
   112             }
   114             if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
   115                 inMarkup = FALSE;
   116             }
   117         }
   119         fInputLen = dsti;
   120     }
   122     //
   123     //  If it looks like this input wasn't marked up, or if it looks like it's
   124     //    essentially nothing but markup abandon the markup stripping.
   125     //    Detection will have to work on the unstripped input.
   126     //
   127     if (openTags<5 || openTags/5 < badTags || 
   128         (fInputLen < 100 && fRawLength>600))
   129     {
   130         int32_t limit = fRawLength;
   132         if (limit > BUFFER_SIZE) {
   133             limit = BUFFER_SIZE;
   134         }
   136         for (srci=0; srci<limit; srci++) {
   137             fInputBytes[srci] = fRawInput[srci];
   138         }
   140         fInputLen = srci;
   141     }
   143     //
   144     // Tally up the byte occurence statistics.
   145     // These are available for use by the various detectors.
   146     //
   148     uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
   150     for (srci = 0; srci < fInputLen; srci += 1) {
   151         fByteStats[fInputBytes[srci]] += 1;
   152     }
   154     for (int32_t i = 0x80; i <= 0x9F; i += 1) {
   155         if (fByteStats[i] != 0) {
   156             fC1Bytes = TRUE;
   157             break;
   158         }
   159     }
   160 }
   162 U_NAMESPACE_END
   163 #endif

mercurial