The Tor Browser: intl/icu/source/i18n/inputext.cpp@fc2d59ddac77

     1 /*

     2  **********************************************************************

     3  *   Copyright (C) 2005-2009, International Business Machines

     4  *   Corporation and others.  All Rights Reserved.

     5  **********************************************************************

     6  */

     8 #include "unicode/utypes.h"

    10 #if !UCONFIG_NO_CONVERSION

    12 #include "inputext.h"

    14 #include "cmemory.h"

    15 #include "cstring.h"

    17 #include <string.h>

    19 U_NAMESPACE_BEGIN

       // Capacity, in bytes, of the fInputBytes working buffer allocated by the
       // constructor; MungeInput() never stores more than this many bytes.
    21 #define BUFFER_SIZE 8192

       // Number of elements in a true array (not a pointer). Not referenced in
       // the portion of the file visible here.
    23 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

       // Allocate uninitialised storage for `count` objects of `type` through
       // uprv_malloc(). The result is whatever uprv_malloc() returns; the
       // constructor compares it against NULL to detect allocation failure.
    25 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))

       // Release storage obtained with NEW_ARRAY by handing it to uprv_free().
    26 #define DELETE_ARRAY(array) uprv_free((void *) (array))

    28 InputText::InputText(UErrorCode &status)

    29     : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been

    30                                                  //   removed if appropriate.

    31       fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.

    32                                                  //   Value is percent, not absolute.

    33       fDeclaredEncoding(0),

    34       fRawInput(0),

    35       fRawLength(0)

    36 {

    37     if (fInputBytes == NULL || fByteStats == NULL) {

    38         status = U_MEMORY_ALLOCATION_ERROR;

    39     }

    40 }

    42 InputText::~InputText()

    43 {

    44     DELETE_ARRAY(fDeclaredEncoding);

    45     DELETE_ARRAY(fByteStats);

    46     DELETE_ARRAY(fInputBytes);

    47 }

    49 void InputText::setText(const char *in, int32_t len)

    50 {

    51     fInputLen  = 0;

    52     fC1Bytes   = FALSE;

    53     fRawInput  = (const uint8_t *) in;

    54     fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;

    55 }

    57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)

    58 {

    59     if(encoding) {

    60         if (len == -1) {

    61             len = (int32_t)uprv_strlen(encoding);

    62         }

    64         len += 1;     // to make place for the \0 at the end.

    65         uprv_free(fDeclaredEncoding);

    66         fDeclaredEncoding = NEW_ARRAY(char, len);

    67         uprv_strncpy(fDeclaredEncoding, encoding, len);

    68     }

    69 }

    71 UBool InputText::isSet() const

    72 {

    73     return fRawInput != NULL;

    74 }

    76 /**

    77 *  MungeInput - after getting a set of raw input data to be analyzed, preprocess

    78 *               it by removing what appears to be html markup.

    79 *

    80 * @internal

    81 */

    82 void InputText::MungeInput(UBool fStripTags) {

    83     int     srci = 0;

    84     int     dsti = 0;

    85     uint8_t b;

    86     bool    inMarkup = FALSE;

    87     int32_t openTags = 0;

    88     int32_t badTags  = 0;

    90     //

    91     //  html / xml markup stripping.

    92     //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.

    93     //     discard everything within < brackets >

    94     //     Count how many total '<' and illegal (nested) '<' occur, so we can make some

    95     //     guess as to whether the input was actually marked up at all.

    96     // TODO: Think about how this interacts with EBCDIC charsets that are detected.

    97     if (fStripTags) {

    98         for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {

    99             b = fRawInput[srci];

   101             if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */

   102                 if (inMarkup) {

   103                     badTags += 1;

   104                 }

   106                 inMarkup = TRUE;

   107                 openTags += 1;

   108             }

   110             if (! inMarkup) {

   111                 fInputBytes[dsti++] = b;

   112             }

   114             if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */

   115                 inMarkup = FALSE;

   116             }

   117         }

   119         fInputLen = dsti;

   120     }

   122     //

   123     //  If it looks like this input wasn't marked up, or if it looks like it's

   124     //    essentially nothing but markup abandon the markup stripping.

   125     //    Detection will have to work on the unstripped input.

   126     //

   127     if (openTags<5 || openTags/5 < badTags ||

   128         (fInputLen < 100 && fRawLength>600))

   129     {

   130         int32_t limit = fRawLength;

   132         if (limit > BUFFER_SIZE) {

   133             limit = BUFFER_SIZE;

   134         }

   136         for (srci=0; srci<limit; srci++) {

   137             fInputBytes[srci] = fRawInput[srci];

   138         }

   140         fInputLen = srci;

   141     }

   143     //

   144     // Tally up the byte occurence statistics.

   145     // These are available for use by the various detectors.

   146     //

   148     uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);

   150     for (srci = 0; srci < fInputLen; srci += 1) {

   151         fByteStats[fInputBytes[srci]] += 1;

   152     }

   154     for (int32_t i = 0x80; i <= 0x9F; i += 1) {

   155         if (fByteStats[i] != 0) {

   156             fC1Bytes = TRUE;

   157             break;

   158         }

   159     }

   160 }

   162 U_NAMESPACE_END

   163 #endif

The Tor Browser / file revision