intl/icu/source/i18n/csrutf8.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/csrutf8.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,113 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 2005-2012, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + */
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#if !UCONFIG_NO_CONVERSION
    1.14 +
    1.15 +#include "csrutf8.h"
    1.16 +#include "csmatch.h"
    1.17 +
    1.18 +U_NAMESPACE_BEGIN
    1.19 +
    1.20 +// Destructor.  The body is intentionally empty.
    1.21 +CharsetRecog_UTF8::~CharsetRecog_UTF8()
    1.22 +{
    1.23 +}
    1.24 +
    1.25 +// Returns the constant name string "UTF-8".
    1.26 +const char *CharsetRecog_UTF8::getName() const {
    1.27 +    return "UTF-8";
    1.28 +}
    1.29 +
    1.30 +// Estimates how likely the raw input bytes are UTF-8.  Passes a confidence of
    1.31 +// 0, 10, 25, 80 or 100 to results->set() and returns whether it is above zero.
    1.32 +UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
    1.33 +    bool hasBOM = FALSE;
    1.34 +    int32_t numValid = 0;
    1.35 +    int32_t numInvalid = 0;
    1.36 +    const uint8_t *inputBytes = input->fRawInput;
    1.37 +    int32_t trailBytes = 0;
    1.38 +
    1.39 +    if (input->fRawLength >= 3 &&
    1.40 +        inputBytes[0] == 0xEF && inputBytes[1] == 0xBB && inputBytes[2] == 0xBF) {
    1.41 +            hasBOM = TRUE;
    1.42 +    }
    1.43 +
    1.44 +    // Scan for multi-byte sequences
    1.45 +    for (int32_t i = 0; i < input->fRawLength; i += 1) {
    1.46 +        int32_t b = inputBytes[i];
    1.47 +
    1.48 +        if ((b & 0x80) == 0) {
    1.49 +            continue;   // ASCII
    1.50 +        }
    1.51 +
    1.52 +        // Hi bit on char found.  Figure out how long the sequence should be
    1.53 +        if ((b & 0x0E0) == 0x0C0) {
    1.54 +            trailBytes = 1;
    1.55 +        } else if ((b & 0x0F0) == 0x0E0) {
    1.56 +            trailBytes = 2;
    1.57 +        } else if ((b & 0x0F8) == 0xF0) {
    1.58 +            trailBytes = 3;
    1.59 +        } else {
    1.60 +            // Not a lead byte (a stray trail byte, or 0xF8..0xFF).
    1.61 +            numInvalid += 1;
    1.62 +            if (numInvalid > 5) {
    1.63 +                break;
    1.64 +            }
    1.65 +            // Skip just this byte.  Entering the trail-byte check with a zero
    1.66 +            // count would swallow the next byte and could count the error twice.
    1.67 +            continue;
    1.68 +        }
    1.69 +
    1.70 +        // Verify that we've got the right number of trail bytes in the sequence
    1.71 +        for (;;) {
    1.72 +            i += 1;
    1.73 +            if (i >= input->fRawLength) {
    1.74 +                // Input ended mid-sequence: counted as neither valid nor invalid.
    1.75 +                break;
    1.76 +            }
    1.77 +
    1.78 +            b = inputBytes[i];
    1.79 +
    1.80 +            if ((b & 0xC0) != 0x080) {
    1.81 +                numInvalid += 1;
    1.82 +                break;
    1.83 +            }
    1.84 +
    1.85 +            if (--trailBytes == 0) {
    1.86 +                numValid += 1;
    1.87 +                break;
    1.88 +            }
    1.89 +        }
    1.90 +    }
    1.91 +
    1.92 +    // Cook up some sort of confidence score, based on presence of a BOM
    1.93 +    //    and the existence of valid and/or invalid multi-byte sequences.
    1.94 +    int32_t confidence = 0;
    1.95 +    if (hasBOM && numInvalid == 0) {
    1.96 +        confidence = 100;
    1.97 +    } else if (hasBOM && numValid > numInvalid*10) {
    1.98 +        confidence = 80;
    1.99 +    } else if (numValid > 3 && numInvalid == 0) {
   1.100 +        confidence = 100;
   1.101 +    } else if (numValid > 0 && numInvalid == 0) {
   1.102 +        confidence = 80;
   1.103 +    } else if (numValid == 0 && numInvalid == 0) {
   1.104 +        // Plain ASCII.
   1.105 +        confidence = 10;
   1.106 +    } else if (numValid > numInvalid*10) {
   1.107 +        // Probably corrupt utf-8 data.  Valid sequences aren't likely by chance.
   1.108 +        confidence = 25;
   1.109 +    }
   1.110 +
   1.111 +    results->set(input, this, confidence);
   1.112 +    return (confidence > 0);
   1.113 +}
   1.114 +
   1.115 +U_NAMESPACE_END
   1.116 +#endif

mercurial