intl/icu/source/i18n/csr2022.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/csr2022.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,190 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 2005-2012, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + */
    1.10 +
    1.11 +#include "unicode/utypes.h"
    1.12 +
    1.13 +#if !UCONFIG_NO_CONVERSION
    1.14 +
    1.15 +#include "cstring.h"
    1.16 +
    1.17 +#include "csr2022.h"
    1.18 +#include "csmatch.h"
    1.19 +
    1.20 +U_NAMESPACE_BEGIN
    1.21 +
    1.22 +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
    1.23 +
    1.24 +/**
    1.25 + * Matching function shared among the 2022 detectors JP, CN and KR
    1.26 + * Counts up the number of legal and unrecognized escape sequences in
    1.27 + * the sample of text, and computes a score based on the total number &
    1.28 + * the proportion that fit the encoding.
    1.29 + * 
    1.30 + * 
    1.31 + * @param text the byte buffer containing text to analyse
    1.32 + * @param textLen  the size of the text in the byte.
    1.33 + * @param escapeSequences the byte escape sequences to test for.
    1.34 + * @return match quality, in the range of 0-100.
    1.35 + */
    1.36 +int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const
    1.37 +{
    1.38 +    int32_t i, j;
    1.39 +    int32_t escN;
    1.40 +    int32_t hits   = 0;
    1.41 +    int32_t misses = 0;
    1.42 +    int32_t shifts = 0;
    1.43 +    int32_t quality;
    1.44 +
    1.45 +    i = 0;
    1.46 +    while(i < textLen) {
    1.47 +        if(text[i] == 0x1B) {
    1.48 +            escN = 0;
    1.49 +            while(escN < escapeSequences_length) {
    1.50 +                const uint8_t *seq = escapeSequences[escN];
    1.51 +                int32_t seq_length = (int32_t)uprv_strlen((const char *) seq);
    1.52 +
    1.53 +                if (textLen-i >= seq_length) {
    1.54 +                    j = 1;
    1.55 +                    while(j < seq_length) {
    1.56 +                        if(seq[j] != text[i+j]) {
    1.57 +                            goto checkEscapes;
    1.58 +                        }
    1.59 +
    1.60 +                        j += 1;
    1.61 +                    }
    1.62 +
    1.63 +                    hits += 1;
    1.64 +                    i += seq_length-1;
    1.65 +                    goto scanInput;
    1.66 +                }
    1.67 +                // else we ran out of string to compare this time.
    1.68 +checkEscapes:
    1.69 +                escN += 1;
    1.70 +            }
    1.71 +
    1.72 +            misses += 1;
    1.73 +        }
    1.74 +
    1.75 +        if( text[i]== 0x0e || text[i] == 0x0f){
    1.76 +            shifts += 1;
    1.77 +        }
    1.78 +
    1.79 +scanInput:
    1.80 +        i += 1;
    1.81 +    }
    1.82 +
    1.83 +    if (hits == 0) {
    1.84 +        return 0;
    1.85 +    }
    1.86 +
    1.87 +    //
    1.88 +    // Initial quality is based on relative proportion of recongized vs.
    1.89 +    //   unrecognized escape sequences. 
    1.90 +    //   All good:  quality = 100;
    1.91 +    //   half or less good: quality = 0;
    1.92 +    //   linear inbetween.
    1.93 +    quality = (100*hits - 100*misses) / (hits + misses);
    1.94 +
    1.95 +    // Back off quality if there were too few escape sequences seen.
    1.96 +    //   Include shifts in this computation, so that KR does not get penalized
    1.97 +    //   for having only a single Escape sequence, but many shifts.
    1.98 +    if (hits+shifts < 5) {
    1.99 +        quality -= (5-(hits+shifts))*10;
   1.100 +    }
   1.101 +
   1.102 +    if (quality < 0) {
   1.103 +        quality = 0;
   1.104 +    }
   1.105 +
   1.106 +    return quality;
   1.107 +}
   1.108 +
   1.109 +
   1.110 +static const uint8_t escapeSequences_2022JP[][5] = {
   1.111 +    {0x1b, 0x24, 0x28, 0x43, 0x00},   // KS X 1001:1992
   1.112 +    {0x1b, 0x24, 0x28, 0x44, 0x00},   // JIS X 212-1990
   1.113 +    {0x1b, 0x24, 0x40, 0x00, 0x00},   // JIS C 6226-1978
   1.114 +    {0x1b, 0x24, 0x41, 0x00, 0x00},   // GB 2312-80
   1.115 +    {0x1b, 0x24, 0x42, 0x00, 0x00},   // JIS X 208-1983
   1.116 +    {0x1b, 0x26, 0x40, 0x00, 0x00},   // JIS X 208 1990, 1997
   1.117 +    {0x1b, 0x28, 0x42, 0x00, 0x00},   // ASCII
   1.118 +    {0x1b, 0x28, 0x48, 0x00, 0x00},   // JIS-Roman
   1.119 +    {0x1b, 0x28, 0x49, 0x00, 0x00},   // Half-width katakana
   1.120 +    {0x1b, 0x28, 0x4a, 0x00, 0x00},   // JIS-Roman
   1.121 +    {0x1b, 0x2e, 0x41, 0x00, 0x00},   // ISO 8859-1
   1.122 +    {0x1b, 0x2e, 0x46, 0x00, 0x00}    // ISO 8859-7
   1.123 +};
   1.124 +
   1.125 +static const uint8_t escapeSequences_2022KR[][5] = {
   1.126 +    {0x1b, 0x24, 0x29, 0x43, 0x00}   
   1.127 +};
   1.128 +
   1.129 +static const uint8_t escapeSequences_2022CN[][5] = {
   1.130 +    {0x1b, 0x24, 0x29, 0x41, 0x00},   // GB 2312-80
   1.131 +    {0x1b, 0x24, 0x29, 0x47, 0x00},   // CNS 11643-1992 Plane 1
   1.132 +    {0x1b, 0x24, 0x2A, 0x48, 0x00},   // CNS 11643-1992 Plane 2
   1.133 +    {0x1b, 0x24, 0x29, 0x45, 0x00},   // ISO-IR-165
   1.134 +    {0x1b, 0x24, 0x2B, 0x49, 0x00},   // CNS 11643-1992 Plane 3
   1.135 +    {0x1b, 0x24, 0x2B, 0x4A, 0x00},   // CNS 11643-1992 Plane 4
   1.136 +    {0x1b, 0x24, 0x2B, 0x4B, 0x00},   // CNS 11643-1992 Plane 5
   1.137 +    {0x1b, 0x24, 0x2B, 0x4C, 0x00},   // CNS 11643-1992 Plane 6
   1.138 +    {0x1b, 0x24, 0x2B, 0x4D, 0x00},   // CNS 11643-1992 Plane 7
   1.139 +    {0x1b, 0x4e, 0x00, 0x00, 0x00},   // SS2
   1.140 +    {0x1b, 0x4f, 0x00, 0x00, 0x00},   // SS3
   1.141 +};
   1.142 +
   1.143 +CharsetRecog_2022JP::~CharsetRecog_2022JP() {}
   1.144 +
   1.145 +const char *CharsetRecog_2022JP::getName() const {
   1.146 +    return "ISO-2022-JP";
   1.147 +}
   1.148 +
   1.149 +UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const {
   1.150 +    int32_t confidence = match_2022(textIn->fInputBytes, 
   1.151 +                                    textIn->fInputLen, 
   1.152 +                                    escapeSequences_2022JP, 
   1.153 +                                    ARRAY_SIZE(escapeSequences_2022JP));
   1.154 +    results->set(textIn, this, confidence);
   1.155 +    return (confidence > 0);
   1.156 +}
   1.157 +
   1.158 +CharsetRecog_2022KR::~CharsetRecog_2022KR() {}
   1.159 +
   1.160 +const char *CharsetRecog_2022KR::getName() const {
   1.161 +    return "ISO-2022-KR";
   1.162 +}
   1.163 +
   1.164 +UBool CharsetRecog_2022KR::match(InputText *textIn, CharsetMatch *results) const {
   1.165 +    int32_t confidence = match_2022(textIn->fInputBytes, 
   1.166 +                                    textIn->fInputLen, 
   1.167 +                                    escapeSequences_2022KR, 
   1.168 +                                    ARRAY_SIZE(escapeSequences_2022KR));
   1.169 +    results->set(textIn, this, confidence);
   1.170 +    return (confidence > 0);
   1.171 +}
   1.172 +
   1.173 +CharsetRecog_2022CN::~CharsetRecog_2022CN() {}
   1.174 +
   1.175 +const char *CharsetRecog_2022CN::getName() const {
   1.176 +    return "ISO-2022-CN";
   1.177 +}
   1.178 +
   1.179 +UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const {
   1.180 +    int32_t confidence = match_2022(textIn->fInputBytes,
   1.181 +                                    textIn->fInputLen,
   1.182 +                                    escapeSequences_2022CN,
   1.183 +                                    ARRAY_SIZE(escapeSequences_2022CN));
   1.184 +    results->set(textIn, this, confidence);
   1.185 +    return (confidence > 0);
   1.186 +}
   1.187 +
   1.188 +CharsetRecog_2022::~CharsetRecog_2022() {
   1.189 +    // nothing to do
   1.190 +}
   1.191 +
   1.192 +U_NAMESPACE_END
   1.193 +#endif

mercurial