intl/icu/source/i18n/csr2022.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2005-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #if !UCONFIG_NO_CONVERSION
michael@0 11
michael@0 12 #include "cstring.h"
michael@0 13
michael@0 14 #include "csr2022.h"
michael@0 15 #include "csmatch.h"
michael@0 16
michael@0 17 U_NAMESPACE_BEGIN
michael@0 18
// Number of elements in a true array (not a pointer). The argument is
// parenthesized so the expansion does not depend on operator precedence.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
michael@0 20
michael@0 21 /**
michael@0 22 * Matching function shared among the 2022 detectors JP, CN and KR
michael@0 23 * Counts up the number of legal and unrecognized escape sequences in
michael@0 24 * the sample of text, and computes a score based on the total number &
michael@0 25 * the proportion that fit the encoding.
michael@0 26 *
michael@0 27 *
michael@0 28 * @param text the byte buffer containing text to analyse
michael@0 29 * @param textLen the size of the text in the byte.
michael@0 30 * @param escapeSequences the byte escape sequences to test for.
michael@0 31 * @return match quality, in the range of 0-100.
michael@0 32 */
michael@0 33 int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const
michael@0 34 {
michael@0 35 int32_t i, j;
michael@0 36 int32_t escN;
michael@0 37 int32_t hits = 0;
michael@0 38 int32_t misses = 0;
michael@0 39 int32_t shifts = 0;
michael@0 40 int32_t quality;
michael@0 41
michael@0 42 i = 0;
michael@0 43 while(i < textLen) {
michael@0 44 if(text[i] == 0x1B) {
michael@0 45 escN = 0;
michael@0 46 while(escN < escapeSequences_length) {
michael@0 47 const uint8_t *seq = escapeSequences[escN];
michael@0 48 int32_t seq_length = (int32_t)uprv_strlen((const char *) seq);
michael@0 49
michael@0 50 if (textLen-i >= seq_length) {
michael@0 51 j = 1;
michael@0 52 while(j < seq_length) {
michael@0 53 if(seq[j] != text[i+j]) {
michael@0 54 goto checkEscapes;
michael@0 55 }
michael@0 56
michael@0 57 j += 1;
michael@0 58 }
michael@0 59
michael@0 60 hits += 1;
michael@0 61 i += seq_length-1;
michael@0 62 goto scanInput;
michael@0 63 }
michael@0 64 // else we ran out of string to compare this time.
michael@0 65 checkEscapes:
michael@0 66 escN += 1;
michael@0 67 }
michael@0 68
michael@0 69 misses += 1;
michael@0 70 }
michael@0 71
michael@0 72 if( text[i]== 0x0e || text[i] == 0x0f){
michael@0 73 shifts += 1;
michael@0 74 }
michael@0 75
michael@0 76 scanInput:
michael@0 77 i += 1;
michael@0 78 }
michael@0 79
michael@0 80 if (hits == 0) {
michael@0 81 return 0;
michael@0 82 }
michael@0 83
michael@0 84 //
michael@0 85 // Initial quality is based on relative proportion of recongized vs.
michael@0 86 // unrecognized escape sequences.
michael@0 87 // All good: quality = 100;
michael@0 88 // half or less good: quality = 0;
michael@0 89 // linear inbetween.
michael@0 90 quality = (100*hits - 100*misses) / (hits + misses);
michael@0 91
michael@0 92 // Back off quality if there were too few escape sequences seen.
michael@0 93 // Include shifts in this computation, so that KR does not get penalized
michael@0 94 // for having only a single Escape sequence, but many shifts.
michael@0 95 if (hits+shifts < 5) {
michael@0 96 quality -= (5-(hits+shifts))*10;
michael@0 97 }
michael@0 98
michael@0 99 if (quality < 0) {
michael@0 100 quality = 0;
michael@0 101 }
michael@0 102
michael@0 103 return quality;
michael@0 104 }
michael@0 105
michael@0 106
// Escape sequences counted as hits by the ISO-2022-JP recognizer.
// Each row is a NUL-terminated byte string; bytes not listed explicitly are
// zero-filled by aggregate initialization.
static const uint8_t escapeSequences_2022JP[][5] = {
    {0x1B, 0x24, 0x28, 0x43, 0x00},   // KS X 1001:1992
    {0x1B, 0x24, 0x28, 0x44, 0x00},   // JIS X 212-1990
    {0x1B, 0x24, 0x40, 0x00},         // JIS C 6226-1978
    {0x1B, 0x24, 0x41, 0x00},         // GB 2312-80
    {0x1B, 0x24, 0x42, 0x00},         // JIS X 208-1983
    {0x1B, 0x26, 0x40, 0x00},         // JIS X 208 1990, 1997
    {0x1B, 0x28, 0x42, 0x00},         // ASCII
    {0x1B, 0x28, 0x48, 0x00},         // JIS-Roman
    {0x1B, 0x28, 0x49, 0x00},         // Half-width katakana
    {0x1B, 0x28, 0x4A, 0x00},         // JIS-Roman
    {0x1B, 0x2E, 0x41, 0x00},         // ISO 8859-1
    {0x1B, 0x2E, 0x46, 0x00}          // ISO 8859-7
};
michael@0 121
// The single escape sequence counted as a hit by the ISO-2022-KR recognizer,
// stored as a NUL-terminated byte string.
static const uint8_t escapeSequences_2022KR[][5] = {
    {0x1B, 0x24, 0x29, 0x43, 0x00}
};
michael@0 125
// Escape sequences counted as hits by the ISO-2022-CN recognizer.
// Each row is a NUL-terminated byte string; bytes not listed explicitly are
// zero-filled by aggregate initialization.
static const uint8_t escapeSequences_2022CN[][5] = {
    {0x1B, 0x24, 0x29, 0x41, 0x00},   // GB 2312-80
    {0x1B, 0x24, 0x29, 0x47, 0x00},   // CNS 11643-1992 Plane 1
    {0x1B, 0x24, 0x2A, 0x48, 0x00},   // CNS 11643-1992 Plane 2
    {0x1B, 0x24, 0x29, 0x45, 0x00},   // ISO-IR-165
    {0x1B, 0x24, 0x2B, 0x49, 0x00},   // CNS 11643-1992 Plane 3
    {0x1B, 0x24, 0x2B, 0x4A, 0x00},   // CNS 11643-1992 Plane 4
    {0x1B, 0x24, 0x2B, 0x4B, 0x00},   // CNS 11643-1992 Plane 5
    {0x1B, 0x24, 0x2B, 0x4C, 0x00},   // CNS 11643-1992 Plane 6
    {0x1B, 0x24, 0x2B, 0x4D, 0x00},   // CNS 11643-1992 Plane 7
    {0x1B, 0x4E, 0x00},               // SS2
    {0x1B, 0x4F, 0x00}                // SS3
};
michael@0 139
michael@0 140 CharsetRecog_2022JP::~CharsetRecog_2022JP() {}
michael@0 141
michael@0 142 const char *CharsetRecog_2022JP::getName() const {
michael@0 143 return "ISO-2022-JP";
michael@0 144 }
michael@0 145
michael@0 146 UBool CharsetRecog_2022JP::match(InputText *textIn, CharsetMatch *results) const {
michael@0 147 int32_t confidence = match_2022(textIn->fInputBytes,
michael@0 148 textIn->fInputLen,
michael@0 149 escapeSequences_2022JP,
michael@0 150 ARRAY_SIZE(escapeSequences_2022JP));
michael@0 151 results->set(textIn, this, confidence);
michael@0 152 return (confidence > 0);
michael@0 153 }
michael@0 154
michael@0 155 CharsetRecog_2022KR::~CharsetRecog_2022KR() {}
michael@0 156
michael@0 157 const char *CharsetRecog_2022KR::getName() const {
michael@0 158 return "ISO-2022-KR";
michael@0 159 }
michael@0 160
michael@0 161 UBool CharsetRecog_2022KR::match(InputText *textIn, CharsetMatch *results) const {
michael@0 162 int32_t confidence = match_2022(textIn->fInputBytes,
michael@0 163 textIn->fInputLen,
michael@0 164 escapeSequences_2022KR,
michael@0 165 ARRAY_SIZE(escapeSequences_2022KR));
michael@0 166 results->set(textIn, this, confidence);
michael@0 167 return (confidence > 0);
michael@0 168 }
michael@0 169
michael@0 170 CharsetRecog_2022CN::~CharsetRecog_2022CN() {}
michael@0 171
michael@0 172 const char *CharsetRecog_2022CN::getName() const {
michael@0 173 return "ISO-2022-CN";
michael@0 174 }
michael@0 175
michael@0 176 UBool CharsetRecog_2022CN::match(InputText *textIn, CharsetMatch *results) const {
michael@0 177 int32_t confidence = match_2022(textIn->fInputBytes,
michael@0 178 textIn->fInputLen,
michael@0 179 escapeSequences_2022CN,
michael@0 180 ARRAY_SIZE(escapeSequences_2022CN));
michael@0 181 results->set(textIn, this, confidence);
michael@0 182 return (confidence > 0);
michael@0 183 }
michael@0 184
michael@0 185 CharsetRecog_2022::~CharsetRecog_2022() {
michael@0 186 // nothing to do
michael@0 187 }
michael@0 188
michael@0 189 U_NAMESPACE_END
michael@0 190 #endif

mercurial