intl/icu/source/i18n/csrsbcs.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2005-2013, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 */
michael@0 7
michael@0 8 #include "unicode/utypes.h"
michael@0 9
michael@0 10 #include "cmemory.h"
michael@0 11
michael@0 12 #if !UCONFIG_NO_CONVERSION
michael@0 13 #include "csrsbcs.h"
michael@0 14 #include "csmatch.h"
michael@0 15
michael@0 16 #define N_GRAM_SIZE 3
michael@0 17 #define N_GRAM_MASK 0xFFFFFF
michael@0 18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
michael@0 19
michael@0 20 U_NAMESPACE_BEGIN
michael@0 21
michael@0 22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
michael@0 23 : ngram(0), byteIndex(0)
michael@0 24 {
michael@0 25 ngramList = theNgramList;
michael@0 26 charMap = theCharMap;
michael@0 27
michael@0 28 ngramCount = hitCount = 0;
michael@0 29 }
michael@0 30
michael@0 31 /*
michael@0 32 * Binary search for value in table, which must have exactly 64 entries.
michael@0 33 */
michael@0 34
michael@0 35 int32_t NGramParser::search(const int32_t *table, int32_t value)
michael@0 36 {
michael@0 37 int32_t index = 0;
michael@0 38
michael@0 39 if (table[index + 32] <= value) {
michael@0 40 index += 32;
michael@0 41 }
michael@0 42
michael@0 43 if (table[index + 16] <= value) {
michael@0 44 index += 16;
michael@0 45 }
michael@0 46
michael@0 47 if (table[index + 8] <= value) {
michael@0 48 index += 8;
michael@0 49 }
michael@0 50
michael@0 51 if (table[index + 4] <= value) {
michael@0 52 index += 4;
michael@0 53 }
michael@0 54
michael@0 55 if (table[index + 2] <= value) {
michael@0 56 index += 2;
michael@0 57 }
michael@0 58
michael@0 59 if (table[index + 1] <= value) {
michael@0 60 index += 1;
michael@0 61 }
michael@0 62
michael@0 63 if (table[index] > value) {
michael@0 64 index -= 1;
michael@0 65 }
michael@0 66
michael@0 67 if (index < 0 || table[index] != value) {
michael@0 68 return -1;
michael@0 69 }
michael@0 70
michael@0 71 return index;
michael@0 72 }
michael@0 73
michael@0 74 void NGramParser::lookup(int32_t thisNgram)
michael@0 75 {
michael@0 76 ngramCount += 1;
michael@0 77
michael@0 78 if (search(ngramList, thisNgram) >= 0) {
michael@0 79 hitCount += 1;
michael@0 80 }
michael@0 81
michael@0 82 }
michael@0 83
michael@0 84 void NGramParser::addByte(int32_t b)
michael@0 85 {
michael@0 86 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
michael@0 87 lookup(ngram);
michael@0 88 }
michael@0 89
michael@0 90 int32_t NGramParser::nextByte(InputText *det)
michael@0 91 {
michael@0 92 if (byteIndex >= det->fInputLen) {
michael@0 93 return -1;
michael@0 94 }
michael@0 95
michael@0 96 return det->fInputBytes[byteIndex++];
michael@0 97 }
michael@0 98
michael@0 99 void NGramParser::parseCharacters(InputText *det)
michael@0 100 {
michael@0 101 int32_t b;
michael@0 102 bool ignoreSpace = FALSE;
michael@0 103
michael@0 104 while ((b = nextByte(det)) >= 0) {
michael@0 105 uint8_t mb = charMap[b];
michael@0 106
michael@0 107 // TODO: 0x20 might not be a space in all character sets...
michael@0 108 if (mb != 0) {
michael@0 109 if (!(mb == 0x20 && ignoreSpace)) {
michael@0 110 addByte(mb);
michael@0 111 }
michael@0 112
michael@0 113 ignoreSpace = (mb == 0x20);
michael@0 114 }
michael@0 115 }
michael@0 116 }
michael@0 117
michael@0 118 int32_t NGramParser::parse(InputText *det)
michael@0 119 {
michael@0 120 parseCharacters(det);
michael@0 121
michael@0 122 // TODO: Is this OK? The buffer could have ended in the middle of a word...
michael@0 123 addByte(0x20);
michael@0 124
michael@0 125 double rawPercent = (double) hitCount / (double) ngramCount;
michael@0 126
michael@0 127 // if (rawPercent <= 2.0) {
michael@0 128 // return 0;
michael@0 129 // }
michael@0 130
michael@0 131 // TODO - This is a bit of a hack to take care of a case
michael@0 132 // were we were getting a confidence of 135...
michael@0 133 if (rawPercent > 0.33) {
michael@0 134 return 98;
michael@0 135 }
michael@0 136
michael@0 137 return (int32_t) (rawPercent * 300.0);
michael@0 138 }
michael@0 139
// unshapeMap_IBM420: 256-entry byte-to-byte table, indexed by input byte
// (row = high nibble, column = low nibble, as labelled below).
// NGramParser_IBM420::nextByte() passes every input byte that isLamAlef()
// does not recognise through this table before the charMap lookup.
// Bytes 0x00-0x3F and 0x41 all yield 0x40; most other bytes yield themselves,
// and a number yield a nearby value instead (e.g. 0x43 -> 0x42, 0x48 -> 0x47).
// NOTE(review): presumably this collapses the shaped (positional) Arabic
// forms of code page 420 onto one form per letter - confirm against the chart.
michael@0 140 static const uint8_t unshapeMap_IBM420[] = {
michael@0 141 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
michael@0 142 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 143 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 144 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 145 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 146 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
michael@0 147 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
michael@0 148 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 149 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
michael@0 150 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
michael@0 151 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
michael@0 152 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
michael@0 153 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
michael@0 154 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
michael@0 155 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
michael@0 156 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 157 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
michael@0 158 };
michael@0 159
michael@0 160 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
michael@0 161 {
michael@0 162 alef = 0x00;
michael@0 163 }
michael@0 164
michael@0 165
michael@0 166 int32_t NGramParser_IBM420::isLamAlef(int32_t b)
michael@0 167 {
michael@0 168 if(b == 0xB2 || b == 0xB3){
michael@0 169 return 0x47;
michael@0 170 }else if(b == 0xB4 || b == 0xB5){
michael@0 171 return 0x49;
michael@0 172 }else if(b == 0xB8 || b == 0xB9){
michael@0 173 return 0x56;
michael@0 174 }else
michael@0 175 return 0x00;
michael@0 176 }
michael@0 177
michael@0 178 /*
michael@0 179 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
michael@0 180 * because CharsetDetector is dealing with bytes not Unicode code points. We could
michael@0 181 * convert the bytes to Unicode code points but that would leave us dependent
michael@0 182 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
michael@0 183 * of JDK can produce different results and therefore is also avoided.
michael@0 184 */
michael@0 185 int32_t NGramParser_IBM420::nextByte(InputText *det)
michael@0 186 {
michael@0 187
michael@0 188 if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
michael@0 189 return -1;
michael@0 190 }
michael@0 191 int next;
michael@0 192
michael@0 193 alef = isLamAlef(det->fInputBytes[byteIndex]);
michael@0 194 if(alef != 0x00)
michael@0 195 next = 0xB1 & 0xFF;
michael@0 196 else
michael@0 197 next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
michael@0 198
michael@0 199 byteIndex++;
michael@0 200
michael@0 201 return next;
michael@0 202 }
michael@0 203
michael@0 204 void NGramParser_IBM420::parseCharacters(InputText *det)
michael@0 205 {
michael@0 206 int32_t b;
michael@0 207 bool ignoreSpace = FALSE;
michael@0 208
michael@0 209 while ((b = nextByte(det)) >= 0) {
michael@0 210 uint8_t mb = charMap[b];
michael@0 211
michael@0 212 // TODO: 0x20 might not be a space in all character sets...
michael@0 213 if (mb != 0) {
michael@0 214 if (!(mb == 0x20 && ignoreSpace)) {
michael@0 215 addByte(mb);
michael@0 216 }
michael@0 217 ignoreSpace = (mb == 0x20);
michael@0 218 }
michael@0 219
michael@0 220 if(alef != 0x00){
michael@0 221 mb = charMap[alef & 0xFF];
michael@0 222
michael@0 223 // TODO: 0x20 might not be a space in all character sets...
michael@0 224 if (mb != 0) {
michael@0 225 if (!(mb == 0x20 && ignoreSpace)) {
michael@0 226 addByte(mb);
michael@0 227 }
michael@0 228
michael@0 229 ignoreSpace = (mb == 0x20);
michael@0 230 }
michael@0 231
michael@0 232 }
michael@0 233 }
michael@0 234 }
michael@0 235
michael@0 236 CharsetRecog_sbcs::CharsetRecog_sbcs()
michael@0 237 {
michael@0 238 // nothing else to do
michael@0 239 }
michael@0 240
michael@0 241 CharsetRecog_sbcs::~CharsetRecog_sbcs()
michael@0 242 {
michael@0 243 // nothing to do
michael@0 244 }
michael@0 245
michael@0 246 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
michael@0 247 {
michael@0 248 NGramParser parser(ngrams, byteMap);
michael@0 249 int32_t result;
michael@0 250
michael@0 251 result = parser.parse(det);
michael@0 252
michael@0 253 return result;
michael@0 254 }
michael@0 255
// charMap_8859_1: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xFF: 0xC0-0xDE yield 0xE0-0xFE and 0xE0-0xFF yield themselves, except
// that 0xD7 and 0xF7 yield 0x20; 0xAA, 0xB5, 0xBA and 0xDF yield themselves;
// the rest yield 0x20.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests ISO-8859-1 with case folding - confirm.
michael@0 256 static const uint8_t charMap_8859_1[] = {
michael@0 257 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 258 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 259 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 260 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 261 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 262 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 263 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 264 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 265 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 266 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 267 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 268 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 269 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 270 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 271 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 272 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 273 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 274 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 275 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 276 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 277 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 278 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 279 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
michael@0 280 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 281 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 282 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 283 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
michael@0 284 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
michael@0 285 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 286 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 287 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
michael@0 288 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
michael@0 289 };
michael@0 290
// charMap_8859_2: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xFF: 0x80-0x9F yield 0x20; 0xA0-0xAF mostly yield the value 0x10
// higher (e.g. 0xA1 -> 0xB1) or 0x20; 0xC0-0xDE yield 0xE0-0xFE (0xD7 -> 0x20);
// most of 0xB0-0xBF and 0xE0-0xFE yield themselves; 0xF7 and 0xFF yield 0x20.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests ISO-8859-2 with case folding - confirm.
michael@0 291 static const uint8_t charMap_8859_2[] = {
michael@0 292 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 293 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 294 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 295 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 296 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 297 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 298 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 299 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 300 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 301 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 302 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 303 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 304 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 305 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 306 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 307 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 308 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 309 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 310 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 311 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 312 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
michael@0 313 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
michael@0 314 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
michael@0 315 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
michael@0 316 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 317 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 318 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
michael@0 319 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
michael@0 320 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 321 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 322 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
michael@0 323 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
michael@0 324 };
michael@0 325
// charMap_8859_5: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xFF: 0x80-0x9F yield 0x20; 0xA1-0xAF yield 0xF1-0xFF and 0xB0-0xCF
// yield 0xD0-0xEF; 0xD0-0xEF and 0xF1-0xFF yield themselves; 0xA0, 0xAD, 0xF0
// and 0xFD yield 0x20.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests ISO-8859-5 (Cyrillic) with case folding -
// confirm.
michael@0 326 static const uint8_t charMap_8859_5[] = {
michael@0 327 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 328 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 329 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 330 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 331 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 332 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 333 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 334 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 335 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 336 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 337 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 338 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 339 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 340 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 341 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 342 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 343 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 344 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 345 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 346 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 347 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
michael@0 348 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
michael@0 349 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
michael@0 350 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
michael@0 351 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 352 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 353 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
michael@0 354 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
michael@0 355 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 356 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 357 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
michael@0 358 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
michael@0 359 };
michael@0 360
// charMap_8859_6: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xFF: only 0xC1-0xDA and 0xE0-0xEA yield themselves; every other byte
// yields 0x20.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests ISO-8859-6 (Arabic) - confirm.
michael@0 361 static const uint8_t charMap_8859_6[] = {
michael@0 362 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 363 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 364 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 365 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 366 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 367 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 368 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 369 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 370 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 371 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 372 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 373 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 374 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 375 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 376 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 377 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 378 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 379 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 380 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 381 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 382 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 383 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 384 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 385 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 386 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
michael@0 387 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
michael@0 388 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
michael@0 389 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 390 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 391 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 392 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 393 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 394 };
michael@0 395
// charMap_8859_7: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xFF: 0x80-0xA0 yield 0x20; 0xA1 and 0xA2 yield themselves; a few bytes
// in 0xB6-0xBF yield values in 0xDC-0xDF / 0xFC-0xFE; 0xC1-0xDB mostly yield
// the value 0x20 higher (0xD2 -> 0x20, 0xC0 -> 0xC0); 0xDC-0xFE yield
// themselves; 0xFF yields 0x20.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests ISO-8859-7 (Greek) with case folding -
// confirm.
michael@0 396 static const uint8_t charMap_8859_7[] = {
michael@0 397 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 398 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 399 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 400 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 401 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 402 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 403 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 404 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 405 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 406 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 407 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 408 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 409 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 410 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 411 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 412 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 413 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 414 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 415 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 416 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 417 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 418 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 419 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
michael@0 420 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
michael@0 421 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 422 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 423 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
michael@0 424 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
michael@0 425 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 426 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 427 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
michael@0 428 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
michael@0 429 };
michael@0 430
// charMap_8859_8: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xFF: only 0xB5 and 0xE0-0xFA yield themselves; every other byte
// yields 0x20.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests ISO-8859-8 (Hebrew) - confirm.
michael@0 431 static const uint8_t charMap_8859_8[] = {
michael@0 432 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 433 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 434 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 435 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 436 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 437 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 438 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 439 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 440 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 441 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 442 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 443 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 444 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 445 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 446 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 447 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 448 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 449 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 450 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 451 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 452 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 453 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 454 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
michael@0 455 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 456 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 457 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 458 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 459 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 460 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 461 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 462 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
michael@0 463 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 464 };
michael@0 465
// charMap_8859_9: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xFF: 0xC0-0xDE yield 0xE0-0xFE and 0xE0-0xFF yield themselves, except
// that 0xD7 and 0xF7 yield 0x20 and 0xDD yields 0x69; 0xAA, 0xB5, 0xBA and
// 0xDF yield themselves; the rest yield 0x20.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests ISO-8859-9 (Turkish) with case folding;
// 0xDD -> 0x69 is presumably dotted capital I folding to ASCII 'i' - confirm.
michael@0 466 static const uint8_t charMap_8859_9[] = {
michael@0 467 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 468 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 469 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 470 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 471 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 472 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 473 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 474 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 475 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 476 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 477 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 478 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 479 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 480 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 481 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 482 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 483 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 484 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 485 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 486 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 487 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 488 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 489 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
michael@0 490 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 491 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 492 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 493 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
michael@0 494 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
michael@0 495 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 496 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 497 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
michael@0 498 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
michael@0 499 };
michael@0 500
// ngrams_windows_1251: trigram table of 64 values (4 rows of 16), listed in
// ascending order. Each value packs three mapped bytes as 0xAABBCC, the same
// layout NGramParser::addByte() builds; NGramParser::search() relies on the
// table holding exactly 64 entries.
// NOTE(review): presumably the most frequent trigrams of the language this
// code page is matched against - confirm with the table's generator.
michael@0 501 static const int32_t ngrams_windows_1251[] = {
michael@0 502 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
michael@0 503 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
michael@0 504 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
michael@0 505 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
michael@0 506 };
michael@0 507
// charMap_windows_1251: 256-entry byte map (32 rows of 8), indexed by input
// byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xBF: a scattering of bytes yield themselves or another byte in this
// range (e.g. 0x80 -> 0x90, 0x81 -> 0x83, 0xA3 -> 0xBC); the rest yield 0x20.
// 0xC0-0xDF yield 0xE0-0xFF and 0xE0-0xFF yield themselves.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests windows-1251 (Cyrillic) with case folding -
// confirm.
michael@0 508 static const uint8_t charMap_windows_1251[] = {
michael@0 509 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 510 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 511 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 512 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 513 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 514 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 515 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 516 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 517 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 518 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 519 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 520 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 521 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 522 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 523 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 524 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 525 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
michael@0 526 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
michael@0 527 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 528 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
michael@0 529 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
michael@0 530 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
michael@0 531 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
michael@0 532 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
michael@0 533 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 534 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 535 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
michael@0 536 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
michael@0 537 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 538 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 539 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
michael@0 540 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
michael@0 541 };
michael@0 542
// ngrams_windows_1256: trigram table of 64 values (4 rows of 16), listed in
// ascending order. Each value packs three mapped bytes as 0xAABBCC, the same
// layout NGramParser::addByte() builds; NGramParser::search() relies on the
// table holding exactly 64 entries.
// NOTE(review): presumably the most frequent trigrams of the language this
// code page is matched against - confirm with the table's generator.
michael@0 543 static const int32_t ngrams_windows_1256[] = {
michael@0 544 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
michael@0 545 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
michael@0 546 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
michael@0 547 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
michael@0 548 };
michael@0 549
// charMap_windows_1256: 256-entry byte map (32 rows of 8), indexed by input
// byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xBF: a scattering of bytes yield themselves (0x8C yields 0x9C); the
// rest yield 0x20. 0xC0-0xEF yield themselves except 0xD7, which yields 0x20.
// 0xF0-0xFF: only 0xF4, 0xF9, 0xFB, 0xFC and 0xFF yield themselves.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests windows-1256 (Arabic) - confirm.
michael@0 550 static const uint8_t charMap_windows_1256[] = {
michael@0 551 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 552 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 553 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 554 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 555 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 556 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 557 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 558 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 559 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 560 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 561 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 562 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 563 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 564 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 565 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 566 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 567 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
michael@0 568 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
michael@0 569 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 570 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
michael@0 571 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 572 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 573 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
michael@0 574 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 575 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
michael@0 576 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
michael@0 577 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
michael@0 578 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
michael@0 579 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
michael@0 580 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
michael@0 581 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
michael@0 582 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
michael@0 583 };
michael@0 584
// ngrams_KOI8_R: trigram table of 64 values (4 rows of 16), listed in
// ascending order. Each value packs three mapped bytes as 0xAABBCC, the same
// layout NGramParser::addByte() builds; NGramParser::search() relies on the
// table holding exactly 64 entries.
// NOTE(review): presumably the most frequent trigrams of the language this
// code page is matched against - confirm with the table's generator.
michael@0 585 static const int32_t ngrams_KOI8_R[] = {
michael@0 586 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
michael@0 587 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
michael@0 588 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
michael@0 589 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
michael@0 590 };
michael@0 591
// charMap_KOI8_R: 256-entry byte map (32 rows of 8), indexed by input byte.
// 0x00-0x7F: letters 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields
// 0x00, every other byte yields 0x20.
// 0x80-0xBF: 0xA3 and 0xB3 both yield 0xA3; every other byte yields 0x20.
// 0xC0-0xDF yield themselves and 0xE0-0xFF yield 0xC0-0xDF.
// When used as an NGramParser charMap, bytes mapped to 0x00 are skipped and
// runs of 0x20 collapse to one (see NGramParser::parseCharacters).
// NOTE(review): the name suggests KOI8-R (Cyrillic) with case folding -
// confirm.
michael@0 592 static const uint8_t charMap_KOI8_R[] = {
michael@0 593 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 594 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 595 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 596 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 597 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
michael@0 598 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 599 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 600 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 601 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 602 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 603 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 604 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 605 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
michael@0 606 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
michael@0 607 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
michael@0 608 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 609 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 610 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 611 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 612 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 613 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
michael@0 614 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 615 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
michael@0 616 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
michael@0 617 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
michael@0 618 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
michael@0 619 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
michael@0 620 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
michael@0 621 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
michael@0 622 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
michael@0 623 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
michael@0 624 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
michael@0 625 };
michael@0 626
// N-gram table for IBM424 (the "he_rtl" variant): 64 values, each packing
// three bytes into the low 24 bits of an int32_t, listed in ascending order.
// The consumer is outside this part of the file; judging by the name it
// serves right-to-left Hebrew text -- TODO confirm.
michael@0 627 static const int32_t ngrams_IBM424_he_rtl[] = {
michael@0 628 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
michael@0 629 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
michael@0 630 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
michael@0 631 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
michael@0 632 };
michael@0 633
// N-gram table for IBM424 (the "he_ltr" variant): 64 values, each packing
// three bytes into the low 24 bits of an int32_t, listed in ascending order.
// The consumer is outside this part of the file; judging by the name it
// serves left-to-right (visually ordered) Hebrew text -- TODO confirm.
michael@0 634 static const int32_t ngrams_IBM424_he_ltr[] = {
michael@0 635 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
michael@0 636 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
michael@0 637 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
michael@0 638 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
michael@0 639 };
michael@0 640
// 256-entry byte map for IBM424, laid out as a 16x16 grid: the row label is
// the high nibble and the column header the low nibble of the position
// (presumably the raw input byte -- TODO confirm against the consumer).
//   - the filler value is 0x40;
//   - position 0x7D holds 0x00;
//   - positions 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 hold the same values as
//     0x81-0x89, 0x91-0x99 and 0xA2-0xA9 respectively;
//   - all remaining non-filler positions hold their own value.
michael@0 641 static const uint8_t charMap_IBM424_he[] = {
michael@0 642 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
michael@0 643 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 644 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 645 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 646 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 647 /* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 648 /* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 649 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 650 /* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
michael@0 651 /* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 652 /* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 653 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 654 /* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 655 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 656 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 657 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 658 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 659 };
michael@0 660
// N-gram table for IBM420 (the "ar_rtl" variant): 64 values, each packing
// three bytes into the low 24 bits of an int32_t, listed in ascending order.
// The consumer is outside this part of the file; judging by the name it
// serves right-to-left Arabic text -- TODO confirm.
michael@0 661 static const int32_t ngrams_IBM420_ar_rtl[] = {
michael@0 662 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
michael@0 663 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
michael@0 664 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
michael@0 665 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
michael@0 666 };
michael@0 667
// N-gram table for IBM420 (the "ar_ltr" variant): 64 values, each packing
// three bytes into the low 24 bits of an int32_t, listed in ascending order.
// The consumer is outside this part of the file; judging by the name it
// serves left-to-right (visually ordered) Arabic text -- TODO confirm.
michael@0 668 static const int32_t ngrams_IBM420_ar_ltr[] = {
michael@0 669 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
michael@0 670 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
michael@0 671 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
michael@0 672 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
michael@0 673 };
michael@0 674
// 256-entry byte map for IBM420, laid out as a 16x16 grid: the row label is
// the high nibble and the column header the low nibble of the position
// (presumably the raw input byte -- TODO confirm against the consumer).
//   - the filler value is 0x40;
//   - positions 0xC1-0xC9, 0xD1-0xD9 and 0xE2-0xE9 hold the same values as
//     0x81-0x89, 0x91-0x99 and 0xA2-0xA9 respectively;
//   - all remaining non-filler positions hold their own value.
// Unlike charMap_IBM424_he, no position in this table holds 0x00.
michael@0 675 static const uint8_t charMap_IBM420_ar[]= {
michael@0 676 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
michael@0 677 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 678 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 679 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 680 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 681 /* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 682 /* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 683 /* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 684 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
michael@0 685 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
michael@0 686 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
michael@0 687 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
michael@0 688 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
michael@0 689 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
michael@0 690 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
michael@0 691 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
michael@0 692 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
michael@0 693 };
michael@0 694
michael@0 695 // N-gram tables for ISO-8859-1, -2, -5, -6, -7, -8 and -9.
michael@0 696
// Pairs one n-gram table with a language tag. The ISO-8859-1 and ISO-8859-2
// recognizers keep an array of these and try each entry in turn.
michael@0 697 struct NGramsPlusLang {
// Exactly 64 packed n-gram values (the binary search in NGramParser::search
// is documented as requiring a 64-entry table).
michael@0 698 const int32_t ngrams[64];
// Language tag handed to CharsetMatch::set() when this table is selected,
// e.g. "en" or "de".
michael@0 699 const char * lang;
michael@0 700 };
michael@0 701
// Per-language n-gram tables for ISO-8859-1 / windows-1252. Each element holds
// 64 packed three-byte values plus the language tag that
// CharsetRecog_8859_1::match() reports when that element scores best.
// Example from the "en" table: 0x746865 is the bytes 0x74 0x68 0x65 ("the").
// Languages, in order: en, da, de, es, fr, it, nl, no, pt, sv.
michael@0 702 static const NGramsPlusLang ngrams_8859_1[] = {
michael@0 703 {
michael@0 704 {
michael@0 705 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
michael@0 706 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
michael@0 707 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
michael@0 708 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
michael@0 709 },
michael@0 710 "en"
michael@0 711 },
michael@0 712 {
michael@0 713 {
michael@0 714 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
michael@0 715 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
michael@0 716 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
michael@0 717 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
michael@0 718 },
michael@0 719 "da"
michael@0 720 },
michael@0 721 {
michael@0 722 {
michael@0 723 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
michael@0 724 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
michael@0 725 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
michael@0 726 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
michael@0 727 },
michael@0 728 "de"
michael@0 729 },
michael@0 730 {
michael@0 731 {
michael@0 732 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
michael@0 733 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
michael@0 734 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
michael@0 735 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
michael@0 736 },
michael@0 737 "es"
michael@0 738 },
michael@0 739 {
michael@0 740 {
michael@0 741 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
michael@0 742 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
michael@0 743 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
michael@0 744 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
michael@0 745 },
michael@0 746 "fr"
michael@0 747 },
michael@0 748 {
michael@0 749 {
michael@0 750 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
michael@0 751 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
michael@0 752 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
michael@0 753 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
michael@0 754 },
michael@0 755 "it"
michael@0 756 },
michael@0 757 {
michael@0 758 {
michael@0 759 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
michael@0 760 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
michael@0 761 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
michael@0 762 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
michael@0 763 },
michael@0 764 "nl"
michael@0 765 },
michael@0 766 {
michael@0 767 {
michael@0 768 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
michael@0 769 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
michael@0 770 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
michael@0 771 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
michael@0 772 },
michael@0 773 "no"
michael@0 774 },
michael@0 775 {
michael@0 776 {
michael@0 777 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
michael@0 778 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
michael@0 779 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
michael@0 780 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
michael@0 781 },
michael@0 782 "pt"
michael@0 783 },
michael@0 784 {
michael@0 785 {
michael@0 786 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
michael@0 787 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
michael@0 788 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
michael@0 789 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
michael@0 790 },
michael@0 791 "sv"
michael@0 792 }
michael@0 793 };
michael@0 794
michael@0 795
// Per-language n-gram tables for ISO-8859-2 / windows-1250. Each element holds
// 64 packed three-byte values plus the language tag that
// CharsetRecog_8859_2::match() reports when that element scores best.
// Languages, in order: cs, hu, pl, ro.
michael@0 796 static const NGramsPlusLang ngrams_8859_2[] = {
michael@0 797 {
michael@0 798 {
michael@0 799 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
michael@0 800 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
michael@0 801 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
michael@0 802 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
michael@0 803 },
michael@0 804 "cs"
michael@0 805 },
michael@0 806 {
michael@0 807 {
michael@0 808 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
michael@0 809 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
michael@0 810 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
michael@0 811 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
michael@0 812 },
michael@0 813 "hu"
michael@0 814 },
michael@0 815 {
michael@0 816 {
michael@0 817 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
michael@0 818 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
michael@0 819 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
michael@0 820 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
michael@0 821 },
michael@0 822 "pl"
michael@0 823 },
michael@0 824 {
michael@0 825 {
michael@0 826 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
michael@0 827 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
michael@0 828 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
michael@0 829 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
michael@0 830 },
michael@0 831 "ro"
michael@0 832 }
michael@0 833 };
michael@0 834
// N-gram table for ISO-8859-5: 64 packed three-byte values in ascending order.
// Passed to match_sbcs() with charMap_8859_5 by CharsetRecog_8859_5_ru::match().
michael@0 835 static const int32_t ngrams_8859_5_ru[] = {
michael@0 836 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
michael@0 837 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
michael@0 838 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
michael@0 839 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
michael@0 840 };
michael@0 841
// N-gram table for ISO-8859-6: 64 packed three-byte values in ascending order.
// Passed to match_sbcs() with charMap_8859_6 by CharsetRecog_8859_6_ar::match().
michael@0 842 static const int32_t ngrams_8859_6_ar[] = {
michael@0 843 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
michael@0 844 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
michael@0 845 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
michael@0 846 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
michael@0 847 };
michael@0 848
// N-gram table for ISO-8859-7: 64 packed three-byte values in ascending order.
// Passed to match_sbcs() with charMap_8859_7 by CharsetRecog_8859_7_el::match().
michael@0 849 static const int32_t ngrams_8859_7_el[] = {
michael@0 850 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
michael@0 851 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
michael@0 852 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
michael@0 853 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
michael@0 854 };
michael@0 855
// N-gram table for ISO-8859-8-I: 64 packed three-byte values in ascending
// order. Passed to match_sbcs() with charMap_8859_8 by
// CharsetRecog_8859_8_I_he::match().
michael@0 856 static const int32_t ngrams_8859_8_I_he[] = {
michael@0 857 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
michael@0 858 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
michael@0 859 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
michael@0 860 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
michael@0 861 };
michael@0 862
// N-gram table for ISO-8859-8: 64 packed three-byte values in ascending order.
// Passed to match_sbcs() with charMap_8859_8 by CharsetRecog_8859_8_he::match().
michael@0 863 static const int32_t ngrams_8859_8_he[] = {
michael@0 864 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
michael@0 865 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
michael@0 866 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
michael@0 867 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
michael@0 868 };
michael@0 869
// N-gram table for ISO-8859-9: 64 packed three-byte values in ascending order.
// Passed to match_sbcs() with charMap_8859_9 by CharsetRecog_8859_9_tr::match().
michael@0 870 static const int32_t ngrams_8859_9_tr[] = {
michael@0 871 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
michael@0 872 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
michael@0 873 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
michael@0 874 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
michael@0 875 };
michael@0 876
// Destructor for the ISO-8859-1 recognizer; the body is intentionally empty.
michael@0 877 CharsetRecog_8859_1::~CharsetRecog_8859_1()
michael@0 878 {
michael@0 879 // nothing to do
michael@0 880 }
michael@0 881
/*
 * Scores the input against every language table in ngrams_8859_1 and records
 * the best-scoring one in results.
 *
 * textIn  - input to examine; its fC1Bytes flag selects the reported charset
 *           name ("windows-1252" when set, "ISO-8859-1" otherwise).
 * results - updated through set() each time a table beats the best
 *           confidence seen so far.
 *
 * Returns whether the best confidence found is greater than zero.
 */
michael@0 882 UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
michael@0 883 const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
michael@0 884 uint32_t i;
// Starting at -1 means a table that scores 0 is still recorded when nothing
// better has been seen yet.
michael@0 885 int32_t bestConfidenceSoFar = -1;
michael@0 886 for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
michael@0 887 const int32_t *ngrams = ngrams_8859_1[i].ngrams;
michael@0 888 const char *lang = ngrams_8859_1[i].lang;
michael@0 889 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
// Strictly greater: on a tie the earlier table in the array is kept.
michael@0 890 if (confidence > bestConfidenceSoFar) {
michael@0 891 results->set(textIn, this, confidence, name, lang);
michael@0 892 bestConfidenceSoFar = confidence;
michael@0 893 }
michael@0 894 }
michael@0 895 return (bestConfidenceSoFar > 0);
michael@0 896 }
michael@0 897
// Returns the fixed charset name "ISO-8859-1". Note that match() may report
// "windows-1252" to the CharsetMatch instead, depending on textIn->fC1Bytes.
michael@0 898 const char *CharsetRecog_8859_1::getName() const
michael@0 899 {
michael@0 900 return "ISO-8859-1";
michael@0 901 }
michael@0 902
michael@0 903
// Destructor for the ISO-8859-2 recognizer; the body is intentionally empty.
michael@0 904 CharsetRecog_8859_2::~CharsetRecog_8859_2()
michael@0 905 {
michael@0 906 // nothing to do
michael@0 907 }
michael@0 908
/*
 * Scores the input against every language table in ngrams_8859_2 and records
 * the best-scoring one in results.
 *
 * textIn  - input to examine; its fC1Bytes flag selects the reported charset
 *           name ("windows-1250" when set, "ISO-8859-2" otherwise).
 * results - updated through set() each time a table beats the best
 *           confidence seen so far.
 *
 * Returns whether the best confidence found is greater than zero.
 */
michael@0 909 UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
michael@0 910 const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
michael@0 911 uint32_t i;
// Starting at -1 means a table that scores 0 is still recorded when nothing
// better has been seen yet.
michael@0 912 int32_t bestConfidenceSoFar = -1;
michael@0 913 for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
michael@0 914 const int32_t *ngrams = ngrams_8859_2[i].ngrams;
michael@0 915 const char *lang = ngrams_8859_2[i].lang;
michael@0 916 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
// Strictly greater: on a tie the earlier table in the array is kept.
michael@0 917 if (confidence > bestConfidenceSoFar) {
michael@0 918 results->set(textIn, this, confidence, name, lang);
michael@0 919 bestConfidenceSoFar = confidence;
michael@0 920 }
michael@0 921 }
michael@0 922 return (bestConfidenceSoFar > 0);
michael@0 923 }
michael@0 924
// Returns the fixed charset name "ISO-8859-2". Note that match() may report
// "windows-1250" to the CharsetMatch instead, depending on textIn->fC1Bytes.
michael@0 925 const char *CharsetRecog_8859_2::getName() const
michael@0 926 {
michael@0 927 return "ISO-8859-2";
michael@0 928 }
michael@0 929
michael@0 930
// Destructor for the ISO-8859-5 recognizer; the body is intentionally empty.
michael@0 931 CharsetRecog_8859_5::~CharsetRecog_8859_5()
michael@0 932 {
michael@0 933 // nothing to do
michael@0 934 }
michael@0 935
// Returns the fixed charset name "ISO-8859-5".
michael@0 936 const char *CharsetRecog_8859_5::getName() const
michael@0 937 {
michael@0 938 return "ISO-8859-5";
michael@0 939 }
michael@0 940
// Destructor for the ISO-8859-5 "ru" recognizer; the body is intentionally
// empty.
michael@0 941 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
michael@0 942 {
michael@0 943 // nothing to do
michael@0 944 }
michael@0 945
// Returns the fixed language tag "ru".
michael@0 946 const char *CharsetRecog_8859_5_ru::getLanguage() const
michael@0 947 {
michael@0 948 return "ru";
michael@0 949 }
michael@0 950
/*
 * Scores textIn with match_sbcs() using ngrams_8859_5_ru and charMap_8859_5,
 * then stores the confidence in results.
 *
 * results is updated unconditionally, i.e. also when the confidence is not
 * positive. Only the three-argument set() is used, so no explicit charset
 * name or language is passed here.
 *
 * Returns whether the confidence is greater than zero.
 */
michael@0 951 UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
michael@0 952 {
michael@0 953 int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
michael@0 954 results->set(textIn, this, confidence);
michael@0 955 return (confidence > 0);
michael@0 956 }
michael@0 957
// Destructor for the ISO-8859-6 recognizer; the body is intentionally empty.
michael@0 958 CharsetRecog_8859_6::~CharsetRecog_8859_6()
michael@0 959 {
michael@0 960 // nothing to do
michael@0 961 }
michael@0 962
// Returns the fixed charset name "ISO-8859-6".
michael@0 963 const char *CharsetRecog_8859_6::getName() const
michael@0 964 {
michael@0 965 return "ISO-8859-6";
michael@0 966 }
michael@0 967
// Destructor for the ISO-8859-6 "ar" recognizer; the body is intentionally
// empty.
michael@0 968 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
michael@0 969 {
michael@0 970 // nothing to do
michael@0 971 }
michael@0 972
// Returns the fixed language tag "ar".
michael@0 973 const char *CharsetRecog_8859_6_ar::getLanguage() const
michael@0 974 {
michael@0 975 return "ar";
michael@0 976 }
michael@0 977
/*
 * Scores textIn with match_sbcs() using ngrams_8859_6_ar and charMap_8859_6,
 * then stores the confidence in results.
 *
 * results is updated unconditionally, i.e. also when the confidence is not
 * positive. Only the three-argument set() is used, so no explicit charset
 * name or language is passed here.
 *
 * Returns whether the confidence is greater than zero.
 */
michael@0 978 UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
michael@0 979 {
michael@0 980 int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
michael@0 981 results->set(textIn, this, confidence);
michael@0 982 return (confidence > 0);
michael@0 983 }
michael@0 984
// Destructor for the ISO-8859-7 recognizer; the body is intentionally empty.
michael@0 985 CharsetRecog_8859_7::~CharsetRecog_8859_7()
michael@0 986 {
michael@0 987 // nothing to do
michael@0 988 }
michael@0 989
// Returns the fixed charset name "ISO-8859-7".
michael@0 990 const char *CharsetRecog_8859_7::getName() const
michael@0 991 {
michael@0 992 return "ISO-8859-7";
michael@0 993 }
michael@0 994
// Destructor for the ISO-8859-7 "el" recognizer; the body is intentionally
// empty.
michael@0 995 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
michael@0 996 {
michael@0 997 // nothing to do
michael@0 998 }
michael@0 999
// Returns the fixed language tag "el".
michael@0 1000 const char *CharsetRecog_8859_7_el::getLanguage() const
michael@0 1001 {
michael@0 1002 return "el";
michael@0 1003 }
michael@0 1004
/*
 * Scores textIn with match_sbcs() using ngrams_8859_7_el and charMap_8859_7,
 * then stores the confidence in results together with the charset name and
 * the language "el".
 *
 * The reported name is "windows-1253" when textIn->fC1Bytes is set and
 * "ISO-8859-7" otherwise. results is updated unconditionally, i.e. also when
 * the confidence is not positive.
 *
 * Returns whether the confidence is greater than zero.
 */
michael@0 1005 UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
michael@0 1006 {
michael@0 1007 const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
michael@0 1008 int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
michael@0 1009 results->set(textIn, this, confidence, name, "el");
michael@0 1010 return (confidence > 0);
michael@0 1011 }
michael@0 1012
// Destructor for the ISO-8859-8 recognizer; the body is intentionally empty.
michael@0 1013 CharsetRecog_8859_8::~CharsetRecog_8859_8()
michael@0 1014 {
michael@0 1015 // nothing to do
michael@0 1016 }
michael@0 1017
// Returns the fixed charset name "ISO-8859-8".
michael@0 1018 const char *CharsetRecog_8859_8::getName() const
michael@0 1019 {
michael@0 1020 return "ISO-8859-8";
michael@0 1021 }
michael@0 1022
// Destructor for the ISO-8859-8-I "he" recognizer; the body is intentionally
// empty.
michael@0 1023 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
michael@0 1024 {
michael@0 1025 // nothing to do
michael@0 1026 }
michael@0 1027
// Returns the fixed charset name "ISO-8859-8-I". Note that match() may report
// "windows-1255" to the CharsetMatch instead, depending on textIn->fC1Bytes.
michael@0 1028 const char *CharsetRecog_8859_8_I_he::getName() const
michael@0 1029 {
michael@0 1030 return "ISO-8859-8-I";
michael@0 1031 }
michael@0 1032
// Returns the fixed language tag "he".
michael@0 1033 const char *CharsetRecog_8859_8_I_he::getLanguage() const
michael@0 1034 {
michael@0 1035 return "he";
michael@0 1036 }
michael@0 1037
/*
 * Scores textIn with match_sbcs() using ngrams_8859_8_I_he and
 * charMap_8859_8, then stores the confidence in results together with the
 * charset name and the language "he".
 *
 * The reported name is "windows-1255" when textIn->fC1Bytes is set and
 * "ISO-8859-8-I" otherwise. results is updated unconditionally, i.e. also
 * when the confidence is not positive.
 *
 * Returns whether the confidence is greater than zero.
 */
michael@0 1038 UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
michael@0 1039 {
michael@0 1040 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
michael@0 1041 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
michael@0 1042 results->set(textIn, this, confidence, name, "he");
michael@0 1043 return (confidence > 0);
michael@0 1044 }
michael@0 1045
// Destructor for the ISO-8859-8 "he" recognizer; the body is intentionally
// empty.
michael@0 1046 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
michael@0 1047 {
michael@0 1048 // od ot gnihton  (i.e. "nothing to do", deliberately written backwards)
michael@0 1049 }
michael@0 1050
// Returns the fixed language tag "he".
michael@0 1051 const char *CharsetRecog_8859_8_he::getLanguage() const
michael@0 1052 {
michael@0 1053 return "he";
michael@0 1054 }
michael@0 1055
/*
 * Scores textIn with match_sbcs() using ngrams_8859_8_he and charMap_8859_8,
 * then stores the confidence in results together with the charset name and
 * the language "he".
 *
 * The reported name is "windows-1255" when textIn->fC1Bytes is set and
 * "ISO-8859-8" otherwise. results is updated unconditionally, i.e. also when
 * the confidence is not positive.
 *
 * Returns whether the confidence is greater than zero.
 */
michael@0 1056 UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
michael@0 1057 {
michael@0 1058 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
michael@0 1059 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
michael@0 1060 results->set(textIn, this, confidence, name, "he");
michael@0 1061 return (confidence > 0);
michael@0 1062 }
michael@0 1063
// Destructor for the ISO-8859-9 recognizer; the body is intentionally empty.
michael@0 1064 CharsetRecog_8859_9::~CharsetRecog_8859_9()
michael@0 1065 {
michael@0 1066 // nothing to do
michael@0 1067 }
michael@0 1068
// Returns the fixed charset name "ISO-8859-9".
michael@0 1069 const char *CharsetRecog_8859_9::getName() const
michael@0 1070 {
michael@0 1071 return "ISO-8859-9";
michael@0 1072 }
michael@0 1073
// Destructor for the ISO-8859-9 "tr" recognizer; the body is intentionally
// empty.
michael@0 1074 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
michael@0 1075 {
michael@0 1076 // nothing to do
michael@0 1077 }
michael@0 1078
// Returns the fixed language tag "tr".
michael@0 1079 const char *CharsetRecog_8859_9_tr::getLanguage() const
michael@0 1080 {
michael@0 1081 return "tr";
michael@0 1082 }
michael@0 1083
/*
 * Scores textIn with match_sbcs() using ngrams_8859_9_tr and charMap_8859_9,
 * then stores the confidence in results together with the charset name and
 * the language "tr".
 *
 * The reported name is "windows-1254" when textIn->fC1Bytes is set and
 * "ISO-8859-9" otherwise. results is updated unconditionally, i.e. also when
 * the confidence is not positive.
 *
 * Returns whether the confidence is greater than zero.
 */
michael@0 1084 UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
michael@0 1085 {
michael@0 1086 const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
michael@0 1087 int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
michael@0 1088 results->set(textIn, this, confidence, name, "tr");
michael@0 1089 return (confidence > 0);
michael@0 1090 }
michael@0 1091
michael@0 1092 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
michael@0 1093 {
michael@0 1094 // nothing to do
michael@0 1095 }
michael@0 1096
michael@0 1097 const char *CharsetRecog_windows_1256::getName() const
michael@0 1098 {
michael@0 1099 return "windows-1256";
michael@0 1100 }
michael@0 1101
michael@0 1102 const char *CharsetRecog_windows_1256::getLanguage() const
michael@0 1103 {
michael@0 1104 return "ar";
michael@0 1105 }
michael@0 1106
michael@0 1107 UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
michael@0 1108 {
michael@0 1109 int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
michael@0 1110 results->set(textIn, this, confidence);
michael@0 1111 return (confidence > 0);
michael@0 1112 }
michael@0 1113
michael@0 1114 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
michael@0 1115 {
michael@0 1116 // nothing to do
michael@0 1117 }
michael@0 1118
michael@0 1119 const char *CharsetRecog_windows_1251::getName() const
michael@0 1120 {
michael@0 1121 return "windows-1251";
michael@0 1122 }
michael@0 1123
michael@0 1124 const char *CharsetRecog_windows_1251::getLanguage() const
michael@0 1125 {
michael@0 1126 return "ru";
michael@0 1127 }
michael@0 1128
michael@0 1129 UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
michael@0 1130 {
michael@0 1131 int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
michael@0 1132 results->set(textIn, this, confidence);
michael@0 1133 return (confidence > 0);
michael@0 1134 }
michael@0 1135
michael@0 1136 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
michael@0 1137 {
michael@0 1138 // nothing to do
michael@0 1139 }
michael@0 1140
michael@0 1141 const char *CharsetRecog_KOI8_R::getName() const
michael@0 1142 {
michael@0 1143 return "KOI8-R";
michael@0 1144 }
michael@0 1145
michael@0 1146 const char *CharsetRecog_KOI8_R::getLanguage() const
michael@0 1147 {
michael@0 1148 return "ru";
michael@0 1149 }
michael@0 1150
michael@0 1151 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
michael@0 1152 {
michael@0 1153 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
michael@0 1154 results->set(textIn, this, confidence);
michael@0 1155 return (confidence > 0);
michael@0 1156 }
michael@0 1157
michael@0 1158 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
michael@0 1159 {
michael@0 1160 // nothing to do
michael@0 1161 }
michael@0 1162
michael@0 1163 const char *CharsetRecog_IBM424_he::getLanguage() const
michael@0 1164 {
michael@0 1165 return "he";
michael@0 1166 }
michael@0 1167
michael@0 1168 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
michael@0 1169 {
michael@0 1170 // nothing to do
michael@0 1171 }
michael@0 1172
michael@0 1173 const char *CharsetRecog_IBM424_he_rtl::getName() const
michael@0 1174 {
michael@0 1175 return "IBM424_rtl";
michael@0 1176 }
michael@0 1177
michael@0 1178 UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
michael@0 1179 {
michael@0 1180 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
michael@0 1181 results->set(textIn, this, confidence);
michael@0 1182 return (confidence > 0);
michael@0 1183 }
michael@0 1184
michael@0 1185 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
michael@0 1186 {
michael@0 1187 // nothing to do
michael@0 1188 }
michael@0 1189
michael@0 1190 const char *CharsetRecog_IBM424_he_ltr::getName() const
michael@0 1191 {
michael@0 1192 return "IBM424_ltr";
michael@0 1193 }
michael@0 1194
michael@0 1195 UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
michael@0 1196 {
michael@0 1197 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
michael@0 1198 results->set(textIn, this, confidence);
michael@0 1199 return (confidence > 0);
michael@0 1200 }
michael@0 1201
michael@0 1202 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
michael@0 1203 {
michael@0 1204 // nothing to do
michael@0 1205 }
michael@0 1206
michael@0 1207 const char *CharsetRecog_IBM420_ar::getLanguage() const
michael@0 1208 {
michael@0 1209 return "ar";
michael@0 1210 }
michael@0 1211
michael@0 1212
michael@0 1213 int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
michael@0 1214 {
michael@0 1215 NGramParser_IBM420 parser(ngrams, byteMap);
michael@0 1216 int32_t result;
michael@0 1217
michael@0 1218 result = parser.parse(det);
michael@0 1219
michael@0 1220 return result;
michael@0 1221 }
michael@0 1222
michael@0 1223 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
michael@0 1224 {
michael@0 1225 // nothing to do
michael@0 1226 }
michael@0 1227
michael@0 1228 const char *CharsetRecog_IBM420_ar_rtl::getName() const
michael@0 1229 {
michael@0 1230 return "IBM420_rtl";
michael@0 1231 }
michael@0 1232
michael@0 1233 UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
michael@0 1234 {
michael@0 1235 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
michael@0 1236 results->set(textIn, this, confidence);
michael@0 1237 return (confidence > 0);
michael@0 1238 }
michael@0 1239
michael@0 1240 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
michael@0 1241 {
michael@0 1242 // nothing to do
michael@0 1243 }
michael@0 1244
michael@0 1245 const char *CharsetRecog_IBM420_ar_ltr::getName() const
michael@0 1246 {
michael@0 1247 return "IBM420_ltr";
michael@0 1248 }
michael@0 1249
michael@0 1250 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
michael@0 1251 {
michael@0 1252 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
michael@0 1253 results->set(textIn, this, confidence);
michael@0 1254 return (confidence > 0);
michael@0 1255 }
michael@0 1256
michael@0 1257 U_NAMESPACE_END
michael@0 1258 #endif
michael@0 1259

mercurial