intl/uconv/util/uscan.c

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5 #include "unicpriv.h"
/*
 * CHK_GR94(b): non-zero when b, truncated to 8 bits, lies in [0xa1,0xfe].
 * CHK_GR94_2Byte(b1,b2): non-zero when both bytes pass CHK_GR94().
 * An argument may be evaluated more than once, so pass expressions
 * without side effects.
 */
#define CHK_GR94(b) ( (uint8_t) (b) >= (uint8_t) 0xa1 && (uint8_t) (b) <= (uint8_t) 0xfe )
#define CHK_GR94_2Byte(b1,b2) (CHK_GR94(b1) && CHK_GR94(b2))
michael@0 8 /*=================================================================================
michael@0 9
michael@0 10 =================================================================================*/
michael@0 11 typedef int (*uSubScannerFunc) (unsigned char* in, uint16_t* out);
michael@0 12 /*=================================================================================
michael@0 13
michael@0 14 =================================================================================*/
michael@0 15
michael@0 16 typedef int (*uScannerFunc) (
michael@0 17 int32_t* state,
michael@0 18 unsigned char *in,
michael@0 19 uint16_t *out,
michael@0 20 uint32_t inbuflen,
michael@0 21 uint32_t* inscanlen
michael@0 22 );
michael@0 23
michael@0 24 int uScan(
michael@0 25 uScanClassID scanClass,
michael@0 26 int32_t* state,
michael@0 27 unsigned char *in,
michael@0 28 uint16_t *out,
michael@0 29 uint32_t inbuflen,
michael@0 30 uint32_t* inscanlen
michael@0 31 );
michael@0 32
michael@0 33 #define uSubScanner(sub,in,out) (* m_subscanner[sub])((in),(out))
michael@0 34
michael@0 35 int uCheckAndScanAlways1Byte(
michael@0 36 int32_t* state,
michael@0 37 unsigned char *in,
michael@0 38 uint16_t *out,
michael@0 39 uint32_t inbuflen,
michael@0 40 uint32_t* inscanlen
michael@0 41 );
michael@0 42 int uCheckAndScanAlways2Byte(
michael@0 43 int32_t* state,
michael@0 44 unsigned char *in,
michael@0 45 uint16_t *out,
michael@0 46 uint32_t inbuflen,
michael@0 47 uint32_t* inscanlen
michael@0 48 );
michael@0 49 int uCheckAndScanAlways2ByteShiftGR(
michael@0 50 int32_t* state,
michael@0 51 unsigned char *in,
michael@0 52 uint16_t *out,
michael@0 53 uint32_t inbuflen,
michael@0 54 uint32_t* inscanlen
michael@0 55 );
michael@0 56 int uCheckAndScanAlways2ByteGR128(
michael@0 57 int32_t* state,
michael@0 58 unsigned char *in,
michael@0 59 uint16_t *out,
michael@0 60 uint32_t inbuflen,
michael@0 61 uint32_t* inscanlen
michael@0 62 );
michael@0 63 int uScanShift(
michael@0 64 uShiftInTable *shift,
michael@0 65 int32_t* state,
michael@0 66 unsigned char *in,
michael@0 67 uint16_t *out,
michael@0 68 uint32_t inbuflen,
michael@0 69 uint32_t* inscanlen
michael@0 70 );
michael@0 71
michael@0 72 int uCheckAndScan2ByteGRPrefix8F(
michael@0 73 int32_t* state,
michael@0 74 unsigned char *in,
michael@0 75 uint16_t *out,
michael@0 76 uint32_t inbuflen,
michael@0 77 uint32_t* inscanlen
michael@0 78 );
michael@0 79 int uCheckAndScan2ByteGRPrefix8EA2(
michael@0 80 int32_t* state,
michael@0 81 unsigned char *in,
michael@0 82 uint16_t *out,
michael@0 83 uint32_t inbuflen,
michael@0 84 uint32_t* inscanlen
michael@0 85 );
michael@0 86 int uCheckAndScan2ByteGRPrefix8EA3(
michael@0 87 int32_t* state,
michael@0 88 unsigned char *in,
michael@0 89 uint16_t *out,
michael@0 90 uint32_t inbuflen,
michael@0 91 uint32_t* inscanlen
michael@0 92 );
michael@0 93 int uCheckAndScan2ByteGRPrefix8EA4(
michael@0 94 int32_t* state,
michael@0 95 unsigned char *in,
michael@0 96 uint16_t *out,
michael@0 97 uint32_t inbuflen,
michael@0 98 uint32_t* inscanlen
michael@0 99 );
michael@0 100 int uCheckAndScan2ByteGRPrefix8EA5(
michael@0 101 int32_t* state,
michael@0 102 unsigned char *in,
michael@0 103 uint16_t *out,
michael@0 104 uint32_t inbuflen,
michael@0 105 uint32_t* inscanlen
michael@0 106 );
michael@0 107 int uCheckAndScan2ByteGRPrefix8EA6(
michael@0 108 int32_t* state,
michael@0 109 unsigned char *in,
michael@0 110 uint16_t *out,
michael@0 111 uint32_t inbuflen,
michael@0 112 uint32_t* inscanlen
michael@0 113 );
michael@0 114 int uCheckAndScan2ByteGRPrefix8EA7(
michael@0 115 int32_t* state,
michael@0 116 unsigned char *in,
michael@0 117 uint16_t *out,
michael@0 118 uint32_t inbuflen,
michael@0 119 uint32_t* inscanlen
michael@0 120 );
michael@0 121 int uCnSAlways8BytesDecomposedHangul(
michael@0 122 int32_t* state,
michael@0 123 unsigned char *in,
michael@0 124 uint16_t *out,
michael@0 125 uint32_t inbuflen,
michael@0 126 uint32_t* inscanlen
michael@0 127 );
michael@0 128 int uCheckAndScanJohabHangul(
michael@0 129 int32_t* state,
michael@0 130 unsigned char *in,
michael@0 131 uint16_t *out,
michael@0 132 uint32_t inbuflen,
michael@0 133 uint32_t* inscanlen
michael@0 134 );
michael@0 135 int uCheckAndScanJohabSymbol(
michael@0 136 int32_t* state,
michael@0 137 unsigned char *in,
michael@0 138 uint16_t *out,
michael@0 139 uint32_t inbuflen,
michael@0 140 uint32_t* inscanlen
michael@0 141 );
michael@0 142
michael@0 143 int uCheckAndScan4BytesGB18030(
michael@0 144 int32_t* state,
michael@0 145 unsigned char *in,
michael@0 146 uint16_t *out,
michael@0 147 uint32_t inbuflen,
michael@0 148 uint32_t* inscanlen
michael@0 149 );
michael@0 150
michael@0 151 int uScanAlways2Byte(
michael@0 152 unsigned char* in,
michael@0 153 uint16_t* out
michael@0 154 );
michael@0 155 int uScanAlways2ByteShiftGR(
michael@0 156 unsigned char* in,
michael@0 157 uint16_t* out
michael@0 158 );
michael@0 159 int uScanAlways1Byte(
michael@0 160 unsigned char* in,
michael@0 161 uint16_t* out
michael@0 162 );
michael@0 163 int uScanAlways1BytePrefix8E(
michael@0 164 unsigned char* in,
michael@0 165 uint16_t* out
michael@0 166 );
michael@0 167 /*=================================================================================
michael@0 168
michael@0 169 =================================================================================*/
michael@0 170 const uScannerFunc m_scanner[uNumOfCharsetType] =
michael@0 171 {
michael@0 172 uCheckAndScanAlways1Byte,
michael@0 173 uCheckAndScanAlways2Byte,
michael@0 174 uCheckAndScanAlways2ByteShiftGR,
michael@0 175 uCheckAndScan2ByteGRPrefix8F,
michael@0 176 uCheckAndScan2ByteGRPrefix8EA2,
michael@0 177 uCheckAndScan2ByteGRPrefix8EA3,
michael@0 178 uCheckAndScan2ByteGRPrefix8EA4,
michael@0 179 uCheckAndScan2ByteGRPrefix8EA5,
michael@0 180 uCheckAndScan2ByteGRPrefix8EA6,
michael@0 181 uCheckAndScan2ByteGRPrefix8EA7,
michael@0 182 uCnSAlways8BytesDecomposedHangul,
michael@0 183 uCheckAndScanJohabHangul,
michael@0 184 uCheckAndScanJohabSymbol,
michael@0 185 uCheckAndScan4BytesGB18030,
michael@0 186 uCheckAndScanAlways2ByteGR128
michael@0 187 };
michael@0 188
michael@0 189 /*=================================================================================
michael@0 190
michael@0 191 =================================================================================*/
michael@0 192
michael@0 193 const uSubScannerFunc m_subscanner[uNumOfCharType] =
michael@0 194 {
michael@0 195 uScanAlways1Byte,
michael@0 196 uScanAlways2Byte,
michael@0 197 uScanAlways2ByteShiftGR,
michael@0 198 uScanAlways1BytePrefix8E
michael@0 199 };
michael@0 200 /*=================================================================================
michael@0 201
michael@0 202 =================================================================================*/
michael@0 203 int uScan(
michael@0 204 uScanClassID scanClass,
michael@0 205 int32_t* state,
michael@0 206 unsigned char *in,
michael@0 207 uint16_t *out,
michael@0 208 uint32_t inbuflen,
michael@0 209 uint32_t* inscanlen
michael@0 210 )
michael@0 211 {
michael@0 212 return (* m_scanner[scanClass]) (state,in,out,inbuflen,inscanlen);
michael@0 213 }
michael@0 214 /*=================================================================================
michael@0 215
michael@0 216 =================================================================================*/
/*
 * Sub-scanner for one-byte characters: stores in[0], widened to 16 bits,
 * in *out.  Always returns 1.
 */
int uScanAlways1Byte(
  unsigned char* in,
  uint16_t* out
)
{
  const unsigned char only = in[0];

  *out = only;
  return 1;
}
michael@0 225
michael@0 226 /*=================================================================================
michael@0 227
michael@0 228 =================================================================================*/
/*
 * Sub-scanner for two-byte characters: packs in[0] (high byte) and in[1]
 * (low byte) into *out.  Always returns 1.
 */
int uScanAlways2Byte(
  unsigned char* in,
  uint16_t* out
)
{
  const unsigned int lead  = in[0];
  const unsigned int trail = in[1];

  *out = (uint16_t) ((lead << 8) | trail);
  return 1;
}
michael@0 237 /*=================================================================================
michael@0 238
michael@0 239 =================================================================================*/
/*
 * Sub-scanner for two-byte characters with the high bit of each byte
 * cleared: packs (in[0] & 0x7F) as high byte and (in[1] & 0x7F) as low
 * byte into *out.  Always returns 1.
 */
int uScanAlways2ByteShiftGR(
  unsigned char* in,
  uint16_t* out
)
{
  const unsigned int lead  = in[0] & 0x7F;
  const unsigned int trail = in[1] & 0x7F;

  *out = (uint16_t) ((lead << 8) | trail);
  return 1;
}
michael@0 248
michael@0 249 /*=================================================================================
michael@0 250
michael@0 251 =================================================================================*/
/*
 * Sub-scanner for a one-byte character that follows a prefix byte: in[0]
 * is skipped without being inspected and in[1] is stored in *out.
 * Always returns 1.
 */
int uScanAlways1BytePrefix8E(
  unsigned char* in,
  uint16_t* out
)
{
  const unsigned char payload = in[1];

  *out = payload;
  return 1;
}
michael@0 260 /*=================================================================================
michael@0 261
michael@0 262 =================================================================================*/
/*
 * Scanner for one-byte characters: copies in[0] to *out and reports one
 * byte consumed.  Always returns 1.
 *
 * `inbuflen` is deliberately not checked; the caller must guarantee that at
 * least one byte is available.  `state` is unused.
 */
int uCheckAndScanAlways1Byte(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  const unsigned char only = in[0];

  *inscanlen = 1;
  *out = only;
  return 1;
}
michael@0 277
michael@0 278 /*=================================================================================
michael@0 279
michael@0 280 =================================================================================*/
/*
 * Scanner for two-byte characters without any range validation.
 *
 * Returns 0 (leaving *out and *inscanlen untouched) when fewer than two
 * bytes are available; otherwise packs in[0]:in[1] into *out, sets
 * *inscanlen to 2 and returns 1.  `state` is unused.
 */
int uCheckAndScanAlways2Byte(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  if (inbuflen < 2)
    return 0;

  *inscanlen = 2;
  *out = (uint16_t) ((in[0] << 8) | in[1]);
  return 1;
}
michael@0 298 /*=================================================================================
michael@0 299
michael@0 300 =================================================================================*/
/*
 * Scanner for two-byte characters of a 94x94 set invoked on GR.
 *
 * Both bytes are expected to be in [0xa1,0xfe].  Only the second byte is
 * validated here; per the original note, the first byte is checked by the
 * caller (nsUnicodeDecoderHelper.cpp) before this function is reached.
 *
 * Returns 0 when fewer than two bytes are available (which leads to
 * NS_OK_UDEC_MOREINPUT).  Otherwise consumes two bytes and returns 1, with
 * *out set to the packed value masked with 0x7F7F, or to 0xFF when the
 * second byte is out of range (for a 2-byte table uMap() is guaranteed to
 * fail for 0xFF).  `state` is unused.
 */
int uCheckAndScanAlways2ByteShiftGR(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  if (inbuflen < 2)
    return 0;

  *inscanlen = 2;
  if (CHK_GR94(in[1]))
    *out = (uint16_t) (((in[0] << 8) | in[1]) & 0x7F7F);
  else
    *out = 0xFF;
  return 1;
}
michael@0 330 /*=================================================================================
michael@0 331
michael@0 332 =================================================================================*/
/*
 * Scanner for two-byte characters whose first byte is in [0xa1,0xfe] and
 * whose second byte is in [0x41,0xfe].  Used by the CP949 -> Unicode
 * converter.
 *
 * Only the second byte is validated here; per the original note, the first
 * byte is checked by the caller (nsUnicodeDecoderHelper.cpp).
 *
 * Returns 0 when fewer than two bytes are available (which leads to
 * NS_OK_UDEC_MOREINPUT).  Otherwise consumes two bytes and returns 1, with
 * *out set to in[0]:in[1], or to 0xFF when the second byte is out of range
 * (for a 2-byte table uMap() is guaranteed to fail for 0xFF).
 * `state` is unused.
 */
int uCheckAndScanAlways2ByteGR128(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  if (inbuflen < 2)
    return 0;

  *inscanlen = 2;
  /*
   * Enforce both ends of the documented trail-byte range; previously only
   * the lower bound was tested, so a trail byte of 0xff was packed and
   * handed on as if it were valid.
   */
  if ((in[1] < 0x41) || (in[1] > 0xfe))
    *out = 0xFF;
  else
    *out = (uint16_t) ((in[0] << 8) | in[1]);
  return 1;
}
michael@0 363 /*=================================================================================
michael@0 364
michael@0 365 =================================================================================*/
michael@0 366 int uScanShift(
michael@0 367 uShiftInTable *shift,
michael@0 368 int32_t* state,
michael@0 369 unsigned char *in,
michael@0 370 uint16_t *out,
michael@0 371 uint32_t inbuflen,
michael@0 372 uint32_t* inscanlen
michael@0 373 )
michael@0 374 {
michael@0 375 int16_t i;
michael@0 376 const uShiftInCell* cell = &(shift->shiftcell[0]);
michael@0 377 int16_t itemnum = shift->numOfItem;
michael@0 378 for(i=0;i<itemnum;i++)
michael@0 379 {
michael@0 380 if( ( in[0] >= cell[i].shiftin_Min) &&
michael@0 381 ( in[0] <= cell[i].shiftin_Max))
michael@0 382 {
michael@0 383 if(inbuflen < cell[i].reserveLen)
michael@0 384 return 0;
michael@0 385 else
michael@0 386 {
michael@0 387 *inscanlen = cell[i].reserveLen;
michael@0 388 return (uSubScanner(cell[i].classID,in,out));
michael@0 389 }
michael@0 390 }
michael@0 391 }
michael@0 392 return 0;
michael@0 393 }
michael@0 394 /*=================================================================================
michael@0 395
michael@0 396 =================================================================================*/
/*
 * Scanner for a three-byte sequence: prefix 0x8F followed by two bytes in
 * [0xa1,0xfe].
 *
 * Returns 0 when fewer than three bytes are available or in[0] is not 0x8F.
 * Otherwise returns 1 with:
 *   - 2nd byte out of range: *inscanlen = 2, *out = 0xFF
 *   - 3rd byte out of range: *inscanlen = 3, *out = 0xFF
 *   - both in range:         *inscanlen = 3, *out = in[1]:in[2] & 0x7F7F
 * (for a 2-byte table uMap() is guaranteed to fail for 0xFF).
 * `state` is unused.
 */
int uCheckAndScan2ByteGRPrefix8F(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  if ((inbuflen < 3) || (in[0] != 0x8F))
    return 0;

  if (! CHK_GR94(in[1]))
  {
    *inscanlen = 2;
    *out = 0xFF;
    return 1;
  }

  *inscanlen = 3;
  if (CHK_GR94(in[2]))
    *out = (uint16_t) (((in[1] << 8) | in[2]) & 0x7F7F);
  else
    *out = 0xFF;
  return 1;
}
michael@0 426 /*=================================================================================
michael@0 427
michael@0 428 =================================================================================*/
michael@0 429
/*
 * Shared implementation of uCheckAndScan2ByteGRPrefix8EAX(), X = 2..7.
 *
 * Scans a four-byte sequence: 0x8E, `prefix`, then two bytes in
 * [0xa1,0xfe].  This replaces the former CNS_8EAX_4BYTE statement macro,
 * which pasted the same body (including its return statements) into each
 * of the six functions below.
 *
 * Returns 0 when fewer than four bytes are available or in[0] is not 0x8E.
 * Otherwise returns 1 with:
 *   - in[1] != prefix:       *inscanlen = 2, *out = 0xFF
 *   - 3rd byte out of range: *inscanlen = 3, *out = 0xFF
 *   - 4th byte out of range: *inscanlen = 4, *out = 0xFF
 *   - all valid:             *inscanlen = 4, *out = in[2]:in[3] & 0x7F7F
 */
static int uCheckAndScan2ByteGRPrefix8EAX(
  unsigned char prefix,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  if ((inbuflen < 4) || (in[0] != 0x8E))
    return 0;

  if (in[1] != prefix)
  {
    *inscanlen = 2;
    *out = 0xFF;
    return 1;
  }

  if (! CHK_GR94(in[2]))
  {
    *inscanlen = 3;
    *out = 0xFF;
    return 1;
  }

  *inscanlen = 4;
  if (CHK_GR94(in[3]))
    *out = (uint16_t) (((in[2] << 8) | in[3]) & 0x7F7F);
  else
    *out = 0xFF;
  return 1;
}

/* Scanner for sequences prefixed with 0x8E 0xA2; `state` is unused. */
int uCheckAndScan2ByteGRPrefix8EA2(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  return uCheckAndScan2ByteGRPrefix8EAX(0xA2, in, out, inbuflen, inscanlen);
}

/* Scanner for sequences prefixed with 0x8E 0xA3; `state` is unused. */
int uCheckAndScan2ByteGRPrefix8EA3(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  return uCheckAndScan2ByteGRPrefix8EAX(0xA3, in, out, inbuflen, inscanlen);
}

/* Scanner for sequences prefixed with 0x8E 0xA4; `state` is unused. */
int uCheckAndScan2ByteGRPrefix8EA4(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  return uCheckAndScan2ByteGRPrefix8EAX(0xA4, in, out, inbuflen, inscanlen);
}

/* Scanner for sequences prefixed with 0x8E 0xA5; `state` is unused. */
int uCheckAndScan2ByteGRPrefix8EA5(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  return uCheckAndScan2ByteGRPrefix8EAX(0xA5, in, out, inbuflen, inscanlen);
}

/* Scanner for sequences prefixed with 0x8E 0xA6; `state` is unused. */
int uCheckAndScan2ByteGRPrefix8EA6(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  return uCheckAndScan2ByteGRPrefix8EAX(0xA6, in, out, inbuflen, inscanlen);
}

/* Scanner for sequences prefixed with 0x8E 0xA7; `state` is unused. */
int uCheckAndScan2ByteGRPrefix8EA7(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  return uCheckAndScan2ByteGRPrefix8EAX(0xA7, in, out, inbuflen, inscanlen);
}
michael@0 537 /*=================================================================================
michael@0 538
michael@0 539 =================================================================================*/
/* Constants of the Hangul syllable composition formula used below. */
#define SBase 0xAC00
#define SCount 11172
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)

/*
 * Scanner for an eight-byte decomposed Hangul sequence.
 *
 * Expected layout: a4 d4  a4 <L>  a4 <V>  a4 <T>, where
 *   <L> (leading consonant)  is in [0xa1,0xbe] and mapped through lMap,
 *   <V> (medial vowel)       is in [0xbf,0xd3],
 *   <T> (trailing consonant) is 0xd4 (none) or in [0xa1,0xbe] and mapped
 *                            through tMap.
 *
 * Returns 0 (without touching *out or *inscanlen) when fewer than eight
 * bytes are available or any byte violates the layout.  Otherwise stores
 * the composed syllable in *out, sets *inscanlen to 8 and returns 1.
 * `state` is unused.
 */
int uCnSAlways8BytesDecomposedHangul(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  /* Leading-consonant index by (byte - 0xa1); 0xff marks an illegal byte. */
  static const uint8_t lMap[] = {
    /*  A1   A2   A3   A4   A5   A6   A7  */
          0,   1,0xff,   2,0xff,0xff,   3,
    /*  A8   A9   AA   AB   AC   AD   AE   AF  */
          4,   5,0xff,0xff,0xff,0xff,0xff,0xff,
    /*  B0   B1   B2   B3   B4   B5   B6   B7  */
       0xff,   6,   7,   8,0xff,   9,  10,  11,
    /*  B8   B9   BA   BB   BC   BD   BE       */
         12,  13,  14,  15,  16,  17,  18
  };
  /* Trailing-consonant index by (byte - 0xa1); 0xff marks an illegal byte. */
  static const uint8_t tMap[] = {
    /*  A1   A2   A3   A4   A5   A6   A7  */
          1,   2,   3,   4,   5,   6,   7,
    /*  A8   A9   AA   AB   AC   AD   AE   AF  */
       0xff,   8,   9,  10,  11,  12,  13,  14,
    /*  B0   B1   B2   B3   B4   B5   B6   B7  */
         15,  16,  17,0xff,  18,  19,  20,  21,
    /*  B8   B9   BA   BB   BC   BD   BE       */
         22,0xff,  23,  24,  25,  26,  27
  };
  uint16_t LIndex, VIndex, TIndex;

  /* Length first, so no byte beyond the buffer is inspected. */
  if (inbuflen < 8)
    return 0;

  /* Fixed bytes: a4 d4 header, then a4 before each of the three jamo. */
  if ((in[0] != 0xa4) || (in[1] != 0xd4) ||
      (in[2] != 0xa4) || (in[4] != 0xa4) || (in[6] != 0xa4))
    return 0;

  /* Leading consonant. */
  if ((in[3] < 0xa1) || (in[3] > 0xbe))
    return 0;
  LIndex = lMap[in[3] - 0xa1];
  if (LIndex == 0xff)
    return 0;

  /* Medial vowel. */
  if ((in[5] < 0xbf) || (in[5] > 0xd3))
    return 0;
  VIndex = in[5] - 0xbf;

  /* Trailing consonant; 0xd4 means there is none. */
  if (in[7] == 0xd4)
  {
    TIndex = 0;
  }
  else
  {
    if ((in[7] < 0xa1) || (in[7] > 0xbe))
      return 0;
    TIndex = tMap[in[7] - 0xa1];
    if (TIndex == 0xff)
      return 0;
  }

  *inscanlen = 8;
  /* the following line is from Unicode 2.0 page 3-13 item 5 */
  *out = ( LIndex * VCount + VIndex) * TCount + TIndex + SBase;

  return 1;
}
michael@0 621 /*=================================================================================
michael@0 622
michael@0 623 =================================================================================*/
michael@0 624
michael@0 625 int uCheckAndScanJohabHangul(
michael@0 626 int32_t* state,
michael@0 627 unsigned char *in,
michael@0 628 uint16_t *out,
michael@0 629 uint32_t inbuflen,
michael@0 630 uint32_t* inscanlen
michael@0 631 )
michael@0 632 {
michael@0 633 /* since we don't have code to convert Johab to Unicode right now *
michael@0 634 * make this part of code #if 0 to save space until we fully test it */
michael@0 635 if(inbuflen < 2)
michael@0 636 return 0;
michael@0 637 else {
michael@0 638 /*
michael@0 639 * See Table 4-45 Johab Encoding's Five-Bit Binary Patterns in page 183
michael@0 640 * of "CJKV Information Processing" for details
michael@0 641 */
michael@0 642 static const uint8_t lMap[32]={ /* totaly 19 */
michael@0 643 0xff,0xff,0, 1, 2, 3, 4, 5, /* 0-7 */
michael@0 644 6, 7, 8, 9, 10, 11, 12, 13, /* 8-15 */
michael@0 645 14, 15, 16, 17, 18, 0xff,0xff,0xff, /* 16-23 */
michael@0 646 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff /* 24-31 */
michael@0 647 };
michael@0 648 static const uint8_t vMap[32]={ /* totaly 21 */
michael@0 649 0xff,0xff,0xff,0, 1, 2, 3, 4, /* 0-7 */
michael@0 650 0xff,0xff,5, 6, 7, 8, 9, 10, /* 8-15 */
michael@0 651 0xff,0xff,11, 12, 13, 14, 15, 16, /* 16-23 */
michael@0 652 0xff,0xff,17, 18, 19, 20, 0xff,0xff /* 24-31 */
michael@0 653 };
michael@0 654 static const uint8_t tMap[32]={ /* totaly 29 */
michael@0 655 0xff,0, 1, 2, 3, 4, 5, 6, /* 0-7 */
michael@0 656 7, 8, 9, 10, 11, 12, 13, 14, /* 8-15 */
michael@0 657 15, 16, 0xff,17, 18, 19, 20, 21, /* 16-23 */
michael@0 658 22, 23, 24, 25, 26, 27, 0xff,0xff /* 24-31 */
michael@0 659 };
michael@0 660 uint16_t ch = (in[0] << 8) | in[1];
michael@0 661 uint16_t LIndex, VIndex, TIndex;
michael@0 662 if(0 == (0x8000 & ch))
michael@0 663 return 0;
michael@0 664 LIndex=lMap[(ch>>10)& 0x1F];
michael@0 665 VIndex=vMap[(ch>>5) & 0x1F];
michael@0 666 TIndex=tMap[(ch>>0) & 0x1F];
michael@0 667 if((0xff==(LIndex)) ||
michael@0 668 (0xff==(VIndex)) ||
michael@0 669 (0xff==(TIndex)))
michael@0 670 return 0;
michael@0 671 /* the following line is from Unicode 2.0 page 3-13 item 5 */
michael@0 672 *out = ( LIndex * VCount + VIndex) * TCount + TIndex + SBase;
michael@0 673 *inscanlen = 2;
michael@0 674 return 1;
michael@0 675 }
michael@0 676 }
/*
 * Scanner for a two-byte Johab symbol: converts the (hi, lo) byte pair to
 * a 16-bit row/column value, row in the high byte and column in the low
 * byte.
 *
 * The arithmetic is based on the Perl code listed under
 * "Johab to ISO-2022-KR or EUC-KR Conversion" on page 1014 of
 * "CJKV Information Processing" by Ken Lunde <lunde@adobe.com>:
 *
 * sub johab2ks ($) { # Convert Johab to ISO-2022-KR
 *   my @johab = unpack("C*", $_[0]);
 *   my ($offset, $d8_off) = (0,0);
 *   my @out = ();
 *   while(($hi, $lo) = splice($johab, 0, 2)) {
 *     $offset = 1 if ($hi > 223 and $hi < 250);
 *     $d8_off = ($hi == 216 and ($lo > 160 ? 94 : 42));
 *     push (@out, (((($hi - ($hi < 223 ? 200 : 187)) << 1) -
 *            ($lo < 161 ? 1 : 0) + $offset) + $d8_off),
 *            $lo - ($lo < 161 ? ($lo > 126 ? 34 : 16) : 128 ));
 *   }
 *   return pack ("C*", @out);
 * }
 *
 * Additional comments from Ken Lunde:
 *   $d8_off = ($hi == 216 and ($lo > 160 ? 94 : 42));
 * has three possible return values:
 *   0  if $hi is not equal to 216
 *   94 if $hi is equal to 216 and $lo is greater than 160
 *   42 if $hi is equal to 216 and $lo is not greater than 160
 *
 * Returns 0 when fewer than two bytes are available; otherwise stores the
 * converted value in *out, sets *inscanlen to 2 and returns 1.  The bytes
 * are not range-checked here.  `state` is unused.
 */
int uCheckAndScanJohabSymbol(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  unsigned char hi, lo;
  int offset, d8_off, row, col;

  if (inbuflen < 2)
    return 0;

  hi = in[0];
  lo = in[1];

  offset = ((hi > 223) && (hi < 250)) ? 1 : 0;
  d8_off = 0;
  if (216 == hi)
    d8_off = (lo > 160) ? 94 : 42;

  /*
   * (hi - 200) is negative for hi < 200, and left-shifting a negative int
   * is undefined behaviour.  Double by multiplication and do the final
   * shift on an unsigned value instead; for every input this yields the
   * same 16 bits the shift-based expression produced on two's-complement
   * targets.
   */
  row = (hi - ((hi < 223) ? 200 : 187)) * 2
        - ((lo < 161) ? 1 : 0) + offset + d8_off;
  col = lo - ((lo < 161) ? ((lo > 126) ? 34 : 16) : 128);

  *out = (uint16_t) ((((unsigned int) row) << 8) | (unsigned int) col);
  *inscanlen = 2;
  return 1;
}
/*
 * Scanner for a four-byte GB18030 sequence.
 *
 * Bytes 1 and 3 must be in [0x81,0xfe] and bytes 2 and 4 in [0x30,0x39].
 * The four bytes are folded into a linear index
 *   (((b1 - 0x81) * 10 + (b2 - 0x30)) * 126 + (b3 - 0x81)) * 10 + (b4 - 0x30)
 *
 * Returns 0 when fewer than four bytes are available or any byte is out of
 * range.  Otherwise sets *inscanlen to 4, stores the index in *out (or
 * 0xFFFD when the index does not fit in 16 bits) and returns 1.
 * `state` is unused.
 */
int uCheckAndScan4BytesGB18030(
  int32_t* state,
  unsigned char *in,
  uint16_t *out,
  uint32_t inbuflen,
  uint32_t* inscanlen
)
{
  uint32_t data;

  if (inbuflen < 4)
    return 0;

  if ((in[0] < 0x81) || (in[0] > 0xfe) ||
      (in[1] < 0x30) || (in[1] > 0x39) ||
      (in[2] < 0x81) || (in[2] > 0xfe) ||
      (in[3] < 0x30) || (in[3] > 0x39))
    return 0;

  /* Accumulate the index one byte at a time. */
  data = (uint32_t) (in[0] - 0x81);
  data = data * 10 + (uint32_t) (in[1] - 0x30);
  data = data * 126 + (uint32_t) (in[2] - 0x81);
  data = data * 10 + (uint32_t) (in[3] - 0x30);

  *inscanlen = 4;
  if (data < 0x00010000)
    *out = (uint16_t) data;
  else
    *out = 0xFFFD;
  return 1;
}

mercurial