intl/icu/source/tools/toolutil/ucm.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2003-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: ucm.c
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2003jun20
michael@0 14 * created by: Markus W. Scherer
michael@0 15 *
michael@0 16 * This file reads a .ucm file, stores its mappings and sorts them.
michael@0 17 * It implements handling of Unicode conversion mappings from .ucm files
michael@0 18 * for makeconv, canonucm, rptp2ucm, etc.
michael@0 19 *
michael@0 20 * Unicode code point sequences with a length of more than 1,
michael@0 21 * as well as byte sequences with more than 4 bytes or more than one complete
michael@0 22 * character sequence are handled to support m:n mappings.
michael@0 23 */
michael@0 24
michael@0 25 #include "unicode/utypes.h"
michael@0 26 #include "unicode/ustring.h"
michael@0 27 #include "cstring.h"
michael@0 28 #include "cmemory.h"
michael@0 29 #include "filestrm.h"
michael@0 30 #include "uarrsort.h"
michael@0 31 #include "ucnvmbcs.h"
michael@0 32 #include "ucnv_bld.h"
michael@0 33 #include "ucnv_ext.h"
michael@0 34 #include "uparse.h"
michael@0 35 #include "ucm.h"
michael@0 36 #include <stdio.h>
michael@0 37
michael@0 38 #if !UCONFIG_NO_CONVERSION
michael@0 39
michael@0 40 /* -------------------------------------------------------------------------- */
michael@0 41
michael@0 42 static void
michael@0 43 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
michael@0 44 int32_t j;
michael@0 45
michael@0 46 for(j=0; j<m->uLen; ++j) {
michael@0 47 fprintf(f, "<U%04lX>", (long)codePoints[j]);
michael@0 48 }
michael@0 49
michael@0 50 fputc(' ', f);
michael@0 51
michael@0 52 for(j=0; j<m->bLen; ++j) {
michael@0 53 fprintf(f, "\\x%02X", bytes[j]);
michael@0 54 }
michael@0 55
michael@0 56 if(m->f>=0) {
michael@0 57 fprintf(f, " |%u\n", m->f);
michael@0 58 } else {
michael@0 59 fputs("\n", f);
michael@0 60 }
michael@0 61 }
michael@0 62
michael@0 63 U_CAPI void U_EXPORT2
michael@0 64 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
michael@0 65 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
michael@0 66 }
michael@0 67
michael@0 68 U_CAPI void U_EXPORT2
michael@0 69 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
michael@0 70 UCMapping *m;
michael@0 71 int32_t i, length;
michael@0 72
michael@0 73 m=table->mappings;
michael@0 74 length=table->mappingsLength;
michael@0 75 if(byUnicode) {
michael@0 76 for(i=0; i<length; ++m, ++i) {
michael@0 77 ucm_printMapping(table, m, f);
michael@0 78 }
michael@0 79 } else {
michael@0 80 const int32_t *map=table->reverseMap;
michael@0 81 for(i=0; i<length; ++i) {
michael@0 82 ucm_printMapping(table, m+map[i], f);
michael@0 83 }
michael@0 84 }
michael@0 85 }
michael@0 86
michael@0 87 /* mapping comparisons ------------------------------------------------------ */
michael@0 88
michael@0 89 static int32_t
michael@0 90 compareUnicode(UCMTable *lTable, const UCMapping *l,
michael@0 91 UCMTable *rTable, const UCMapping *r) {
michael@0 92 const UChar32 *lu, *ru;
michael@0 93 int32_t result, i, length;
michael@0 94
michael@0 95 if(l->uLen==1 && r->uLen==1) {
michael@0 96 /* compare two single code points */
michael@0 97 return l->u-r->u;
michael@0 98 }
michael@0 99
michael@0 100 /* get pointers to the code point sequences */
michael@0 101 lu=UCM_GET_CODE_POINTS(lTable, l);
michael@0 102 ru=UCM_GET_CODE_POINTS(rTable, r);
michael@0 103
michael@0 104 /* get the minimum length */
michael@0 105 if(l->uLen<=r->uLen) {
michael@0 106 length=l->uLen;
michael@0 107 } else {
michael@0 108 length=r->uLen;
michael@0 109 }
michael@0 110
michael@0 111 /* compare the code points */
michael@0 112 for(i=0; i<length; ++i) {
michael@0 113 result=lu[i]-ru[i];
michael@0 114 if(result!=0) {
michael@0 115 return result;
michael@0 116 }
michael@0 117 }
michael@0 118
michael@0 119 /* compare the lengths */
michael@0 120 return l->uLen-r->uLen;
michael@0 121 }
michael@0 122
michael@0 123 static int32_t
michael@0 124 compareBytes(UCMTable *lTable, const UCMapping *l,
michael@0 125 UCMTable *rTable, const UCMapping *r,
michael@0 126 UBool lexical) {
michael@0 127 const uint8_t *lb, *rb;
michael@0 128 int32_t result, i, length;
michael@0 129
michael@0 130 /*
michael@0 131 * A lexical comparison is used for sorting in the builder, to allow
michael@0 132 * an efficient search for a byte sequence that could be a prefix
michael@0 133 * of a previously entered byte sequence.
michael@0 134 *
michael@0 135 * Comparing by lengths first is for compatibility with old .ucm tools
michael@0 136 * like canonucm and rptp2ucm.
michael@0 137 */
michael@0 138 if(lexical) {
michael@0 139 /* get the minimum length and continue */
michael@0 140 if(l->bLen<=r->bLen) {
michael@0 141 length=l->bLen;
michael@0 142 } else {
michael@0 143 length=r->bLen;
michael@0 144 }
michael@0 145 } else {
michael@0 146 /* compare lengths first */
michael@0 147 result=l->bLen-r->bLen;
michael@0 148 if(result!=0) {
michael@0 149 return result;
michael@0 150 } else {
michael@0 151 length=l->bLen;
michael@0 152 }
michael@0 153 }
michael@0 154
michael@0 155 /* get pointers to the byte sequences */
michael@0 156 lb=UCM_GET_BYTES(lTable, l);
michael@0 157 rb=UCM_GET_BYTES(rTable, r);
michael@0 158
michael@0 159 /* compare the bytes */
michael@0 160 for(i=0; i<length; ++i) {
michael@0 161 result=lb[i]-rb[i];
michael@0 162 if(result!=0) {
michael@0 163 return result;
michael@0 164 }
michael@0 165 }
michael@0 166
michael@0 167 /* compare the lengths */
michael@0 168 return l->bLen-r->bLen;
michael@0 169 }
michael@0 170
michael@0 171 /* compare UCMappings for sorting */
michael@0 172 static int32_t
michael@0 173 compareMappings(UCMTable *lTable, const UCMapping *l,
michael@0 174 UCMTable *rTable, const UCMapping *r,
michael@0 175 UBool uFirst) {
michael@0 176 int32_t result;
michael@0 177
michael@0 178 /* choose which side to compare first */
michael@0 179 if(uFirst) {
michael@0 180 /* Unicode then bytes */
michael@0 181 result=compareUnicode(lTable, l, rTable, r);
michael@0 182 if(result==0) {
michael@0 183 result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
michael@0 184 }
michael@0 185 } else {
michael@0 186 /* bytes then Unicode */
michael@0 187 result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
michael@0 188 if(result==0) {
michael@0 189 result=compareUnicode(lTable, l, rTable, r);
michael@0 190 }
michael@0 191 }
michael@0 192
michael@0 193 if(result!=0) {
michael@0 194 return result;
michael@0 195 }
michael@0 196
michael@0 197 /* compare the flags */
michael@0 198 return l->f-r->f;
michael@0 199 }
michael@0 200
michael@0 201 /* sorting by Unicode first sorts mappings directly */
michael@0 202 static int32_t
michael@0 203 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
michael@0 204 return compareMappings(
michael@0 205 (UCMTable *)context, (const UCMapping *)left,
michael@0 206 (UCMTable *)context, (const UCMapping *)right, TRUE);
michael@0 207 }
michael@0 208
michael@0 209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
michael@0 210 static int32_t
michael@0 211 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
michael@0 212 UCMTable *table=(UCMTable *)context;
michael@0 213 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
michael@0 214 return compareMappings(
michael@0 215 table, table->mappings+l,
michael@0 216 table, table->mappings+r, FALSE);
michael@0 217 }
michael@0 218
michael@0 219 U_CAPI void U_EXPORT2
michael@0 220 ucm_sortTable(UCMTable *t) {
michael@0 221 UErrorCode errorCode;
michael@0 222 int32_t i;
michael@0 223
michael@0 224 if(t->isSorted) {
michael@0 225 return;
michael@0 226 }
michael@0 227
michael@0 228 errorCode=U_ZERO_ERROR;
michael@0 229
michael@0 230 /* 1. sort by Unicode first */
michael@0 231 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
michael@0 232 compareMappingsUnicodeFirst, t,
michael@0 233 FALSE, &errorCode);
michael@0 234
michael@0 235 /* build the reverseMap */
michael@0 236 if(t->reverseMap==NULL) {
michael@0 237 /*
michael@0 238 * allocate mappingsCapacity instead of mappingsLength so that
michael@0 239 * if mappings are added, the reverseMap need not be
michael@0 240 * reallocated each time
michael@0 241 * (see ucm_moveMappings() and ucm_addMapping())
michael@0 242 */
michael@0 243 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
michael@0 244 if(t->reverseMap==NULL) {
michael@0 245 fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
michael@0 246 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 247 }
michael@0 248 }
michael@0 249 for(i=0; i<t->mappingsLength; ++i) {
michael@0 250 t->reverseMap[i]=i;
michael@0 251 }
michael@0 252
michael@0 253 /* 2. sort reverseMap by mappings bytes first */
michael@0 254 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
michael@0 255 compareMappingsBytesFirst, t,
michael@0 256 FALSE, &errorCode);
michael@0 257
michael@0 258 if(U_FAILURE(errorCode)) {
michael@0 259 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
michael@0 260 u_errorName(errorCode));
michael@0 261 exit(errorCode);
michael@0 262 }
michael@0 263
michael@0 264 t->isSorted=TRUE;
michael@0 265 }
michael@0 266
michael@0 267 /*
michael@0 268 * remove mappings with their move flag set from the base table
michael@0 269 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
michael@0 270 */
michael@0 271 U_CAPI void U_EXPORT2
michael@0 272 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
michael@0 273 UCMapping *mb, *mbLimit;
michael@0 274 int8_t flag;
michael@0 275
michael@0 276 mb=base->mappings;
michael@0 277 mbLimit=mb+base->mappingsLength;
michael@0 278
michael@0 279 while(mb<mbLimit) {
michael@0 280 flag=mb->moveFlag;
michael@0 281 if(flag!=0) {
michael@0 282 /* reset the move flag */
michael@0 283 mb->moveFlag=0;
michael@0 284
michael@0 285 if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
michael@0 286 /* add the mapping to the extension table */
michael@0 287 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
michael@0 288 }
michael@0 289
michael@0 290 /* remove this mapping: move the last base mapping down and overwrite the current one */
michael@0 291 if(mb<(mbLimit-1)) {
michael@0 292 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
michael@0 293 }
michael@0 294 --mbLimit;
michael@0 295 --base->mappingsLength;
michael@0 296 base->isSorted=FALSE;
michael@0 297 } else {
michael@0 298 ++mb;
michael@0 299 }
michael@0 300 }
michael@0 301 }
michael@0 302
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    /* at least one mapping got a moveFlag; ucm_moveMappings() must run */
    NEEDS_MOVE=1,
    /* at least one conflict was reported to stderr */
    HAS_ERRORS=2
};
michael@0 307
michael@0 308 static uint8_t
michael@0 309 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
michael@0 310 UBool moveToExt, UBool intersectBase) {
michael@0 311 UCMapping *mb, *me, *mbLimit, *meLimit;
michael@0 312 int32_t cmp;
michael@0 313 uint8_t result;
michael@0 314
michael@0 315 mb=base->mappings;
michael@0 316 mbLimit=mb+base->mappingsLength;
michael@0 317
michael@0 318 me=ext->mappings;
michael@0 319 meLimit=me+ext->mappingsLength;
michael@0 320
michael@0 321 result=0;
michael@0 322
michael@0 323 for(;;) {
michael@0 324 /* skip irrelevant mappings on both sides */
michael@0 325 for(;;) {
michael@0 326 if(mb==mbLimit) {
michael@0 327 return result;
michael@0 328 }
michael@0 329
michael@0 330 if((0<=mb->f && mb->f<=2) || mb->f==4) {
michael@0 331 break;
michael@0 332 }
michael@0 333
michael@0 334 ++mb;
michael@0 335 }
michael@0 336
michael@0 337 for(;;) {
michael@0 338 if(me==meLimit) {
michael@0 339 return result;
michael@0 340 }
michael@0 341
michael@0 342 if((0<=me->f && me->f<=2) || me->f==4) {
michael@0 343 break;
michael@0 344 }
michael@0 345
michael@0 346 ++me;
michael@0 347 }
michael@0 348
michael@0 349 /* compare the base and extension mappings */
michael@0 350 cmp=compareUnicode(base, mb, ext, me);
michael@0 351 if(cmp<0) {
michael@0 352 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
michael@0 353 /*
michael@0 354 * mapping in base but not in ext, move it
michael@0 355 *
michael@0 356 * if ext is DBCS, move DBCS mappings here
michael@0 357 * and check SBCS ones for Unicode prefix below
michael@0 358 */
michael@0 359 mb->moveFlag|=UCM_MOVE_TO_EXT;
michael@0 360 result|=NEEDS_MOVE;
michael@0 361
michael@0 362 /* does mb map from an input sequence that is a prefix of me's? */
michael@0 363 } else if( mb->uLen<me->uLen &&
michael@0 364 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
michael@0 365 ) {
michael@0 366 if(moveToExt) {
michael@0 367 /* mark this mapping to be moved to the extension table */
michael@0 368 mb->moveFlag|=UCM_MOVE_TO_EXT;
michael@0 369 result|=NEEDS_MOVE;
michael@0 370 } else {
michael@0 371 fprintf(stderr,
michael@0 372 "ucm error: the base table contains a mapping whose input sequence\n"
michael@0 373 " is a prefix of the input sequence of an extension mapping\n");
michael@0 374 ucm_printMapping(base, mb, stderr);
michael@0 375 ucm_printMapping(ext, me, stderr);
michael@0 376 result|=HAS_ERRORS;
michael@0 377 }
michael@0 378 }
michael@0 379
michael@0 380 ++mb;
michael@0 381 } else if(cmp==0) {
michael@0 382 /*
michael@0 383 * same output: remove the extension mapping,
michael@0 384 * otherwise treat as an error
michael@0 385 */
michael@0 386 if( mb->f==me->f && mb->bLen==me->bLen &&
michael@0 387 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
michael@0 388 ) {
michael@0 389 me->moveFlag|=UCM_REMOVE_MAPPING;
michael@0 390 result|=NEEDS_MOVE;
michael@0 391 } else if(intersectBase) {
michael@0 392 /* mapping in base but not in ext, move it */
michael@0 393 mb->moveFlag|=UCM_MOVE_TO_EXT;
michael@0 394 result|=NEEDS_MOVE;
michael@0 395 } else {
michael@0 396 fprintf(stderr,
michael@0 397 "ucm error: the base table contains a mapping whose input sequence\n"
michael@0 398 " is the same as the input sequence of an extension mapping\n"
michael@0 399 " but it maps differently\n");
michael@0 400 ucm_printMapping(base, mb, stderr);
michael@0 401 ucm_printMapping(ext, me, stderr);
michael@0 402 result|=HAS_ERRORS;
michael@0 403 }
michael@0 404
michael@0 405 ++mb;
michael@0 406 } else /* cmp>0 */ {
michael@0 407 ++me;
michael@0 408 }
michael@0 409 }
michael@0 410 }
michael@0 411
michael@0 412 static uint8_t
michael@0 413 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
michael@0 414 UBool moveToExt, UBool intersectBase) {
michael@0 415 UCMapping *mb, *me;
michael@0 416 int32_t *baseMap, *extMap;
michael@0 417 int32_t b, e, bLimit, eLimit, cmp;
michael@0 418 uint8_t result;
michael@0 419 UBool isSISO;
michael@0 420
michael@0 421 baseMap=base->reverseMap;
michael@0 422 extMap=ext->reverseMap;
michael@0 423
michael@0 424 b=e=0;
michael@0 425 bLimit=base->mappingsLength;
michael@0 426 eLimit=ext->mappingsLength;
michael@0 427
michael@0 428 result=0;
michael@0 429
michael@0 430 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
michael@0 431
michael@0 432 for(;;) {
michael@0 433 /* skip irrelevant mappings on both sides */
michael@0 434 for(;; ++b) {
michael@0 435 if(b==bLimit) {
michael@0 436 return result;
michael@0 437 }
michael@0 438 mb=base->mappings+baseMap[b];
michael@0 439
michael@0 440 if(intersectBase==2 && mb->bLen==1) {
michael@0 441 /*
michael@0 442 * comparing a base against a DBCS extension:
michael@0 443 * leave SBCS base mappings alone
michael@0 444 */
michael@0 445 continue;
michael@0 446 }
michael@0 447
michael@0 448 if(mb->f==0 || mb->f==3) {
michael@0 449 break;
michael@0 450 }
michael@0 451 }
michael@0 452
michael@0 453 for(;;) {
michael@0 454 if(e==eLimit) {
michael@0 455 return result;
michael@0 456 }
michael@0 457 me=ext->mappings+extMap[e];
michael@0 458
michael@0 459 if(me->f==0 || me->f==3) {
michael@0 460 break;
michael@0 461 }
michael@0 462
michael@0 463 ++e;
michael@0 464 }
michael@0 465
michael@0 466 /* compare the base and extension mappings */
michael@0 467 cmp=compareBytes(base, mb, ext, me, TRUE);
michael@0 468 if(cmp<0) {
michael@0 469 if(intersectBase) {
michael@0 470 /* mapping in base but not in ext, move it */
michael@0 471 mb->moveFlag|=UCM_MOVE_TO_EXT;
michael@0 472 result|=NEEDS_MOVE;
michael@0 473
michael@0 474 /*
michael@0 475 * does mb map from an input sequence that is a prefix of me's?
michael@0 476 * for SI/SO tables, a single byte is never a prefix because it
michael@0 477 * occurs in a separate single-byte state
michael@0 478 */
michael@0 479 } else if( mb->bLen<me->bLen &&
michael@0 480 (!isSISO || mb->bLen>1) &&
michael@0 481 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
michael@0 482 ) {
michael@0 483 if(moveToExt) {
michael@0 484 /* mark this mapping to be moved to the extension table */
michael@0 485 mb->moveFlag|=UCM_MOVE_TO_EXT;
michael@0 486 result|=NEEDS_MOVE;
michael@0 487 } else {
michael@0 488 fprintf(stderr,
michael@0 489 "ucm error: the base table contains a mapping whose input sequence\n"
michael@0 490 " is a prefix of the input sequence of an extension mapping\n");
michael@0 491 ucm_printMapping(base, mb, stderr);
michael@0 492 ucm_printMapping(ext, me, stderr);
michael@0 493 result|=HAS_ERRORS;
michael@0 494 }
michael@0 495 }
michael@0 496
michael@0 497 ++b;
michael@0 498 } else if(cmp==0) {
michael@0 499 /*
michael@0 500 * same output: remove the extension mapping,
michael@0 501 * otherwise treat as an error
michael@0 502 */
michael@0 503 if( mb->f==me->f && mb->uLen==me->uLen &&
michael@0 504 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
michael@0 505 ) {
michael@0 506 me->moveFlag|=UCM_REMOVE_MAPPING;
michael@0 507 result|=NEEDS_MOVE;
michael@0 508 } else if(intersectBase) {
michael@0 509 /* mapping in base but not in ext, move it */
michael@0 510 mb->moveFlag|=UCM_MOVE_TO_EXT;
michael@0 511 result|=NEEDS_MOVE;
michael@0 512 } else {
michael@0 513 fprintf(stderr,
michael@0 514 "ucm error: the base table contains a mapping whose input sequence\n"
michael@0 515 " is the same as the input sequence of an extension mapping\n"
michael@0 516 " but it maps differently\n");
michael@0 517 ucm_printMapping(base, mb, stderr);
michael@0 518 ucm_printMapping(ext, me, stderr);
michael@0 519 result|=HAS_ERRORS;
michael@0 520 }
michael@0 521
michael@0 522 ++b;
michael@0 523 } else /* cmp>0 */ {
michael@0 524 ++e;
michael@0 525 }
michael@0 526 }
michael@0 527 }
michael@0 528
michael@0 529 U_CAPI UBool U_EXPORT2
michael@0 530 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
michael@0 531 UCMapping *m, *mLimit;
michael@0 532 int32_t count;
michael@0 533 UBool isOK;
michael@0 534
michael@0 535 m=table->mappings;
michael@0 536 mLimit=m+table->mappingsLength;
michael@0 537 isOK=TRUE;
michael@0 538
michael@0 539 while(m<mLimit) {
michael@0 540 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
michael@0 541 if(count<1) {
michael@0 542 ucm_printMapping(table, m, stderr);
michael@0 543 isOK=FALSE;
michael@0 544 }
michael@0 545 ++m;
michael@0 546 }
michael@0 547
michael@0 548 return isOK;
michael@0 549 }
michael@0 550
michael@0 551 U_CAPI UBool U_EXPORT2
michael@0 552 ucm_checkBaseExt(UCMStates *baseStates,
michael@0 553 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
michael@0 554 UBool intersectBase) {
michael@0 555 uint8_t result;
michael@0 556
michael@0 557 /* if we have an extension table, we must always use precision flags */
michael@0 558 if(base->flagsType&UCM_FLAGS_IMPLICIT) {
michael@0 559 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
michael@0 560 return FALSE;
michael@0 561 }
michael@0 562 if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
michael@0 563 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
michael@0 564 return FALSE;
michael@0 565 }
michael@0 566
michael@0 567 /* checking requires both tables to be sorted */
michael@0 568 ucm_sortTable(base);
michael@0 569 ucm_sortTable(ext);
michael@0 570
michael@0 571 /* check */
michael@0 572 result=
michael@0 573 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
michael@0 574 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
michael@0 575
michael@0 576 if(result&HAS_ERRORS) {
michael@0 577 return FALSE;
michael@0 578 }
michael@0 579
michael@0 580 if(result&NEEDS_MOVE) {
michael@0 581 ucm_moveMappings(ext, NULL);
michael@0 582 ucm_moveMappings(base, moveTarget);
michael@0 583 ucm_sortTable(base);
michael@0 584 ucm_sortTable(ext);
michael@0 585 if(moveTarget!=NULL) {
michael@0 586 ucm_sortTable(moveTarget);
michael@0 587 }
michael@0 588 }
michael@0 589
michael@0 590 return TRUE;
michael@0 591 }
michael@0 592
michael@0 593 /* merge tables for rptp2ucm ------------------------------------------------ */
michael@0 594
michael@0 595 U_CAPI void U_EXPORT2
michael@0 596 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
michael@0 597 const uint8_t *subchar, int32_t subcharLength,
michael@0 598 uint8_t subchar1) {
michael@0 599 UCMapping *fromUMapping, *toUMapping;
michael@0 600 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
michael@0 601
michael@0 602 ucm_sortTable(fromUTable);
michael@0 603 ucm_sortTable(toUTable);
michael@0 604
michael@0 605 fromUMapping=fromUTable->mappings;
michael@0 606 toUMapping=toUTable->mappings;
michael@0 607
michael@0 608 fromUTop=fromUTable->mappingsLength;
michael@0 609 toUTop=toUTable->mappingsLength;
michael@0 610
michael@0 611 fromUIndex=toUIndex=0;
michael@0 612
michael@0 613 while(fromUIndex<fromUTop && toUIndex<toUTop) {
michael@0 614 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
michael@0 615 if(cmp==0) {
michael@0 616 /* equal: roundtrip, nothing to do (flags are initially 0) */
michael@0 617 ++fromUMapping;
michael@0 618 ++toUMapping;
michael@0 619
michael@0 620 ++fromUIndex;
michael@0 621 ++toUIndex;
michael@0 622 } else if(cmp<0) {
michael@0 623 /*
michael@0 624 * the fromU mapping does not have a toU counterpart:
michael@0 625 * fallback Unicode->codepage
michael@0 626 */
michael@0 627 if( (fromUMapping->bLen==subcharLength &&
michael@0 628 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
michael@0 629 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
michael@0 630 ) {
michael@0 631 fromUMapping->f=2; /* SUB mapping */
michael@0 632 } else {
michael@0 633 fromUMapping->f=1; /* normal fallback */
michael@0 634 }
michael@0 635
michael@0 636 ++fromUMapping;
michael@0 637 ++fromUIndex;
michael@0 638 } else {
michael@0 639 /*
michael@0 640 * the toU mapping does not have a fromU counterpart:
michael@0 641 * (reverse) fallback codepage->Unicode, copy it to the fromU table
michael@0 642 */
michael@0 643
michael@0 644 /* ignore reverse fallbacks to Unicode SUB */
michael@0 645 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
michael@0 646 toUMapping->f=3; /* reverse fallback */
michael@0 647 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
michael@0 648
michael@0 649 /* the table may have been reallocated */
michael@0 650 fromUMapping=fromUTable->mappings+fromUIndex;
michael@0 651 }
michael@0 652
michael@0 653 ++toUMapping;
michael@0 654 ++toUIndex;
michael@0 655 }
michael@0 656 }
michael@0 657
michael@0 658 /* either one or both tables are exhausted */
michael@0 659 while(fromUIndex<fromUTop) {
michael@0 660 /* leftover fromU mappings are fallbacks */
michael@0 661 if( (fromUMapping->bLen==subcharLength &&
michael@0 662 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
michael@0 663 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
michael@0 664 ) {
michael@0 665 fromUMapping->f=2; /* SUB mapping */
michael@0 666 } else {
michael@0 667 fromUMapping->f=1; /* normal fallback */
michael@0 668 }
michael@0 669
michael@0 670 ++fromUMapping;
michael@0 671 ++fromUIndex;
michael@0 672 }
michael@0 673
michael@0 674 while(toUIndex<toUTop) {
michael@0 675 /* leftover toU mappings are reverse fallbacks */
michael@0 676
michael@0 677 /* ignore reverse fallbacks to Unicode SUB */
michael@0 678 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
michael@0 679 toUMapping->f=3; /* reverse fallback */
michael@0 680 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
michael@0 681 }
michael@0 682
michael@0 683 ++toUMapping;
michael@0 684 ++toUIndex;
michael@0 685 }
michael@0 686
michael@0 687 fromUTable->isSorted=FALSE;
michael@0 688 }
michael@0 689
michael@0 690 /* separate extension mappings out of base table for rptp2ucm --------------- */
michael@0 691
michael@0 692 U_CAPI UBool U_EXPORT2
michael@0 693 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
michael@0 694 UCMTable *table;
michael@0 695 UCMapping *m, *mLimit;
michael@0 696 int32_t type;
michael@0 697 UBool needsMove, isOK;
michael@0 698
michael@0 699 table=ucm->base;
michael@0 700 m=table->mappings;
michael@0 701 mLimit=m+table->mappingsLength;
michael@0 702
michael@0 703 needsMove=FALSE;
michael@0 704 isOK=TRUE;
michael@0 705
michael@0 706 for(; m<mLimit; ++m) {
michael@0 707 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
michael@0 708 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
michael@0 709 ucm_printMapping(table, m, stderr);
michael@0 710 m->moveFlag|=UCM_REMOVE_MAPPING;
michael@0 711 needsMove=TRUE;
michael@0 712 continue;
michael@0 713 }
michael@0 714
michael@0 715 type=ucm_mappingType(
michael@0 716 &ucm->states, m,
michael@0 717 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
michael@0 718 if(type<0) {
michael@0 719 /* illegal byte sequence */
michael@0 720 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
michael@0 721 isOK=FALSE;
michael@0 722 } else if(type>0) {
michael@0 723 m->moveFlag|=UCM_MOVE_TO_EXT;
michael@0 724 needsMove=TRUE;
michael@0 725 }
michael@0 726 }
michael@0 727
michael@0 728 if(!isOK) {
michael@0 729 return FALSE;
michael@0 730 }
michael@0 731 if(needsMove) {
michael@0 732 ucm_moveMappings(ucm->base, ucm->ext);
michael@0 733 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
michael@0 734 } else {
michael@0 735 ucm_sortTable(ucm->base);
michael@0 736 return TRUE;
michael@0 737 }
michael@0 738 }
michael@0 739
michael@0 740 /* ucm parser --------------------------------------------------------------- */
michael@0 741
michael@0 742 U_CAPI int8_t U_EXPORT2
michael@0 743 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
michael@0 744 const char *s=*ps;
michael@0 745 char *end;
michael@0 746 uint8_t byte;
michael@0 747 int8_t bLen;
michael@0 748
michael@0 749 bLen=0;
michael@0 750 for(;;) {
michael@0 751 /* skip an optional plus sign */
michael@0 752 if(bLen>0 && *s=='+') {
michael@0 753 ++s;
michael@0 754 }
michael@0 755 if(*s!='\\') {
michael@0 756 break;
michael@0 757 }
michael@0 758
michael@0 759 if( s[1]!='x' ||
michael@0 760 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
michael@0 761 ) {
michael@0 762 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
michael@0 763 return -1;
michael@0 764 }
michael@0 765
michael@0 766 if(bLen==UCNV_EXT_MAX_BYTES) {
michael@0 767 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
michael@0 768 return -1;
michael@0 769 }
michael@0 770 bytes[bLen++]=byte;
michael@0 771 s=end;
michael@0 772 }
michael@0 773
michael@0 774 *ps=s;
michael@0 775 return bLen;
michael@0 776 }
michael@0 777
michael@0 778 /* parse a mapping line; must not be empty */
michael@0 779 U_CAPI UBool U_EXPORT2
michael@0 780 ucm_parseMappingLine(UCMapping *m,
michael@0 781 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
michael@0 782 uint8_t bytes[UCNV_EXT_MAX_BYTES],
michael@0 783 const char *line) {
michael@0 784 const char *s;
michael@0 785 char *end;
michael@0 786 UChar32 cp;
michael@0 787 int32_t u16Length;
michael@0 788 int8_t uLen, bLen, f;
michael@0 789
michael@0 790 s=line;
michael@0 791 uLen=bLen=0;
michael@0 792
michael@0 793 /* parse code points */
michael@0 794 for(;;) {
michael@0 795 /* skip an optional plus sign */
michael@0 796 if(uLen>0 && *s=='+') {
michael@0 797 ++s;
michael@0 798 }
michael@0 799 if(*s!='<') {
michael@0 800 break;
michael@0 801 }
michael@0 802
michael@0 803 if( s[1]!='U' ||
michael@0 804 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
michael@0 805 *end!='>'
michael@0 806 ) {
michael@0 807 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
michael@0 808 return FALSE;
michael@0 809 }
michael@0 810 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
michael@0 811 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
michael@0 812 return FALSE;
michael@0 813 }
michael@0 814
michael@0 815 if(uLen==UCNV_EXT_MAX_UCHARS) {
michael@0 816 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
michael@0 817 return FALSE;
michael@0 818 }
michael@0 819 codePoints[uLen++]=cp;
michael@0 820 s=end+1;
michael@0 821 }
michael@0 822
michael@0 823 if(uLen==0) {
michael@0 824 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
michael@0 825 return FALSE;
michael@0 826 } else if(uLen==1) {
michael@0 827 m->u=codePoints[0];
michael@0 828 } else {
michael@0 829 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 830 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
michael@0 831 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
michael@0 832 u16Length>UCNV_EXT_MAX_UCHARS
michael@0 833 ) {
michael@0 834 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
michael@0 835 return FALSE;
michael@0 836 }
michael@0 837 }
michael@0 838
michael@0 839 s=u_skipWhitespace(s);
michael@0 840
michael@0 841 /* parse bytes */
michael@0 842 bLen=ucm_parseBytes(bytes, line, &s);
michael@0 843
michael@0 844 if(bLen<0) {
michael@0 845 return FALSE;
michael@0 846 } else if(bLen==0) {
michael@0 847 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
michael@0 848 return FALSE;
michael@0 849 } else if(bLen<=4) {
michael@0 850 uprv_memcpy(m->b.bytes, bytes, bLen);
michael@0 851 }
michael@0 852
michael@0 853 /* skip everything until the fallback indicator, even the start of a comment */
michael@0 854 for(;;) {
michael@0 855 if(*s==0) {
michael@0 856 f=-1; /* no fallback indicator */
michael@0 857 break;
michael@0 858 } else if(*s=='|') {
michael@0 859 f=(int8_t)(s[1]-'0');
michael@0 860 if((uint8_t)f>4) {
michael@0 861 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
michael@0 862 return FALSE;
michael@0 863 }
michael@0 864 break;
michael@0 865 }
michael@0 866 ++s;
michael@0 867 }
michael@0 868
michael@0 869 m->uLen=uLen;
michael@0 870 m->bLen=bLen;
michael@0 871 m->f=f;
michael@0 872 return TRUE;
michael@0 873 }
michael@0 874
michael@0 875 /* general APIs ------------------------------------------------------------- */
michael@0 876
michael@0 877 U_CAPI UCMTable * U_EXPORT2
michael@0 878 ucm_openTable() {
michael@0 879 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
michael@0 880 if(table==NULL) {
michael@0 881 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
michael@0 882 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 883 }
michael@0 884
michael@0 885 memset(table, 0, sizeof(UCMTable));
michael@0 886 return table;
michael@0 887 }
michael@0 888
michael@0 889 U_CAPI void U_EXPORT2
michael@0 890 ucm_closeTable(UCMTable *table) {
michael@0 891 if(table!=NULL) {
michael@0 892 uprv_free(table->mappings);
michael@0 893 uprv_free(table->codePoints);
michael@0 894 uprv_free(table->bytes);
michael@0 895 uprv_free(table->reverseMap);
michael@0 896 uprv_free(table);
michael@0 897 }
michael@0 898 }
michael@0 899
michael@0 900 U_CAPI void U_EXPORT2
michael@0 901 ucm_resetTable(UCMTable *table) {
michael@0 902 if(table!=NULL) {
michael@0 903 table->mappingsLength=0;
michael@0 904 table->flagsType=0;
michael@0 905 table->unicodeMask=0;
michael@0 906 table->bytesLength=table->codePointsLength=0;
michael@0 907 table->isSorted=FALSE;
michael@0 908 }
michael@0 909 }
michael@0 910
michael@0 911 U_CAPI void U_EXPORT2
michael@0 912 ucm_addMapping(UCMTable *table,
michael@0 913 UCMapping *m,
michael@0 914 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
michael@0 915 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
michael@0 916 UCMapping *tm;
michael@0 917 UChar32 c;
michael@0 918 int32_t idx;
michael@0 919
michael@0 920 if(table->mappingsLength>=table->mappingsCapacity) {
michael@0 921 /* make the mappings array larger */
michael@0 922 if(table->mappingsCapacity==0) {
michael@0 923 table->mappingsCapacity=1000;
michael@0 924 } else {
michael@0 925 table->mappingsCapacity*=10;
michael@0 926 }
michael@0 927 table->mappings=(UCMapping *)uprv_realloc(table->mappings,
michael@0 928 table->mappingsCapacity*sizeof(UCMapping));
michael@0 929 if(table->mappings==NULL) {
michael@0 930 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
michael@0 931 (int)table->mappingsCapacity);
michael@0 932 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 933 }
michael@0 934
michael@0 935 if(table->reverseMap!=NULL) {
michael@0 936 /* the reverseMap must be reallocated in a new sort */
michael@0 937 uprv_free(table->reverseMap);
michael@0 938 table->reverseMap=NULL;
michael@0 939 }
michael@0 940 }
michael@0 941
michael@0 942 if(m->uLen>1 && table->codePointsCapacity==0) {
michael@0 943 table->codePointsCapacity=10000;
michael@0 944 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
michael@0 945 if(table->codePoints==NULL) {
michael@0 946 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
michael@0 947 (int)table->codePointsCapacity);
michael@0 948 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 949 }
michael@0 950 }
michael@0 951
michael@0 952 if(m->bLen>4 && table->bytesCapacity==0) {
michael@0 953 table->bytesCapacity=10000;
michael@0 954 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
michael@0 955 if(table->bytes==NULL) {
michael@0 956 fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
michael@0 957 (int)table->bytesCapacity);
michael@0 958 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 959 }
michael@0 960 }
michael@0 961
michael@0 962 if(m->uLen>1) {
michael@0 963 idx=table->codePointsLength;
michael@0 964 table->codePointsLength+=m->uLen;
michael@0 965 if(table->codePointsLength>table->codePointsCapacity) {
michael@0 966 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
michael@0 967 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 968 }
michael@0 969
michael@0 970 uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4);
michael@0 971 m->u=idx;
michael@0 972 }
michael@0 973
michael@0 974 if(m->bLen>4) {
michael@0 975 idx=table->bytesLength;
michael@0 976 table->bytesLength+=m->bLen;
michael@0 977 if(table->bytesLength>table->bytesCapacity) {
michael@0 978 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
michael@0 979 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 980 }
michael@0 981
michael@0 982 uprv_memcpy(table->bytes+idx, bytes, m->bLen);
michael@0 983 m->b.idx=idx;
michael@0 984 }
michael@0 985
michael@0 986 /* set unicodeMask */
michael@0 987 for(idx=0; idx<m->uLen; ++idx) {
michael@0 988 c=codePoints[idx];
michael@0 989 if(c>=0x10000) {
michael@0 990 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
michael@0 991 } else if(U_IS_SURROGATE(c)) {
michael@0 992 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
michael@0 993 }
michael@0 994 }
michael@0 995
michael@0 996 /* set flagsType */
michael@0 997 if(m->f<0) {
michael@0 998 table->flagsType|=UCM_FLAGS_IMPLICIT;
michael@0 999 } else {
michael@0 1000 table->flagsType|=UCM_FLAGS_EXPLICIT;
michael@0 1001 }
michael@0 1002
michael@0 1003 tm=table->mappings+table->mappingsLength++;
michael@0 1004 uprv_memcpy(tm, m, sizeof(UCMapping));
michael@0 1005
michael@0 1006 table->isSorted=FALSE;
michael@0 1007 }
michael@0 1008
michael@0 1009 U_CAPI UCMFile * U_EXPORT2
michael@0 1010 ucm_open() {
michael@0 1011 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
michael@0 1012 if(ucm==NULL) {
michael@0 1013 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
michael@0 1014 exit(U_MEMORY_ALLOCATION_ERROR);
michael@0 1015 }
michael@0 1016
michael@0 1017 memset(ucm, 0, sizeof(UCMFile));
michael@0 1018
michael@0 1019 ucm->base=ucm_openTable();
michael@0 1020 ucm->ext=ucm_openTable();
michael@0 1021
michael@0 1022 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
michael@0 1023 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
michael@0 1024 ucm->states.outputType=-1;
michael@0 1025 ucm->states.minCharLength=ucm->states.maxCharLength=1;
michael@0 1026
michael@0 1027 return ucm;
michael@0 1028 }
michael@0 1029
michael@0 1030 U_CAPI void U_EXPORT2
michael@0 1031 ucm_close(UCMFile *ucm) {
michael@0 1032 if(ucm!=NULL) {
michael@0 1033 ucm_closeTable(ucm->base);
michael@0 1034 ucm_closeTable(ucm->ext);
michael@0 1035 uprv_free(ucm);
michael@0 1036 }
michael@0 1037 }
michael@0 1038
michael@0 1039 U_CAPI int32_t U_EXPORT2
michael@0 1040 ucm_mappingType(UCMStates *baseStates,
michael@0 1041 UCMapping *m,
michael@0 1042 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
michael@0 1043 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
michael@0 1044 /* check validity of the bytes and count the characters in them */
michael@0 1045 int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
michael@0 1046 if(count<1) {
michael@0 1047 /* illegal byte sequence */
michael@0 1048 return -1;
michael@0 1049 }
michael@0 1050
michael@0 1051 /*
michael@0 1052 * Suitable for an ICU conversion base table means:
michael@0 1053 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
michael@0 1054 * - precision flag 0..3
michael@0 1055 * - SBCS: any 1:1 mapping
michael@0 1056 * (the table stores additional bits to distinguish mapping types)
michael@0 1057 * - MBCS: not a |2 SUB mapping for <subchar1>
michael@0 1058 * - MBCS: not a |1 fallback to 0x00
michael@0 1059 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
michael@0 1060 *
michael@0 1061 * Further restrictions for fromUnicode tables
michael@0 1062 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
michael@0 1063 *
michael@0 1064 * All of the MBCS fromUnicode specific tests could be removed from here,
michael@0 1065 * but the ones above are for unusual mappings, and removing the tests
michael@0 1066 * from here would change canonucm output which seems gratuitous.
michael@0 1067 * (Markus Scherer 2006-nov-28)
michael@0 1068 *
michael@0 1069 * Exception: All implicit mappings (f<0) that need to be moved
michael@0 1070 * because of fromUnicode restrictions _must_ be moved here because
michael@0 1071 * makeconv uses a hack for moving mappings only for the fromUnicode table
michael@0 1072 * that only works with non-negative values of f.
michael@0 1073 */
michael@0 1074 if( m->uLen==1 && count==1 && m->f<=3 &&
michael@0 1075 (baseStates->maxCharLength==1 ||
michael@0 1076 !((m->f==2 && m->bLen==1) ||
michael@0 1077 (m->f==1 && bytes[0]==0) ||
michael@0 1078 (m->f<=1 && m->bLen>1 && bytes[0]==0)))
michael@0 1079 ) {
michael@0 1080 return 0; /* suitable for a base table */
michael@0 1081 } else {
michael@0 1082 return 1; /* needs to go into an extension table */
michael@0 1083 }
michael@0 1084 }
michael@0 1085
michael@0 1086 U_CAPI UBool U_EXPORT2
michael@0 1087 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
michael@0 1088 UCMapping *m,
michael@0 1089 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
michael@0 1090 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
michael@0 1091 int32_t type;
michael@0 1092
michael@0 1093 if(m->f==2 && m->uLen>1) {
michael@0 1094 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
michael@0 1095 printMapping(m, codePoints, bytes, stderr);
michael@0 1096 return FALSE;
michael@0 1097 }
michael@0 1098
michael@0 1099 if(baseStates!=NULL) {
michael@0 1100 /* check validity of the bytes and count the characters in them */
michael@0 1101 type=ucm_mappingType(baseStates, m, codePoints, bytes);
michael@0 1102 if(type<0) {
michael@0 1103 /* illegal byte sequence */
michael@0 1104 printMapping(m, codePoints, bytes, stderr);
michael@0 1105 return FALSE;
michael@0 1106 }
michael@0 1107 } else {
michael@0 1108 /* not used - adding a mapping for an extension-only table before its base table is read */
michael@0 1109 type=1;
michael@0 1110 }
michael@0 1111
michael@0 1112 /*
michael@0 1113 * Add the mapping to the base table if this is requested and suitable.
michael@0 1114 * Otherwise, add it to the extension table.
michael@0 1115 */
michael@0 1116 if(forBase && type==0) {
michael@0 1117 ucm_addMapping(ucm->base, m, codePoints, bytes);
michael@0 1118 } else {
michael@0 1119 ucm_addMapping(ucm->ext, m, codePoints, bytes);
michael@0 1120 }
michael@0 1121
michael@0 1122 return TRUE;
michael@0 1123 }
michael@0 1124
michael@0 1125 U_CAPI UBool U_EXPORT2
michael@0 1126 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
michael@0 1127 UCMapping m={ 0 };
michael@0 1128 UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
michael@0 1129 uint8_t bytes[UCNV_EXT_MAX_BYTES];
michael@0 1130
michael@0 1131 const char *s;
michael@0 1132
michael@0 1133 /* ignore empty and comment lines */
michael@0 1134 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
michael@0 1135 return TRUE;
michael@0 1136 }
michael@0 1137
michael@0 1138 return
michael@0 1139 ucm_parseMappingLine(&m, codePoints, bytes, line) &&
michael@0 1140 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
michael@0 1141 }
michael@0 1142
michael@0 1143 U_CAPI void U_EXPORT2
michael@0 1144 ucm_readTable(UCMFile *ucm, FileStream* convFile,
michael@0 1145 UBool forBase, UCMStates *baseStates,
michael@0 1146 UErrorCode *pErrorCode) {
michael@0 1147 char line[500];
michael@0 1148 char *end;
michael@0 1149 UBool isOK;
michael@0 1150
michael@0 1151 if(U_FAILURE(*pErrorCode)) {
michael@0 1152 return;
michael@0 1153 }
michael@0 1154
michael@0 1155 isOK=TRUE;
michael@0 1156
michael@0 1157 for(;;) {
michael@0 1158 /* read the next line */
michael@0 1159 if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
michael@0 1160 fprintf(stderr, "incomplete charmap section\n");
michael@0 1161 isOK=FALSE;
michael@0 1162 break;
michael@0 1163 }
michael@0 1164
michael@0 1165 /* remove CR LF */
michael@0 1166 end=uprv_strchr(line, 0);
michael@0 1167 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
michael@0 1168 --end;
michael@0 1169 }
michael@0 1170 *end=0;
michael@0 1171
michael@0 1172 /* ignore empty and comment lines */
michael@0 1173 if(line[0]==0 || line[0]=='#') {
michael@0 1174 continue;
michael@0 1175 }
michael@0 1176
michael@0 1177 /* stop at the end of the mapping table */
michael@0 1178 if(0==uprv_strcmp(line, "END CHARMAP")) {
michael@0 1179 break;
michael@0 1180 }
michael@0 1181
michael@0 1182 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
michael@0 1183 }
michael@0 1184
michael@0 1185 if(!isOK) {
michael@0 1186 *pErrorCode=U_INVALID_TABLE_FORMAT;
michael@0 1187 }
michael@0 1188 }
michael@0 1189 #endif

mercurial