Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2003-2013, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: ucm.c |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2003jun20 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | * |
michael@0 | 16 | * This file reads a .ucm file, stores its mappings and sorts them. |
michael@0 | 17 | * It implements handling of Unicode conversion mappings from .ucm files |
michael@0 | 18 | * for makeconv, canonucm, rptp2ucm, etc. |
michael@0 | 19 | * |
michael@0 | 20 | * Unicode code point sequences with a length of more than 1, |
michael@0 | 21 | * as well as byte sequences with more than 4 bytes or more than one complete |
michael@0 | 22 | * character sequence are handled to support m:n mappings. |
michael@0 | 23 | */ |
michael@0 | 24 | |
michael@0 | 25 | #include "unicode/utypes.h" |
michael@0 | 26 | #include "unicode/ustring.h" |
michael@0 | 27 | #include "cstring.h" |
michael@0 | 28 | #include "cmemory.h" |
michael@0 | 29 | #include "filestrm.h" |
michael@0 | 30 | #include "uarrsort.h" |
michael@0 | 31 | #include "ucnvmbcs.h" |
michael@0 | 32 | #include "ucnv_bld.h" |
michael@0 | 33 | #include "ucnv_ext.h" |
michael@0 | 34 | #include "uparse.h" |
michael@0 | 35 | #include "ucm.h" |
michael@0 | 36 | #include <stdio.h> |
michael@0 | 37 | |
michael@0 | 38 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 39 | |
michael@0 | 40 | /* -------------------------------------------------------------------------- */ |
michael@0 | 41 | |
michael@0 | 42 | static void |
michael@0 | 43 | printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { |
michael@0 | 44 | int32_t j; |
michael@0 | 45 | |
michael@0 | 46 | for(j=0; j<m->uLen; ++j) { |
michael@0 | 47 | fprintf(f, "<U%04lX>", (long)codePoints[j]); |
michael@0 | 48 | } |
michael@0 | 49 | |
michael@0 | 50 | fputc(' ', f); |
michael@0 | 51 | |
michael@0 | 52 | for(j=0; j<m->bLen; ++j) { |
michael@0 | 53 | fprintf(f, "\\x%02X", bytes[j]); |
michael@0 | 54 | } |
michael@0 | 55 | |
michael@0 | 56 | if(m->f>=0) { |
michael@0 | 57 | fprintf(f, " |%u\n", m->f); |
michael@0 | 58 | } else { |
michael@0 | 59 | fputs("\n", f); |
michael@0 | 60 | } |
michael@0 | 61 | } |
michael@0 | 62 | |
michael@0 | 63 | U_CAPI void U_EXPORT2 |
michael@0 | 64 | ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { |
michael@0 | 65 | printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); |
michael@0 | 66 | } |
michael@0 | 67 | |
michael@0 | 68 | U_CAPI void U_EXPORT2 |
michael@0 | 69 | ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { |
michael@0 | 70 | UCMapping *m; |
michael@0 | 71 | int32_t i, length; |
michael@0 | 72 | |
michael@0 | 73 | m=table->mappings; |
michael@0 | 74 | length=table->mappingsLength; |
michael@0 | 75 | if(byUnicode) { |
michael@0 | 76 | for(i=0; i<length; ++m, ++i) { |
michael@0 | 77 | ucm_printMapping(table, m, f); |
michael@0 | 78 | } |
michael@0 | 79 | } else { |
michael@0 | 80 | const int32_t *map=table->reverseMap; |
michael@0 | 81 | for(i=0; i<length; ++i) { |
michael@0 | 82 | ucm_printMapping(table, m+map[i], f); |
michael@0 | 83 | } |
michael@0 | 84 | } |
michael@0 | 85 | } |
michael@0 | 86 | |
michael@0 | 87 | /* mapping comparisons ------------------------------------------------------ */ |
michael@0 | 88 | |
michael@0 | 89 | static int32_t |
michael@0 | 90 | compareUnicode(UCMTable *lTable, const UCMapping *l, |
michael@0 | 91 | UCMTable *rTable, const UCMapping *r) { |
michael@0 | 92 | const UChar32 *lu, *ru; |
michael@0 | 93 | int32_t result, i, length; |
michael@0 | 94 | |
michael@0 | 95 | if(l->uLen==1 && r->uLen==1) { |
michael@0 | 96 | /* compare two single code points */ |
michael@0 | 97 | return l->u-r->u; |
michael@0 | 98 | } |
michael@0 | 99 | |
michael@0 | 100 | /* get pointers to the code point sequences */ |
michael@0 | 101 | lu=UCM_GET_CODE_POINTS(lTable, l); |
michael@0 | 102 | ru=UCM_GET_CODE_POINTS(rTable, r); |
michael@0 | 103 | |
michael@0 | 104 | /* get the minimum length */ |
michael@0 | 105 | if(l->uLen<=r->uLen) { |
michael@0 | 106 | length=l->uLen; |
michael@0 | 107 | } else { |
michael@0 | 108 | length=r->uLen; |
michael@0 | 109 | } |
michael@0 | 110 | |
michael@0 | 111 | /* compare the code points */ |
michael@0 | 112 | for(i=0; i<length; ++i) { |
michael@0 | 113 | result=lu[i]-ru[i]; |
michael@0 | 114 | if(result!=0) { |
michael@0 | 115 | return result; |
michael@0 | 116 | } |
michael@0 | 117 | } |
michael@0 | 118 | |
michael@0 | 119 | /* compare the lengths */ |
michael@0 | 120 | return l->uLen-r->uLen; |
michael@0 | 121 | } |
michael@0 | 122 | |
michael@0 | 123 | static int32_t |
michael@0 | 124 | compareBytes(UCMTable *lTable, const UCMapping *l, |
michael@0 | 125 | UCMTable *rTable, const UCMapping *r, |
michael@0 | 126 | UBool lexical) { |
michael@0 | 127 | const uint8_t *lb, *rb; |
michael@0 | 128 | int32_t result, i, length; |
michael@0 | 129 | |
michael@0 | 130 | /* |
michael@0 | 131 | * A lexical comparison is used for sorting in the builder, to allow |
michael@0 | 132 | * an efficient search for a byte sequence that could be a prefix |
michael@0 | 133 | * of a previously entered byte sequence. |
michael@0 | 134 | * |
michael@0 | 135 | * Comparing by lengths first is for compatibility with old .ucm tools |
michael@0 | 136 | * like canonucm and rptp2ucm. |
michael@0 | 137 | */ |
michael@0 | 138 | if(lexical) { |
michael@0 | 139 | /* get the minimum length and continue */ |
michael@0 | 140 | if(l->bLen<=r->bLen) { |
michael@0 | 141 | length=l->bLen; |
michael@0 | 142 | } else { |
michael@0 | 143 | length=r->bLen; |
michael@0 | 144 | } |
michael@0 | 145 | } else { |
michael@0 | 146 | /* compare lengths first */ |
michael@0 | 147 | result=l->bLen-r->bLen; |
michael@0 | 148 | if(result!=0) { |
michael@0 | 149 | return result; |
michael@0 | 150 | } else { |
michael@0 | 151 | length=l->bLen; |
michael@0 | 152 | } |
michael@0 | 153 | } |
michael@0 | 154 | |
michael@0 | 155 | /* get pointers to the byte sequences */ |
michael@0 | 156 | lb=UCM_GET_BYTES(lTable, l); |
michael@0 | 157 | rb=UCM_GET_BYTES(rTable, r); |
michael@0 | 158 | |
michael@0 | 159 | /* compare the bytes */ |
michael@0 | 160 | for(i=0; i<length; ++i) { |
michael@0 | 161 | result=lb[i]-rb[i]; |
michael@0 | 162 | if(result!=0) { |
michael@0 | 163 | return result; |
michael@0 | 164 | } |
michael@0 | 165 | } |
michael@0 | 166 | |
michael@0 | 167 | /* compare the lengths */ |
michael@0 | 168 | return l->bLen-r->bLen; |
michael@0 | 169 | } |
michael@0 | 170 | |
michael@0 | 171 | /* compare UCMappings for sorting */ |
michael@0 | 172 | static int32_t |
michael@0 | 173 | compareMappings(UCMTable *lTable, const UCMapping *l, |
michael@0 | 174 | UCMTable *rTable, const UCMapping *r, |
michael@0 | 175 | UBool uFirst) { |
michael@0 | 176 | int32_t result; |
michael@0 | 177 | |
michael@0 | 178 | /* choose which side to compare first */ |
michael@0 | 179 | if(uFirst) { |
michael@0 | 180 | /* Unicode then bytes */ |
michael@0 | 181 | result=compareUnicode(lTable, l, rTable, r); |
michael@0 | 182 | if(result==0) { |
michael@0 | 183 | result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */ |
michael@0 | 184 | } |
michael@0 | 185 | } else { |
michael@0 | 186 | /* bytes then Unicode */ |
michael@0 | 187 | result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */ |
michael@0 | 188 | if(result==0) { |
michael@0 | 189 | result=compareUnicode(lTable, l, rTable, r); |
michael@0 | 190 | } |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | if(result!=0) { |
michael@0 | 194 | return result; |
michael@0 | 195 | } |
michael@0 | 196 | |
michael@0 | 197 | /* compare the flags */ |
michael@0 | 198 | return l->f-r->f; |
michael@0 | 199 | } |
michael@0 | 200 | |
michael@0 | 201 | /* sorting by Unicode first sorts mappings directly */ |
michael@0 | 202 | static int32_t |
michael@0 | 203 | compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { |
michael@0 | 204 | return compareMappings( |
michael@0 | 205 | (UCMTable *)context, (const UCMapping *)left, |
michael@0 | 206 | (UCMTable *)context, (const UCMapping *)right, TRUE); |
michael@0 | 207 | } |
michael@0 | 208 | |
michael@0 | 209 | /* sorting by bytes first sorts the reverseMap; use indirection to mappings */ |
michael@0 | 210 | static int32_t |
michael@0 | 211 | compareMappingsBytesFirst(const void *context, const void *left, const void *right) { |
michael@0 | 212 | UCMTable *table=(UCMTable *)context; |
michael@0 | 213 | int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; |
michael@0 | 214 | return compareMappings( |
michael@0 | 215 | table, table->mappings+l, |
michael@0 | 216 | table, table->mappings+r, FALSE); |
michael@0 | 217 | } |
michael@0 | 218 | |
michael@0 | 219 | U_CAPI void U_EXPORT2 |
michael@0 | 220 | ucm_sortTable(UCMTable *t) { |
michael@0 | 221 | UErrorCode errorCode; |
michael@0 | 222 | int32_t i; |
michael@0 | 223 | |
michael@0 | 224 | if(t->isSorted) { |
michael@0 | 225 | return; |
michael@0 | 226 | } |
michael@0 | 227 | |
michael@0 | 228 | errorCode=U_ZERO_ERROR; |
michael@0 | 229 | |
michael@0 | 230 | /* 1. sort by Unicode first */ |
michael@0 | 231 | uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), |
michael@0 | 232 | compareMappingsUnicodeFirst, t, |
michael@0 | 233 | FALSE, &errorCode); |
michael@0 | 234 | |
michael@0 | 235 | /* build the reverseMap */ |
michael@0 | 236 | if(t->reverseMap==NULL) { |
michael@0 | 237 | /* |
michael@0 | 238 | * allocate mappingsCapacity instead of mappingsLength so that |
michael@0 | 239 | * if mappings are added, the reverseMap need not be |
michael@0 | 240 | * reallocated each time |
michael@0 | 241 | * (see ucm_moveMappings() and ucm_addMapping()) |
michael@0 | 242 | */ |
michael@0 | 243 | t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); |
michael@0 | 244 | if(t->reverseMap==NULL) { |
michael@0 | 245 | fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); |
michael@0 | 246 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 247 | } |
michael@0 | 248 | } |
michael@0 | 249 | for(i=0; i<t->mappingsLength; ++i) { |
michael@0 | 250 | t->reverseMap[i]=i; |
michael@0 | 251 | } |
michael@0 | 252 | |
michael@0 | 253 | /* 2. sort reverseMap by mappings bytes first */ |
michael@0 | 254 | uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), |
michael@0 | 255 | compareMappingsBytesFirst, t, |
michael@0 | 256 | FALSE, &errorCode); |
michael@0 | 257 | |
michael@0 | 258 | if(U_FAILURE(errorCode)) { |
michael@0 | 259 | fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", |
michael@0 | 260 | u_errorName(errorCode)); |
michael@0 | 261 | exit(errorCode); |
michael@0 | 262 | } |
michael@0 | 263 | |
michael@0 | 264 | t->isSorted=TRUE; |
michael@0 | 265 | } |
michael@0 | 266 | |
michael@0 | 267 | /* |
michael@0 | 268 | * remove mappings with their move flag set from the base table |
michael@0 | 269 | * and move some of them (with UCM_MOVE_TO_EXT) to the extension table |
michael@0 | 270 | */ |
michael@0 | 271 | U_CAPI void U_EXPORT2 |
michael@0 | 272 | ucm_moveMappings(UCMTable *base, UCMTable *ext) { |
michael@0 | 273 | UCMapping *mb, *mbLimit; |
michael@0 | 274 | int8_t flag; |
michael@0 | 275 | |
michael@0 | 276 | mb=base->mappings; |
michael@0 | 277 | mbLimit=mb+base->mappingsLength; |
michael@0 | 278 | |
michael@0 | 279 | while(mb<mbLimit) { |
michael@0 | 280 | flag=mb->moveFlag; |
michael@0 | 281 | if(flag!=0) { |
michael@0 | 282 | /* reset the move flag */ |
michael@0 | 283 | mb->moveFlag=0; |
michael@0 | 284 | |
michael@0 | 285 | if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) { |
michael@0 | 286 | /* add the mapping to the extension table */ |
michael@0 | 287 | ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); |
michael@0 | 288 | } |
michael@0 | 289 | |
michael@0 | 290 | /* remove this mapping: move the last base mapping down and overwrite the current one */ |
michael@0 | 291 | if(mb<(mbLimit-1)) { |
michael@0 | 292 | uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); |
michael@0 | 293 | } |
michael@0 | 294 | --mbLimit; |
michael@0 | 295 | --base->mappingsLength; |
michael@0 | 296 | base->isSorted=FALSE; |
michael@0 | 297 | } else { |
michael@0 | 298 | ++mb; |
michael@0 | 299 | } |
michael@0 | 300 | } |
michael@0 | 301 | } |
michael@0 | 302 | |
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    /* at least one mapping had its moveFlag set */
    NEEDS_MOVE=1,
    /* at least one conflict was reported */
    HAS_ERRORS=2
};
michael@0 | 307 | |
michael@0 | 308 | static uint8_t |
michael@0 | 309 | checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, |
michael@0 | 310 | UBool moveToExt, UBool intersectBase) { |
michael@0 | 311 | UCMapping *mb, *me, *mbLimit, *meLimit; |
michael@0 | 312 | int32_t cmp; |
michael@0 | 313 | uint8_t result; |
michael@0 | 314 | |
michael@0 | 315 | mb=base->mappings; |
michael@0 | 316 | mbLimit=mb+base->mappingsLength; |
michael@0 | 317 | |
michael@0 | 318 | me=ext->mappings; |
michael@0 | 319 | meLimit=me+ext->mappingsLength; |
michael@0 | 320 | |
michael@0 | 321 | result=0; |
michael@0 | 322 | |
michael@0 | 323 | for(;;) { |
michael@0 | 324 | /* skip irrelevant mappings on both sides */ |
michael@0 | 325 | for(;;) { |
michael@0 | 326 | if(mb==mbLimit) { |
michael@0 | 327 | return result; |
michael@0 | 328 | } |
michael@0 | 329 | |
michael@0 | 330 | if((0<=mb->f && mb->f<=2) || mb->f==4) { |
michael@0 | 331 | break; |
michael@0 | 332 | } |
michael@0 | 333 | |
michael@0 | 334 | ++mb; |
michael@0 | 335 | } |
michael@0 | 336 | |
michael@0 | 337 | for(;;) { |
michael@0 | 338 | if(me==meLimit) { |
michael@0 | 339 | return result; |
michael@0 | 340 | } |
michael@0 | 341 | |
michael@0 | 342 | if((0<=me->f && me->f<=2) || me->f==4) { |
michael@0 | 343 | break; |
michael@0 | 344 | } |
michael@0 | 345 | |
michael@0 | 346 | ++me; |
michael@0 | 347 | } |
michael@0 | 348 | |
michael@0 | 349 | /* compare the base and extension mappings */ |
michael@0 | 350 | cmp=compareUnicode(base, mb, ext, me); |
michael@0 | 351 | if(cmp<0) { |
michael@0 | 352 | if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { |
michael@0 | 353 | /* |
michael@0 | 354 | * mapping in base but not in ext, move it |
michael@0 | 355 | * |
michael@0 | 356 | * if ext is DBCS, move DBCS mappings here |
michael@0 | 357 | * and check SBCS ones for Unicode prefix below |
michael@0 | 358 | */ |
michael@0 | 359 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
michael@0 | 360 | result|=NEEDS_MOVE; |
michael@0 | 361 | |
michael@0 | 362 | /* does mb map from an input sequence that is a prefix of me's? */ |
michael@0 | 363 | } else if( mb->uLen<me->uLen && |
michael@0 | 364 | 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) |
michael@0 | 365 | ) { |
michael@0 | 366 | if(moveToExt) { |
michael@0 | 367 | /* mark this mapping to be moved to the extension table */ |
michael@0 | 368 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
michael@0 | 369 | result|=NEEDS_MOVE; |
michael@0 | 370 | } else { |
michael@0 | 371 | fprintf(stderr, |
michael@0 | 372 | "ucm error: the base table contains a mapping whose input sequence\n" |
michael@0 | 373 | " is a prefix of the input sequence of an extension mapping\n"); |
michael@0 | 374 | ucm_printMapping(base, mb, stderr); |
michael@0 | 375 | ucm_printMapping(ext, me, stderr); |
michael@0 | 376 | result|=HAS_ERRORS; |
michael@0 | 377 | } |
michael@0 | 378 | } |
michael@0 | 379 | |
michael@0 | 380 | ++mb; |
michael@0 | 381 | } else if(cmp==0) { |
michael@0 | 382 | /* |
michael@0 | 383 | * same output: remove the extension mapping, |
michael@0 | 384 | * otherwise treat as an error |
michael@0 | 385 | */ |
michael@0 | 386 | if( mb->f==me->f && mb->bLen==me->bLen && |
michael@0 | 387 | 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) |
michael@0 | 388 | ) { |
michael@0 | 389 | me->moveFlag|=UCM_REMOVE_MAPPING; |
michael@0 | 390 | result|=NEEDS_MOVE; |
michael@0 | 391 | } else if(intersectBase) { |
michael@0 | 392 | /* mapping in base but not in ext, move it */ |
michael@0 | 393 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
michael@0 | 394 | result|=NEEDS_MOVE; |
michael@0 | 395 | } else { |
michael@0 | 396 | fprintf(stderr, |
michael@0 | 397 | "ucm error: the base table contains a mapping whose input sequence\n" |
michael@0 | 398 | " is the same as the input sequence of an extension mapping\n" |
michael@0 | 399 | " but it maps differently\n"); |
michael@0 | 400 | ucm_printMapping(base, mb, stderr); |
michael@0 | 401 | ucm_printMapping(ext, me, stderr); |
michael@0 | 402 | result|=HAS_ERRORS; |
michael@0 | 403 | } |
michael@0 | 404 | |
michael@0 | 405 | ++mb; |
michael@0 | 406 | } else /* cmp>0 */ { |
michael@0 | 407 | ++me; |
michael@0 | 408 | } |
michael@0 | 409 | } |
michael@0 | 410 | } |
michael@0 | 411 | |
michael@0 | 412 | static uint8_t |
michael@0 | 413 | checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, |
michael@0 | 414 | UBool moveToExt, UBool intersectBase) { |
michael@0 | 415 | UCMapping *mb, *me; |
michael@0 | 416 | int32_t *baseMap, *extMap; |
michael@0 | 417 | int32_t b, e, bLimit, eLimit, cmp; |
michael@0 | 418 | uint8_t result; |
michael@0 | 419 | UBool isSISO; |
michael@0 | 420 | |
michael@0 | 421 | baseMap=base->reverseMap; |
michael@0 | 422 | extMap=ext->reverseMap; |
michael@0 | 423 | |
michael@0 | 424 | b=e=0; |
michael@0 | 425 | bLimit=base->mappingsLength; |
michael@0 | 426 | eLimit=ext->mappingsLength; |
michael@0 | 427 | |
michael@0 | 428 | result=0; |
michael@0 | 429 | |
michael@0 | 430 | isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); |
michael@0 | 431 | |
michael@0 | 432 | for(;;) { |
michael@0 | 433 | /* skip irrelevant mappings on both sides */ |
michael@0 | 434 | for(;; ++b) { |
michael@0 | 435 | if(b==bLimit) { |
michael@0 | 436 | return result; |
michael@0 | 437 | } |
michael@0 | 438 | mb=base->mappings+baseMap[b]; |
michael@0 | 439 | |
michael@0 | 440 | if(intersectBase==2 && mb->bLen==1) { |
michael@0 | 441 | /* |
michael@0 | 442 | * comparing a base against a DBCS extension: |
michael@0 | 443 | * leave SBCS base mappings alone |
michael@0 | 444 | */ |
michael@0 | 445 | continue; |
michael@0 | 446 | } |
michael@0 | 447 | |
michael@0 | 448 | if(mb->f==0 || mb->f==3) { |
michael@0 | 449 | break; |
michael@0 | 450 | } |
michael@0 | 451 | } |
michael@0 | 452 | |
michael@0 | 453 | for(;;) { |
michael@0 | 454 | if(e==eLimit) { |
michael@0 | 455 | return result; |
michael@0 | 456 | } |
michael@0 | 457 | me=ext->mappings+extMap[e]; |
michael@0 | 458 | |
michael@0 | 459 | if(me->f==0 || me->f==3) { |
michael@0 | 460 | break; |
michael@0 | 461 | } |
michael@0 | 462 | |
michael@0 | 463 | ++e; |
michael@0 | 464 | } |
michael@0 | 465 | |
michael@0 | 466 | /* compare the base and extension mappings */ |
michael@0 | 467 | cmp=compareBytes(base, mb, ext, me, TRUE); |
michael@0 | 468 | if(cmp<0) { |
michael@0 | 469 | if(intersectBase) { |
michael@0 | 470 | /* mapping in base but not in ext, move it */ |
michael@0 | 471 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
michael@0 | 472 | result|=NEEDS_MOVE; |
michael@0 | 473 | |
michael@0 | 474 | /* |
michael@0 | 475 | * does mb map from an input sequence that is a prefix of me's? |
michael@0 | 476 | * for SI/SO tables, a single byte is never a prefix because it |
michael@0 | 477 | * occurs in a separate single-byte state |
michael@0 | 478 | */ |
michael@0 | 479 | } else if( mb->bLen<me->bLen && |
michael@0 | 480 | (!isSISO || mb->bLen>1) && |
michael@0 | 481 | 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) |
michael@0 | 482 | ) { |
michael@0 | 483 | if(moveToExt) { |
michael@0 | 484 | /* mark this mapping to be moved to the extension table */ |
michael@0 | 485 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
michael@0 | 486 | result|=NEEDS_MOVE; |
michael@0 | 487 | } else { |
michael@0 | 488 | fprintf(stderr, |
michael@0 | 489 | "ucm error: the base table contains a mapping whose input sequence\n" |
michael@0 | 490 | " is a prefix of the input sequence of an extension mapping\n"); |
michael@0 | 491 | ucm_printMapping(base, mb, stderr); |
michael@0 | 492 | ucm_printMapping(ext, me, stderr); |
michael@0 | 493 | result|=HAS_ERRORS; |
michael@0 | 494 | } |
michael@0 | 495 | } |
michael@0 | 496 | |
michael@0 | 497 | ++b; |
michael@0 | 498 | } else if(cmp==0) { |
michael@0 | 499 | /* |
michael@0 | 500 | * same output: remove the extension mapping, |
michael@0 | 501 | * otherwise treat as an error |
michael@0 | 502 | */ |
michael@0 | 503 | if( mb->f==me->f && mb->uLen==me->uLen && |
michael@0 | 504 | 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) |
michael@0 | 505 | ) { |
michael@0 | 506 | me->moveFlag|=UCM_REMOVE_MAPPING; |
michael@0 | 507 | result|=NEEDS_MOVE; |
michael@0 | 508 | } else if(intersectBase) { |
michael@0 | 509 | /* mapping in base but not in ext, move it */ |
michael@0 | 510 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
michael@0 | 511 | result|=NEEDS_MOVE; |
michael@0 | 512 | } else { |
michael@0 | 513 | fprintf(stderr, |
michael@0 | 514 | "ucm error: the base table contains a mapping whose input sequence\n" |
michael@0 | 515 | " is the same as the input sequence of an extension mapping\n" |
michael@0 | 516 | " but it maps differently\n"); |
michael@0 | 517 | ucm_printMapping(base, mb, stderr); |
michael@0 | 518 | ucm_printMapping(ext, me, stderr); |
michael@0 | 519 | result|=HAS_ERRORS; |
michael@0 | 520 | } |
michael@0 | 521 | |
michael@0 | 522 | ++b; |
michael@0 | 523 | } else /* cmp>0 */ { |
michael@0 | 524 | ++e; |
michael@0 | 525 | } |
michael@0 | 526 | } |
michael@0 | 527 | } |
michael@0 | 528 | |
michael@0 | 529 | U_CAPI UBool U_EXPORT2 |
michael@0 | 530 | ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { |
michael@0 | 531 | UCMapping *m, *mLimit; |
michael@0 | 532 | int32_t count; |
michael@0 | 533 | UBool isOK; |
michael@0 | 534 | |
michael@0 | 535 | m=table->mappings; |
michael@0 | 536 | mLimit=m+table->mappingsLength; |
michael@0 | 537 | isOK=TRUE; |
michael@0 | 538 | |
michael@0 | 539 | while(m<mLimit) { |
michael@0 | 540 | count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); |
michael@0 | 541 | if(count<1) { |
michael@0 | 542 | ucm_printMapping(table, m, stderr); |
michael@0 | 543 | isOK=FALSE; |
michael@0 | 544 | } |
michael@0 | 545 | ++m; |
michael@0 | 546 | } |
michael@0 | 547 | |
michael@0 | 548 | return isOK; |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | U_CAPI UBool U_EXPORT2 |
michael@0 | 552 | ucm_checkBaseExt(UCMStates *baseStates, |
michael@0 | 553 | UCMTable *base, UCMTable *ext, UCMTable *moveTarget, |
michael@0 | 554 | UBool intersectBase) { |
michael@0 | 555 | uint8_t result; |
michael@0 | 556 | |
michael@0 | 557 | /* if we have an extension table, we must always use precision flags */ |
michael@0 | 558 | if(base->flagsType&UCM_FLAGS_IMPLICIT) { |
michael@0 | 559 | fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); |
michael@0 | 560 | return FALSE; |
michael@0 | 561 | } |
michael@0 | 562 | if(ext->flagsType&UCM_FLAGS_IMPLICIT) { |
michael@0 | 563 | fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); |
michael@0 | 564 | return FALSE; |
michael@0 | 565 | } |
michael@0 | 566 | |
michael@0 | 567 | /* checking requires both tables to be sorted */ |
michael@0 | 568 | ucm_sortTable(base); |
michael@0 | 569 | ucm_sortTable(ext); |
michael@0 | 570 | |
michael@0 | 571 | /* check */ |
michael@0 | 572 | result= |
michael@0 | 573 | checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)| |
michael@0 | 574 | checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase); |
michael@0 | 575 | |
michael@0 | 576 | if(result&HAS_ERRORS) { |
michael@0 | 577 | return FALSE; |
michael@0 | 578 | } |
michael@0 | 579 | |
michael@0 | 580 | if(result&NEEDS_MOVE) { |
michael@0 | 581 | ucm_moveMappings(ext, NULL); |
michael@0 | 582 | ucm_moveMappings(base, moveTarget); |
michael@0 | 583 | ucm_sortTable(base); |
michael@0 | 584 | ucm_sortTable(ext); |
michael@0 | 585 | if(moveTarget!=NULL) { |
michael@0 | 586 | ucm_sortTable(moveTarget); |
michael@0 | 587 | } |
michael@0 | 588 | } |
michael@0 | 589 | |
michael@0 | 590 | return TRUE; |
michael@0 | 591 | } |
michael@0 | 592 | |
michael@0 | 593 | /* merge tables for rptp2ucm ------------------------------------------------ */ |
michael@0 | 594 | |
michael@0 | 595 | U_CAPI void U_EXPORT2 |
michael@0 | 596 | ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, |
michael@0 | 597 | const uint8_t *subchar, int32_t subcharLength, |
michael@0 | 598 | uint8_t subchar1) { |
michael@0 | 599 | UCMapping *fromUMapping, *toUMapping; |
michael@0 | 600 | int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; |
michael@0 | 601 | |
michael@0 | 602 | ucm_sortTable(fromUTable); |
michael@0 | 603 | ucm_sortTable(toUTable); |
michael@0 | 604 | |
michael@0 | 605 | fromUMapping=fromUTable->mappings; |
michael@0 | 606 | toUMapping=toUTable->mappings; |
michael@0 | 607 | |
michael@0 | 608 | fromUTop=fromUTable->mappingsLength; |
michael@0 | 609 | toUTop=toUTable->mappingsLength; |
michael@0 | 610 | |
michael@0 | 611 | fromUIndex=toUIndex=0; |
michael@0 | 612 | |
michael@0 | 613 | while(fromUIndex<fromUTop && toUIndex<toUTop) { |
michael@0 | 614 | cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE); |
michael@0 | 615 | if(cmp==0) { |
michael@0 | 616 | /* equal: roundtrip, nothing to do (flags are initially 0) */ |
michael@0 | 617 | ++fromUMapping; |
michael@0 | 618 | ++toUMapping; |
michael@0 | 619 | |
michael@0 | 620 | ++fromUIndex; |
michael@0 | 621 | ++toUIndex; |
michael@0 | 622 | } else if(cmp<0) { |
michael@0 | 623 | /* |
michael@0 | 624 | * the fromU mapping does not have a toU counterpart: |
michael@0 | 625 | * fallback Unicode->codepage |
michael@0 | 626 | */ |
michael@0 | 627 | if( (fromUMapping->bLen==subcharLength && |
michael@0 | 628 | 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || |
michael@0 | 629 | (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) |
michael@0 | 630 | ) { |
michael@0 | 631 | fromUMapping->f=2; /* SUB mapping */ |
michael@0 | 632 | } else { |
michael@0 | 633 | fromUMapping->f=1; /* normal fallback */ |
michael@0 | 634 | } |
michael@0 | 635 | |
michael@0 | 636 | ++fromUMapping; |
michael@0 | 637 | ++fromUIndex; |
michael@0 | 638 | } else { |
michael@0 | 639 | /* |
michael@0 | 640 | * the toU mapping does not have a fromU counterpart: |
michael@0 | 641 | * (reverse) fallback codepage->Unicode, copy it to the fromU table |
michael@0 | 642 | */ |
michael@0 | 643 | |
michael@0 | 644 | /* ignore reverse fallbacks to Unicode SUB */ |
michael@0 | 645 | if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { |
michael@0 | 646 | toUMapping->f=3; /* reverse fallback */ |
michael@0 | 647 | ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); |
michael@0 | 648 | |
michael@0 | 649 | /* the table may have been reallocated */ |
michael@0 | 650 | fromUMapping=fromUTable->mappings+fromUIndex; |
michael@0 | 651 | } |
michael@0 | 652 | |
michael@0 | 653 | ++toUMapping; |
michael@0 | 654 | ++toUIndex; |
michael@0 | 655 | } |
michael@0 | 656 | } |
michael@0 | 657 | |
michael@0 | 658 | /* either one or both tables are exhausted */ |
michael@0 | 659 | while(fromUIndex<fromUTop) { |
michael@0 | 660 | /* leftover fromU mappings are fallbacks */ |
michael@0 | 661 | if( (fromUMapping->bLen==subcharLength && |
michael@0 | 662 | 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || |
michael@0 | 663 | (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) |
michael@0 | 664 | ) { |
michael@0 | 665 | fromUMapping->f=2; /* SUB mapping */ |
michael@0 | 666 | } else { |
michael@0 | 667 | fromUMapping->f=1; /* normal fallback */ |
michael@0 | 668 | } |
michael@0 | 669 | |
michael@0 | 670 | ++fromUMapping; |
michael@0 | 671 | ++fromUIndex; |
michael@0 | 672 | } |
michael@0 | 673 | |
michael@0 | 674 | while(toUIndex<toUTop) { |
michael@0 | 675 | /* leftover toU mappings are reverse fallbacks */ |
michael@0 | 676 | |
michael@0 | 677 | /* ignore reverse fallbacks to Unicode SUB */ |
michael@0 | 678 | if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { |
michael@0 | 679 | toUMapping->f=3; /* reverse fallback */ |
michael@0 | 680 | ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); |
michael@0 | 681 | } |
michael@0 | 682 | |
michael@0 | 683 | ++toUMapping; |
michael@0 | 684 | ++toUIndex; |
michael@0 | 685 | } |
michael@0 | 686 | |
michael@0 | 687 | fromUTable->isSorted=FALSE; |
michael@0 | 688 | } |
michael@0 | 689 | |
michael@0 | 690 | /* separate extension mappings out of base table for rptp2ucm --------------- */ |
michael@0 | 691 | |
michael@0 | 692 | U_CAPI UBool U_EXPORT2 |
michael@0 | 693 | ucm_separateMappings(UCMFile *ucm, UBool isSISO) { |
michael@0 | 694 | UCMTable *table; |
michael@0 | 695 | UCMapping *m, *mLimit; |
michael@0 | 696 | int32_t type; |
michael@0 | 697 | UBool needsMove, isOK; |
michael@0 | 698 | |
michael@0 | 699 | table=ucm->base; |
michael@0 | 700 | m=table->mappings; |
michael@0 | 701 | mLimit=m+table->mappingsLength; |
michael@0 | 702 | |
michael@0 | 703 | needsMove=FALSE; |
michael@0 | 704 | isOK=TRUE; |
michael@0 | 705 | |
michael@0 | 706 | for(; m<mLimit; ++m) { |
michael@0 | 707 | if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { |
michael@0 | 708 | fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); |
michael@0 | 709 | ucm_printMapping(table, m, stderr); |
michael@0 | 710 | m->moveFlag|=UCM_REMOVE_MAPPING; |
michael@0 | 711 | needsMove=TRUE; |
michael@0 | 712 | continue; |
michael@0 | 713 | } |
michael@0 | 714 | |
michael@0 | 715 | type=ucm_mappingType( |
michael@0 | 716 | &ucm->states, m, |
michael@0 | 717 | UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); |
michael@0 | 718 | if(type<0) { |
michael@0 | 719 | /* illegal byte sequence */ |
michael@0 | 720 | printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); |
michael@0 | 721 | isOK=FALSE; |
michael@0 | 722 | } else if(type>0) { |
michael@0 | 723 | m->moveFlag|=UCM_MOVE_TO_EXT; |
michael@0 | 724 | needsMove=TRUE; |
michael@0 | 725 | } |
michael@0 | 726 | } |
michael@0 | 727 | |
michael@0 | 728 | if(!isOK) { |
michael@0 | 729 | return FALSE; |
michael@0 | 730 | } |
michael@0 | 731 | if(needsMove) { |
michael@0 | 732 | ucm_moveMappings(ucm->base, ucm->ext); |
michael@0 | 733 | return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE); |
michael@0 | 734 | } else { |
michael@0 | 735 | ucm_sortTable(ucm->base); |
michael@0 | 736 | return TRUE; |
michael@0 | 737 | } |
michael@0 | 738 | } |
michael@0 | 739 | |
michael@0 | 740 | /* ucm parser --------------------------------------------------------------- */ |
michael@0 | 741 | |
michael@0 | 742 | U_CAPI int8_t U_EXPORT2 |
michael@0 | 743 | ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { |
michael@0 | 744 | const char *s=*ps; |
michael@0 | 745 | char *end; |
michael@0 | 746 | uint8_t byte; |
michael@0 | 747 | int8_t bLen; |
michael@0 | 748 | |
michael@0 | 749 | bLen=0; |
michael@0 | 750 | for(;;) { |
michael@0 | 751 | /* skip an optional plus sign */ |
michael@0 | 752 | if(bLen>0 && *s=='+') { |
michael@0 | 753 | ++s; |
michael@0 | 754 | } |
michael@0 | 755 | if(*s!='\\') { |
michael@0 | 756 | break; |
michael@0 | 757 | } |
michael@0 | 758 | |
michael@0 | 759 | if( s[1]!='x' || |
michael@0 | 760 | (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 |
michael@0 | 761 | ) { |
michael@0 | 762 | fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); |
michael@0 | 763 | return -1; |
michael@0 | 764 | } |
michael@0 | 765 | |
michael@0 | 766 | if(bLen==UCNV_EXT_MAX_BYTES) { |
michael@0 | 767 | fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); |
michael@0 | 768 | return -1; |
michael@0 | 769 | } |
michael@0 | 770 | bytes[bLen++]=byte; |
michael@0 | 771 | s=end; |
michael@0 | 772 | } |
michael@0 | 773 | |
michael@0 | 774 | *ps=s; |
michael@0 | 775 | return bLen; |
michael@0 | 776 | } |
michael@0 | 777 | |
michael@0 | 778 | /* parse a mapping line; must not be empty */ |
michael@0 | 779 | U_CAPI UBool U_EXPORT2 |
michael@0 | 780 | ucm_parseMappingLine(UCMapping *m, |
michael@0 | 781 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 782 | uint8_t bytes[UCNV_EXT_MAX_BYTES], |
michael@0 | 783 | const char *line) { |
michael@0 | 784 | const char *s; |
michael@0 | 785 | char *end; |
michael@0 | 786 | UChar32 cp; |
michael@0 | 787 | int32_t u16Length; |
michael@0 | 788 | int8_t uLen, bLen, f; |
michael@0 | 789 | |
michael@0 | 790 | s=line; |
michael@0 | 791 | uLen=bLen=0; |
michael@0 | 792 | |
michael@0 | 793 | /* parse code points */ |
michael@0 | 794 | for(;;) { |
michael@0 | 795 | /* skip an optional plus sign */ |
michael@0 | 796 | if(uLen>0 && *s=='+') { |
michael@0 | 797 | ++s; |
michael@0 | 798 | } |
michael@0 | 799 | if(*s!='<') { |
michael@0 | 800 | break; |
michael@0 | 801 | } |
michael@0 | 802 | |
michael@0 | 803 | if( s[1]!='U' || |
michael@0 | 804 | (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || |
michael@0 | 805 | *end!='>' |
michael@0 | 806 | ) { |
michael@0 | 807 | fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); |
michael@0 | 808 | return FALSE; |
michael@0 | 809 | } |
michael@0 | 810 | if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { |
michael@0 | 811 | fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); |
michael@0 | 812 | return FALSE; |
michael@0 | 813 | } |
michael@0 | 814 | |
michael@0 | 815 | if(uLen==UCNV_EXT_MAX_UCHARS) { |
michael@0 | 816 | fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); |
michael@0 | 817 | return FALSE; |
michael@0 | 818 | } |
michael@0 | 819 | codePoints[uLen++]=cp; |
michael@0 | 820 | s=end+1; |
michael@0 | 821 | } |
michael@0 | 822 | |
michael@0 | 823 | if(uLen==0) { |
michael@0 | 824 | fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); |
michael@0 | 825 | return FALSE; |
michael@0 | 826 | } else if(uLen==1) { |
michael@0 | 827 | m->u=codePoints[0]; |
michael@0 | 828 | } else { |
michael@0 | 829 | UErrorCode errorCode=U_ZERO_ERROR; |
michael@0 | 830 | u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); |
michael@0 | 831 | if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || |
michael@0 | 832 | u16Length>UCNV_EXT_MAX_UCHARS |
michael@0 | 833 | ) { |
michael@0 | 834 | fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); |
michael@0 | 835 | return FALSE; |
michael@0 | 836 | } |
michael@0 | 837 | } |
michael@0 | 838 | |
michael@0 | 839 | s=u_skipWhitespace(s); |
michael@0 | 840 | |
michael@0 | 841 | /* parse bytes */ |
michael@0 | 842 | bLen=ucm_parseBytes(bytes, line, &s); |
michael@0 | 843 | |
michael@0 | 844 | if(bLen<0) { |
michael@0 | 845 | return FALSE; |
michael@0 | 846 | } else if(bLen==0) { |
michael@0 | 847 | fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); |
michael@0 | 848 | return FALSE; |
michael@0 | 849 | } else if(bLen<=4) { |
michael@0 | 850 | uprv_memcpy(m->b.bytes, bytes, bLen); |
michael@0 | 851 | } |
michael@0 | 852 | |
michael@0 | 853 | /* skip everything until the fallback indicator, even the start of a comment */ |
michael@0 | 854 | for(;;) { |
michael@0 | 855 | if(*s==0) { |
michael@0 | 856 | f=-1; /* no fallback indicator */ |
michael@0 | 857 | break; |
michael@0 | 858 | } else if(*s=='|') { |
michael@0 | 859 | f=(int8_t)(s[1]-'0'); |
michael@0 | 860 | if((uint8_t)f>4) { |
michael@0 | 861 | fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); |
michael@0 | 862 | return FALSE; |
michael@0 | 863 | } |
michael@0 | 864 | break; |
michael@0 | 865 | } |
michael@0 | 866 | ++s; |
michael@0 | 867 | } |
michael@0 | 868 | |
michael@0 | 869 | m->uLen=uLen; |
michael@0 | 870 | m->bLen=bLen; |
michael@0 | 871 | m->f=f; |
michael@0 | 872 | return TRUE; |
michael@0 | 873 | } |
michael@0 | 874 | |
michael@0 | 875 | /* general APIs ------------------------------------------------------------- */ |
michael@0 | 876 | |
michael@0 | 877 | U_CAPI UCMTable * U_EXPORT2 |
michael@0 | 878 | ucm_openTable() { |
michael@0 | 879 | UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); |
michael@0 | 880 | if(table==NULL) { |
michael@0 | 881 | fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); |
michael@0 | 882 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 883 | } |
michael@0 | 884 | |
michael@0 | 885 | memset(table, 0, sizeof(UCMTable)); |
michael@0 | 886 | return table; |
michael@0 | 887 | } |
michael@0 | 888 | |
michael@0 | 889 | U_CAPI void U_EXPORT2 |
michael@0 | 890 | ucm_closeTable(UCMTable *table) { |
michael@0 | 891 | if(table!=NULL) { |
michael@0 | 892 | uprv_free(table->mappings); |
michael@0 | 893 | uprv_free(table->codePoints); |
michael@0 | 894 | uprv_free(table->bytes); |
michael@0 | 895 | uprv_free(table->reverseMap); |
michael@0 | 896 | uprv_free(table); |
michael@0 | 897 | } |
michael@0 | 898 | } |
michael@0 | 899 | |
michael@0 | 900 | U_CAPI void U_EXPORT2 |
michael@0 | 901 | ucm_resetTable(UCMTable *table) { |
michael@0 | 902 | if(table!=NULL) { |
michael@0 | 903 | table->mappingsLength=0; |
michael@0 | 904 | table->flagsType=0; |
michael@0 | 905 | table->unicodeMask=0; |
michael@0 | 906 | table->bytesLength=table->codePointsLength=0; |
michael@0 | 907 | table->isSorted=FALSE; |
michael@0 | 908 | } |
michael@0 | 909 | } |
michael@0 | 910 | |
michael@0 | 911 | U_CAPI void U_EXPORT2 |
michael@0 | 912 | ucm_addMapping(UCMTable *table, |
michael@0 | 913 | UCMapping *m, |
michael@0 | 914 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 915 | uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
michael@0 | 916 | UCMapping *tm; |
michael@0 | 917 | UChar32 c; |
michael@0 | 918 | int32_t idx; |
michael@0 | 919 | |
michael@0 | 920 | if(table->mappingsLength>=table->mappingsCapacity) { |
michael@0 | 921 | /* make the mappings array larger */ |
michael@0 | 922 | if(table->mappingsCapacity==0) { |
michael@0 | 923 | table->mappingsCapacity=1000; |
michael@0 | 924 | } else { |
michael@0 | 925 | table->mappingsCapacity*=10; |
michael@0 | 926 | } |
michael@0 | 927 | table->mappings=(UCMapping *)uprv_realloc(table->mappings, |
michael@0 | 928 | table->mappingsCapacity*sizeof(UCMapping)); |
michael@0 | 929 | if(table->mappings==NULL) { |
michael@0 | 930 | fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", |
michael@0 | 931 | (int)table->mappingsCapacity); |
michael@0 | 932 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 933 | } |
michael@0 | 934 | |
michael@0 | 935 | if(table->reverseMap!=NULL) { |
michael@0 | 936 | /* the reverseMap must be reallocated in a new sort */ |
michael@0 | 937 | uprv_free(table->reverseMap); |
michael@0 | 938 | table->reverseMap=NULL; |
michael@0 | 939 | } |
michael@0 | 940 | } |
michael@0 | 941 | |
michael@0 | 942 | if(m->uLen>1 && table->codePointsCapacity==0) { |
michael@0 | 943 | table->codePointsCapacity=10000; |
michael@0 | 944 | table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); |
michael@0 | 945 | if(table->codePoints==NULL) { |
michael@0 | 946 | fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", |
michael@0 | 947 | (int)table->codePointsCapacity); |
michael@0 | 948 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 949 | } |
michael@0 | 950 | } |
michael@0 | 951 | |
michael@0 | 952 | if(m->bLen>4 && table->bytesCapacity==0) { |
michael@0 | 953 | table->bytesCapacity=10000; |
michael@0 | 954 | table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); |
michael@0 | 955 | if(table->bytes==NULL) { |
michael@0 | 956 | fprintf(stderr, "ucm error: unable to allocate %d bytes\n", |
michael@0 | 957 | (int)table->bytesCapacity); |
michael@0 | 958 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 959 | } |
michael@0 | 960 | } |
michael@0 | 961 | |
michael@0 | 962 | if(m->uLen>1) { |
michael@0 | 963 | idx=table->codePointsLength; |
michael@0 | 964 | table->codePointsLength+=m->uLen; |
michael@0 | 965 | if(table->codePointsLength>table->codePointsCapacity) { |
michael@0 | 966 | fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); |
michael@0 | 967 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 968 | } |
michael@0 | 969 | |
michael@0 | 970 | uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4); |
michael@0 | 971 | m->u=idx; |
michael@0 | 972 | } |
michael@0 | 973 | |
michael@0 | 974 | if(m->bLen>4) { |
michael@0 | 975 | idx=table->bytesLength; |
michael@0 | 976 | table->bytesLength+=m->bLen; |
michael@0 | 977 | if(table->bytesLength>table->bytesCapacity) { |
michael@0 | 978 | fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); |
michael@0 | 979 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 980 | } |
michael@0 | 981 | |
michael@0 | 982 | uprv_memcpy(table->bytes+idx, bytes, m->bLen); |
michael@0 | 983 | m->b.idx=idx; |
michael@0 | 984 | } |
michael@0 | 985 | |
michael@0 | 986 | /* set unicodeMask */ |
michael@0 | 987 | for(idx=0; idx<m->uLen; ++idx) { |
michael@0 | 988 | c=codePoints[idx]; |
michael@0 | 989 | if(c>=0x10000) { |
michael@0 | 990 | table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ |
michael@0 | 991 | } else if(U_IS_SURROGATE(c)) { |
michael@0 | 992 | table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ |
michael@0 | 993 | } |
michael@0 | 994 | } |
michael@0 | 995 | |
michael@0 | 996 | /* set flagsType */ |
michael@0 | 997 | if(m->f<0) { |
michael@0 | 998 | table->flagsType|=UCM_FLAGS_IMPLICIT; |
michael@0 | 999 | } else { |
michael@0 | 1000 | table->flagsType|=UCM_FLAGS_EXPLICIT; |
michael@0 | 1001 | } |
michael@0 | 1002 | |
michael@0 | 1003 | tm=table->mappings+table->mappingsLength++; |
michael@0 | 1004 | uprv_memcpy(tm, m, sizeof(UCMapping)); |
michael@0 | 1005 | |
michael@0 | 1006 | table->isSorted=FALSE; |
michael@0 | 1007 | } |
michael@0 | 1008 | |
michael@0 | 1009 | U_CAPI UCMFile * U_EXPORT2 |
michael@0 | 1010 | ucm_open() { |
michael@0 | 1011 | UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); |
michael@0 | 1012 | if(ucm==NULL) { |
michael@0 | 1013 | fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); |
michael@0 | 1014 | exit(U_MEMORY_ALLOCATION_ERROR); |
michael@0 | 1015 | } |
michael@0 | 1016 | |
michael@0 | 1017 | memset(ucm, 0, sizeof(UCMFile)); |
michael@0 | 1018 | |
michael@0 | 1019 | ucm->base=ucm_openTable(); |
michael@0 | 1020 | ucm->ext=ucm_openTable(); |
michael@0 | 1021 | |
michael@0 | 1022 | ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; |
michael@0 | 1023 | ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; |
michael@0 | 1024 | ucm->states.outputType=-1; |
michael@0 | 1025 | ucm->states.minCharLength=ucm->states.maxCharLength=1; |
michael@0 | 1026 | |
michael@0 | 1027 | return ucm; |
michael@0 | 1028 | } |
michael@0 | 1029 | |
michael@0 | 1030 | U_CAPI void U_EXPORT2 |
michael@0 | 1031 | ucm_close(UCMFile *ucm) { |
michael@0 | 1032 | if(ucm!=NULL) { |
michael@0 | 1033 | ucm_closeTable(ucm->base); |
michael@0 | 1034 | ucm_closeTable(ucm->ext); |
michael@0 | 1035 | uprv_free(ucm); |
michael@0 | 1036 | } |
michael@0 | 1037 | } |
michael@0 | 1038 | |
michael@0 | 1039 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 1040 | ucm_mappingType(UCMStates *baseStates, |
michael@0 | 1041 | UCMapping *m, |
michael@0 | 1042 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 1043 | uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
michael@0 | 1044 | /* check validity of the bytes and count the characters in them */ |
michael@0 | 1045 | int32_t count=ucm_countChars(baseStates, bytes, m->bLen); |
michael@0 | 1046 | if(count<1) { |
michael@0 | 1047 | /* illegal byte sequence */ |
michael@0 | 1048 | return -1; |
michael@0 | 1049 | } |
michael@0 | 1050 | |
michael@0 | 1051 | /* |
michael@0 | 1052 | * Suitable for an ICU conversion base table means: |
michael@0 | 1053 | * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) |
michael@0 | 1054 | * - precision flag 0..3 |
michael@0 | 1055 | * - SBCS: any 1:1 mapping |
michael@0 | 1056 | * (the table stores additional bits to distinguish mapping types) |
michael@0 | 1057 | * - MBCS: not a |2 SUB mapping for <subchar1> |
michael@0 | 1058 | * - MBCS: not a |1 fallback to 0x00 |
michael@0 | 1059 | * - MBCS: not a multi-byte mapping with leading 0x00 bytes |
michael@0 | 1060 | * |
michael@0 | 1061 | * Further restrictions for fromUnicode tables |
michael@0 | 1062 | * are enforced in makeconv (MBCSOkForBaseFromUnicode()). |
michael@0 | 1063 | * |
michael@0 | 1064 | * All of the MBCS fromUnicode specific tests could be removed from here, |
michael@0 | 1065 | * but the ones above are for unusual mappings, and removing the tests |
michael@0 | 1066 | * from here would change canonucm output which seems gratuitous. |
michael@0 | 1067 | * (Markus Scherer 2006-nov-28) |
michael@0 | 1068 | * |
michael@0 | 1069 | * Exception: All implicit mappings (f<0) that need to be moved |
michael@0 | 1070 | * because of fromUnicode restrictions _must_ be moved here because |
michael@0 | 1071 | * makeconv uses a hack for moving mappings only for the fromUnicode table |
michael@0 | 1072 | * that only works with non-negative values of f. |
michael@0 | 1073 | */ |
michael@0 | 1074 | if( m->uLen==1 && count==1 && m->f<=3 && |
michael@0 | 1075 | (baseStates->maxCharLength==1 || |
michael@0 | 1076 | !((m->f==2 && m->bLen==1) || |
michael@0 | 1077 | (m->f==1 && bytes[0]==0) || |
michael@0 | 1078 | (m->f<=1 && m->bLen>1 && bytes[0]==0))) |
michael@0 | 1079 | ) { |
michael@0 | 1080 | return 0; /* suitable for a base table */ |
michael@0 | 1081 | } else { |
michael@0 | 1082 | return 1; /* needs to go into an extension table */ |
michael@0 | 1083 | } |
michael@0 | 1084 | } |
michael@0 | 1085 | |
michael@0 | 1086 | U_CAPI UBool U_EXPORT2 |
michael@0 | 1087 | ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, |
michael@0 | 1088 | UCMapping *m, |
michael@0 | 1089 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
michael@0 | 1090 | uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
michael@0 | 1091 | int32_t type; |
michael@0 | 1092 | |
michael@0 | 1093 | if(m->f==2 && m->uLen>1) { |
michael@0 | 1094 | fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); |
michael@0 | 1095 | printMapping(m, codePoints, bytes, stderr); |
michael@0 | 1096 | return FALSE; |
michael@0 | 1097 | } |
michael@0 | 1098 | |
michael@0 | 1099 | if(baseStates!=NULL) { |
michael@0 | 1100 | /* check validity of the bytes and count the characters in them */ |
michael@0 | 1101 | type=ucm_mappingType(baseStates, m, codePoints, bytes); |
michael@0 | 1102 | if(type<0) { |
michael@0 | 1103 | /* illegal byte sequence */ |
michael@0 | 1104 | printMapping(m, codePoints, bytes, stderr); |
michael@0 | 1105 | return FALSE; |
michael@0 | 1106 | } |
michael@0 | 1107 | } else { |
michael@0 | 1108 | /* not used - adding a mapping for an extension-only table before its base table is read */ |
michael@0 | 1109 | type=1; |
michael@0 | 1110 | } |
michael@0 | 1111 | |
michael@0 | 1112 | /* |
michael@0 | 1113 | * Add the mapping to the base table if this is requested and suitable. |
michael@0 | 1114 | * Otherwise, add it to the extension table. |
michael@0 | 1115 | */ |
michael@0 | 1116 | if(forBase && type==0) { |
michael@0 | 1117 | ucm_addMapping(ucm->base, m, codePoints, bytes); |
michael@0 | 1118 | } else { |
michael@0 | 1119 | ucm_addMapping(ucm->ext, m, codePoints, bytes); |
michael@0 | 1120 | } |
michael@0 | 1121 | |
michael@0 | 1122 | return TRUE; |
michael@0 | 1123 | } |
michael@0 | 1124 | |
michael@0 | 1125 | U_CAPI UBool U_EXPORT2 |
michael@0 | 1126 | ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { |
michael@0 | 1127 | UCMapping m={ 0 }; |
michael@0 | 1128 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; |
michael@0 | 1129 | uint8_t bytes[UCNV_EXT_MAX_BYTES]; |
michael@0 | 1130 | |
michael@0 | 1131 | const char *s; |
michael@0 | 1132 | |
michael@0 | 1133 | /* ignore empty and comment lines */ |
michael@0 | 1134 | if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { |
michael@0 | 1135 | return TRUE; |
michael@0 | 1136 | } |
michael@0 | 1137 | |
michael@0 | 1138 | return |
michael@0 | 1139 | ucm_parseMappingLine(&m, codePoints, bytes, line) && |
michael@0 | 1140 | ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); |
michael@0 | 1141 | } |
michael@0 | 1142 | |
michael@0 | 1143 | U_CAPI void U_EXPORT2 |
michael@0 | 1144 | ucm_readTable(UCMFile *ucm, FileStream* convFile, |
michael@0 | 1145 | UBool forBase, UCMStates *baseStates, |
michael@0 | 1146 | UErrorCode *pErrorCode) { |
michael@0 | 1147 | char line[500]; |
michael@0 | 1148 | char *end; |
michael@0 | 1149 | UBool isOK; |
michael@0 | 1150 | |
michael@0 | 1151 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 1152 | return; |
michael@0 | 1153 | } |
michael@0 | 1154 | |
michael@0 | 1155 | isOK=TRUE; |
michael@0 | 1156 | |
michael@0 | 1157 | for(;;) { |
michael@0 | 1158 | /* read the next line */ |
michael@0 | 1159 | if(!T_FileStream_readLine(convFile, line, sizeof(line))) { |
michael@0 | 1160 | fprintf(stderr, "incomplete charmap section\n"); |
michael@0 | 1161 | isOK=FALSE; |
michael@0 | 1162 | break; |
michael@0 | 1163 | } |
michael@0 | 1164 | |
michael@0 | 1165 | /* remove CR LF */ |
michael@0 | 1166 | end=uprv_strchr(line, 0); |
michael@0 | 1167 | while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { |
michael@0 | 1168 | --end; |
michael@0 | 1169 | } |
michael@0 | 1170 | *end=0; |
michael@0 | 1171 | |
michael@0 | 1172 | /* ignore empty and comment lines */ |
michael@0 | 1173 | if(line[0]==0 || line[0]=='#') { |
michael@0 | 1174 | continue; |
michael@0 | 1175 | } |
michael@0 | 1176 | |
michael@0 | 1177 | /* stop at the end of the mapping table */ |
michael@0 | 1178 | if(0==uprv_strcmp(line, "END CHARMAP")) { |
michael@0 | 1179 | break; |
michael@0 | 1180 | } |
michael@0 | 1181 | |
michael@0 | 1182 | isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); |
michael@0 | 1183 | } |
michael@0 | 1184 | |
michael@0 | 1185 | if(!isOK) { |
michael@0 | 1186 | *pErrorCode=U_INVALID_TABLE_FORMAT; |
michael@0 | 1187 | } |
michael@0 | 1188 | } |
michael@0 | 1189 | #endif |