intl/icu/source/tools/toolutil/ucm.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/tools/toolutil/ucm.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1189 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2003-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  ucm.c
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2003jun20
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   This file reads a .ucm file, stores its mappings and sorts them.
    1.20 +*   It implements handling of Unicode conversion mappings from .ucm files
    1.21 +*   for makeconv, canonucm, rptp2ucm, etc.
    1.22 +*
    1.23 +*   Unicode code point sequences with a length of more than 1,
    1.24 +*   as well as byte sequences with more than 4 bytes or more than one complete
    1.25 +*   character sequence are handled to support m:n mappings.
    1.26 +*/
    1.27 +
    1.28 +#include "unicode/utypes.h"
    1.29 +#include "unicode/ustring.h"
    1.30 +#include "cstring.h"
    1.31 +#include "cmemory.h"
    1.32 +#include "filestrm.h"
    1.33 +#include "uarrsort.h"
    1.34 +#include "ucnvmbcs.h"
    1.35 +#include "ucnv_bld.h"
    1.36 +#include "ucnv_ext.h"
    1.37 +#include "uparse.h"
    1.38 +#include "ucm.h"
    1.39 +#include <stdio.h>
    1.40 +
    1.41 +#if !UCONFIG_NO_CONVERSION
    1.42 +
    1.43 +/* -------------------------------------------------------------------------- */
    1.44 +
    1.45 +static void
    1.46 +printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
    1.47 +    int32_t j;
    1.48 +
    1.49 +    for(j=0; j<m->uLen; ++j) {
    1.50 +        fprintf(f, "<U%04lX>", (long)codePoints[j]);
    1.51 +    }
    1.52 +
    1.53 +    fputc(' ', f);
    1.54 +
    1.55 +    for(j=0; j<m->bLen; ++j) {
    1.56 +        fprintf(f, "\\x%02X", bytes[j]);
    1.57 +    }
    1.58 +
    1.59 +    if(m->f>=0) {
    1.60 +        fprintf(f, " |%u\n", m->f);
    1.61 +    } else {
    1.62 +        fputs("\n", f);
    1.63 +    }
    1.64 +}
    1.65 +
    1.66 +U_CAPI void U_EXPORT2
    1.67 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
    1.68 +    printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
    1.69 +}
    1.70 +
    1.71 +U_CAPI void U_EXPORT2
    1.72 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
    1.73 +    UCMapping *m;
    1.74 +    int32_t i, length;
    1.75 +
    1.76 +    m=table->mappings;
    1.77 +    length=table->mappingsLength;
    1.78 +    if(byUnicode) {
    1.79 +        for(i=0; i<length; ++m, ++i) {
    1.80 +            ucm_printMapping(table, m, f);
    1.81 +        }
    1.82 +    } else {
    1.83 +        const int32_t *map=table->reverseMap;
    1.84 +        for(i=0; i<length; ++i) {
    1.85 +            ucm_printMapping(table, m+map[i], f);
    1.86 +        }
    1.87 +    }
    1.88 +}
    1.89 +
    1.90 +/* mapping comparisons ------------------------------------------------------ */
    1.91 +
    1.92 +static int32_t
    1.93 +compareUnicode(UCMTable *lTable, const UCMapping *l,
    1.94 +               UCMTable *rTable, const UCMapping *r) {
    1.95 +    const UChar32 *lu, *ru;
    1.96 +    int32_t result, i, length;
    1.97 +
    1.98 +    if(l->uLen==1 && r->uLen==1) {
    1.99 +        /* compare two single code points */
   1.100 +        return l->u-r->u;
   1.101 +    }
   1.102 +
   1.103 +    /* get pointers to the code point sequences */
   1.104 +    lu=UCM_GET_CODE_POINTS(lTable, l);
   1.105 +    ru=UCM_GET_CODE_POINTS(rTable, r);
   1.106 +
   1.107 +    /* get the minimum length */
   1.108 +    if(l->uLen<=r->uLen) {
   1.109 +        length=l->uLen;
   1.110 +    } else {
   1.111 +        length=r->uLen;
   1.112 +    }
   1.113 +
   1.114 +    /* compare the code points */
   1.115 +    for(i=0; i<length; ++i) {
   1.116 +        result=lu[i]-ru[i];
   1.117 +        if(result!=0) {
   1.118 +            return result;
   1.119 +        }
   1.120 +    }
   1.121 +
   1.122 +    /* compare the lengths */
   1.123 +    return l->uLen-r->uLen;
   1.124 +}
   1.125 +
   1.126 +static int32_t
   1.127 +compareBytes(UCMTable *lTable, const UCMapping *l,
   1.128 +             UCMTable *rTable, const UCMapping *r,
   1.129 +             UBool lexical) {
   1.130 +    const uint8_t *lb, *rb;
   1.131 +    int32_t result, i, length;
   1.132 +
   1.133 +    /*
   1.134 +     * A lexical comparison is used for sorting in the builder, to allow
   1.135 +     * an efficient search for a byte sequence that could be a prefix
   1.136 +     * of a previously entered byte sequence.
   1.137 +     *
   1.138 +     * Comparing by lengths first is for compatibility with old .ucm tools
   1.139 +     * like canonucm and rptp2ucm.
   1.140 +     */
   1.141 +    if(lexical) {
   1.142 +        /* get the minimum length and continue */
   1.143 +        if(l->bLen<=r->bLen) {
   1.144 +            length=l->bLen;
   1.145 +        } else {
   1.146 +            length=r->bLen;
   1.147 +        }
   1.148 +    } else {
   1.149 +        /* compare lengths first */
   1.150 +        result=l->bLen-r->bLen;
   1.151 +        if(result!=0) {
   1.152 +            return result;
   1.153 +        } else {
   1.154 +            length=l->bLen;
   1.155 +        }
   1.156 +    }
   1.157 +
   1.158 +    /* get pointers to the byte sequences */
   1.159 +    lb=UCM_GET_BYTES(lTable, l);
   1.160 +    rb=UCM_GET_BYTES(rTable, r);
   1.161 +
   1.162 +    /* compare the bytes */
   1.163 +    for(i=0; i<length; ++i) {
   1.164 +        result=lb[i]-rb[i];
   1.165 +        if(result!=0) {
   1.166 +            return result;
   1.167 +        }
   1.168 +    }
   1.169 +
   1.170 +    /* compare the lengths */
   1.171 +    return l->bLen-r->bLen;
   1.172 +}
   1.173 +
   1.174 +/* compare UCMappings for sorting */
   1.175 +static int32_t
   1.176 +compareMappings(UCMTable *lTable, const UCMapping *l,
   1.177 +                UCMTable *rTable, const UCMapping *r,
   1.178 +                UBool uFirst) {
   1.179 +    int32_t result;
   1.180 +
   1.181 +    /* choose which side to compare first */
   1.182 +    if(uFirst) {
   1.183 +        /* Unicode then bytes */
   1.184 +        result=compareUnicode(lTable, l, rTable, r);
   1.185 +        if(result==0) {
   1.186 +            result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
   1.187 +        }
   1.188 +    } else {
   1.189 +        /* bytes then Unicode */
   1.190 +        result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
   1.191 +        if(result==0) {
   1.192 +            result=compareUnicode(lTable, l, rTable, r);
   1.193 +        }
   1.194 +    }
   1.195 +
   1.196 +    if(result!=0) {
   1.197 +        return result;
   1.198 +    }
   1.199 +
   1.200 +    /* compare the flags */
   1.201 +    return l->f-r->f;
   1.202 +}
   1.203 +
   1.204 +/* sorting by Unicode first sorts mappings directly */
   1.205 +static int32_t
   1.206 +compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
   1.207 +    return compareMappings(
   1.208 +        (UCMTable *)context, (const UCMapping *)left,
   1.209 +        (UCMTable *)context, (const UCMapping *)right, TRUE);
   1.210 +}
   1.211 +
   1.212 +/* sorting by bytes first sorts the reverseMap; use indirection to mappings */
   1.213 +static int32_t
   1.214 +compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
   1.215 +    UCMTable *table=(UCMTable *)context;
   1.216 +    int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
   1.217 +    return compareMappings(
   1.218 +        table, table->mappings+l,
   1.219 +        table, table->mappings+r, FALSE);
   1.220 +}
   1.221 +
   1.222 +U_CAPI void U_EXPORT2
   1.223 +ucm_sortTable(UCMTable *t) {
   1.224 +    UErrorCode errorCode;
   1.225 +    int32_t i;
   1.226 +
   1.227 +    if(t->isSorted) {
   1.228 +        return;
   1.229 +    }
   1.230 +
   1.231 +    errorCode=U_ZERO_ERROR;
   1.232 +
   1.233 +    /* 1. sort by Unicode first */
   1.234 +    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
   1.235 +                   compareMappingsUnicodeFirst, t,
   1.236 +                   FALSE, &errorCode);
   1.237 +
   1.238 +    /* build the reverseMap */
   1.239 +    if(t->reverseMap==NULL) {
   1.240 +        /*
   1.241 +         * allocate mappingsCapacity instead of mappingsLength so that
   1.242 +         * if mappings are added, the reverseMap need not be
   1.243 +         * reallocated each time
   1.244 +         * (see ucm_moveMappings() and ucm_addMapping())
   1.245 +         */
   1.246 +        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
   1.247 +        if(t->reverseMap==NULL) {
   1.248 +            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
   1.249 +            exit(U_MEMORY_ALLOCATION_ERROR);
   1.250 +        }
   1.251 +    }
   1.252 +    for(i=0; i<t->mappingsLength; ++i) {
   1.253 +        t->reverseMap[i]=i;
   1.254 +    }
   1.255 +
   1.256 +    /* 2. sort reverseMap by mappings bytes first */
   1.257 +    uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
   1.258 +                   compareMappingsBytesFirst, t,
   1.259 +                   FALSE, &errorCode);
   1.260 +
   1.261 +    if(U_FAILURE(errorCode)) {
   1.262 +        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
   1.263 +                u_errorName(errorCode));
   1.264 +        exit(errorCode);
   1.265 +    }
   1.266 +
   1.267 +    t->isSorted=TRUE;
   1.268 +}
   1.269 +
   1.270 +/*
   1.271 + * remove mappings with their move flag set from the base table
   1.272 + * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
   1.273 + */
   1.274 +U_CAPI void U_EXPORT2
   1.275 +ucm_moveMappings(UCMTable *base, UCMTable *ext) {
   1.276 +    UCMapping *mb, *mbLimit;
   1.277 +    int8_t flag;
   1.278 +
   1.279 +    mb=base->mappings;
   1.280 +    mbLimit=mb+base->mappingsLength;
   1.281 +
   1.282 +    while(mb<mbLimit) {
   1.283 +        flag=mb->moveFlag;
   1.284 +        if(flag!=0) {
   1.285 +            /* reset the move flag */
   1.286 +            mb->moveFlag=0;
   1.287 +
   1.288 +            if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
   1.289 +                /* add the mapping to the extension table */
   1.290 +                ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
   1.291 +            }
   1.292 +
   1.293 +            /* remove this mapping: move the last base mapping down and overwrite the current one */
   1.294 +            if(mb<(mbLimit-1)) {
   1.295 +                uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
   1.296 +            }
   1.297 +            --mbLimit;
   1.298 +            --base->mappingsLength;
   1.299 +            base->isSorted=FALSE;
   1.300 +        } else {
   1.301 +            ++mb;
   1.302 +        }
   1.303 +    }
   1.304 +}
   1.305 +
   1.306 +enum {
   1.307 +    NEEDS_MOVE=1,
   1.308 +    HAS_ERRORS=2
   1.309 +};
   1.310 +
   1.311 +static uint8_t
   1.312 +checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
   1.313 +                    UBool moveToExt, UBool intersectBase) {
   1.314 +    UCMapping *mb, *me, *mbLimit, *meLimit;
   1.315 +    int32_t cmp;
   1.316 +    uint8_t result;
   1.317 +
   1.318 +    mb=base->mappings;
   1.319 +    mbLimit=mb+base->mappingsLength;
   1.320 +
   1.321 +    me=ext->mappings;
   1.322 +    meLimit=me+ext->mappingsLength;
   1.323 +
   1.324 +    result=0;
   1.325 +
   1.326 +    for(;;) {
   1.327 +        /* skip irrelevant mappings on both sides */
   1.328 +        for(;;) {
   1.329 +            if(mb==mbLimit) {
   1.330 +                return result;
   1.331 +            }
   1.332 +
   1.333 +            if((0<=mb->f && mb->f<=2) || mb->f==4) {
   1.334 +                break;
   1.335 +            }
   1.336 +
   1.337 +            ++mb;
   1.338 +        }
   1.339 +
   1.340 +        for(;;) {
   1.341 +            if(me==meLimit) {
   1.342 +                return result;
   1.343 +            }
   1.344 +
   1.345 +            if((0<=me->f && me->f<=2) || me->f==4) {
   1.346 +                break;
   1.347 +            }
   1.348 +
   1.349 +            ++me;
   1.350 +        }
   1.351 +
   1.352 +        /* compare the base and extension mappings */
   1.353 +        cmp=compareUnicode(base, mb, ext, me);
   1.354 +        if(cmp<0) {
   1.355 +            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
   1.356 +                /*
   1.357 +                 * mapping in base but not in ext, move it
   1.358 +                 *
   1.359 +                 * if ext is DBCS, move DBCS mappings here
   1.360 +                 * and check SBCS ones for Unicode prefix below
   1.361 +                 */
   1.362 +                mb->moveFlag|=UCM_MOVE_TO_EXT;
   1.363 +                result|=NEEDS_MOVE;
   1.364 +
   1.365 +            /* does mb map from an input sequence that is a prefix of me's? */
   1.366 +            } else if( mb->uLen<me->uLen &&
   1.367 +                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
   1.368 +            ) {
   1.369 +                if(moveToExt) {
   1.370 +                    /* mark this mapping to be moved to the extension table */
   1.371 +                    mb->moveFlag|=UCM_MOVE_TO_EXT;
   1.372 +                    result|=NEEDS_MOVE;
   1.373 +                } else {
   1.374 +                    fprintf(stderr,
   1.375 +                            "ucm error: the base table contains a mapping whose input sequence\n"
   1.376 +                            "           is a prefix of the input sequence of an extension mapping\n");
   1.377 +                    ucm_printMapping(base, mb, stderr);
   1.378 +                    ucm_printMapping(ext, me, stderr);
   1.379 +                    result|=HAS_ERRORS;
   1.380 +                }
   1.381 +            }
   1.382 +
   1.383 +            ++mb;
   1.384 +        } else if(cmp==0) {
   1.385 +            /*
   1.386 +             * same output: remove the extension mapping,
   1.387 +             * otherwise treat as an error
   1.388 +             */
   1.389 +            if( mb->f==me->f && mb->bLen==me->bLen &&
   1.390 +                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
   1.391 +            ) {
   1.392 +                me->moveFlag|=UCM_REMOVE_MAPPING;
   1.393 +                result|=NEEDS_MOVE;
   1.394 +            } else if(intersectBase) {
   1.395 +                /* mapping in base but not in ext, move it */
   1.396 +                mb->moveFlag|=UCM_MOVE_TO_EXT;
   1.397 +                result|=NEEDS_MOVE;
   1.398 +            } else {
   1.399 +                fprintf(stderr,
   1.400 +                        "ucm error: the base table contains a mapping whose input sequence\n"
   1.401 +                        "           is the same as the input sequence of an extension mapping\n"
   1.402 +                        "           but it maps differently\n");
   1.403 +                ucm_printMapping(base, mb, stderr);
   1.404 +                ucm_printMapping(ext, me, stderr);
   1.405 +                result|=HAS_ERRORS;
   1.406 +            }
   1.407 +
   1.408 +            ++mb;
   1.409 +        } else /* cmp>0 */ {
   1.410 +            ++me;
   1.411 +        }
   1.412 +    }
   1.413 +}
   1.414 +
   1.415 +static uint8_t
   1.416 +checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
   1.417 +                  UBool moveToExt, UBool intersectBase) {
   1.418 +    UCMapping *mb, *me;
   1.419 +    int32_t *baseMap, *extMap;
   1.420 +    int32_t b, e, bLimit, eLimit, cmp;
   1.421 +    uint8_t result;
   1.422 +    UBool isSISO;
   1.423 +
   1.424 +    baseMap=base->reverseMap;
   1.425 +    extMap=ext->reverseMap;
   1.426 +
   1.427 +    b=e=0;
   1.428 +    bLimit=base->mappingsLength;
   1.429 +    eLimit=ext->mappingsLength;
   1.430 +
   1.431 +    result=0;
   1.432 +
   1.433 +    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
   1.434 +
   1.435 +    for(;;) {
   1.436 +        /* skip irrelevant mappings on both sides */
   1.437 +        for(;; ++b) {
   1.438 +            if(b==bLimit) {
   1.439 +                return result;
   1.440 +            }
   1.441 +            mb=base->mappings+baseMap[b];
   1.442 +
   1.443 +            if(intersectBase==2 && mb->bLen==1) {
   1.444 +                /*
   1.445 +                 * comparing a base against a DBCS extension:
   1.446 +                 * leave SBCS base mappings alone
   1.447 +                 */
   1.448 +                continue;
   1.449 +            }
   1.450 +
   1.451 +            if(mb->f==0 || mb->f==3) {
   1.452 +                break;
   1.453 +            }
   1.454 +        }
   1.455 +
   1.456 +        for(;;) {
   1.457 +            if(e==eLimit) {
   1.458 +                return result;
   1.459 +            }
   1.460 +            me=ext->mappings+extMap[e];
   1.461 +
   1.462 +            if(me->f==0 || me->f==3) {
   1.463 +                break;
   1.464 +            }
   1.465 +
   1.466 +            ++e;
   1.467 +        }
   1.468 +
   1.469 +        /* compare the base and extension mappings */
   1.470 +        cmp=compareBytes(base, mb, ext, me, TRUE);
   1.471 +        if(cmp<0) {
   1.472 +            if(intersectBase) {
   1.473 +                /* mapping in base but not in ext, move it */
   1.474 +                mb->moveFlag|=UCM_MOVE_TO_EXT;
   1.475 +                result|=NEEDS_MOVE;
   1.476 +
   1.477 +            /*
   1.478 +             * does mb map from an input sequence that is a prefix of me's?
   1.479 +             * for SI/SO tables, a single byte is never a prefix because it
   1.480 +             * occurs in a separate single-byte state
   1.481 +             */
   1.482 +            } else if( mb->bLen<me->bLen &&
   1.483 +                (!isSISO || mb->bLen>1) &&
   1.484 +                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
   1.485 +            ) {
   1.486 +                if(moveToExt) {
   1.487 +                    /* mark this mapping to be moved to the extension table */
   1.488 +                    mb->moveFlag|=UCM_MOVE_TO_EXT;
   1.489 +                    result|=NEEDS_MOVE;
   1.490 +                } else {
   1.491 +                    fprintf(stderr,
   1.492 +                            "ucm error: the base table contains a mapping whose input sequence\n"
   1.493 +                            "           is a prefix of the input sequence of an extension mapping\n");
   1.494 +                    ucm_printMapping(base, mb, stderr);
   1.495 +                    ucm_printMapping(ext, me, stderr);
   1.496 +                    result|=HAS_ERRORS;
   1.497 +                }
   1.498 +            }
   1.499 +
   1.500 +            ++b;
   1.501 +        } else if(cmp==0) {
   1.502 +            /*
   1.503 +             * same output: remove the extension mapping,
   1.504 +             * otherwise treat as an error
   1.505 +             */
   1.506 +            if( mb->f==me->f && mb->uLen==me->uLen &&
   1.507 +                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
   1.508 +            ) {
   1.509 +                me->moveFlag|=UCM_REMOVE_MAPPING;
   1.510 +                result|=NEEDS_MOVE;
   1.511 +            } else if(intersectBase) {
   1.512 +                /* mapping in base but not in ext, move it */
   1.513 +                mb->moveFlag|=UCM_MOVE_TO_EXT;
   1.514 +                result|=NEEDS_MOVE;
   1.515 +            } else {
   1.516 +                fprintf(stderr,
   1.517 +                        "ucm error: the base table contains a mapping whose input sequence\n"
   1.518 +                        "           is the same as the input sequence of an extension mapping\n"
   1.519 +                        "           but it maps differently\n");
   1.520 +                ucm_printMapping(base, mb, stderr);
   1.521 +                ucm_printMapping(ext, me, stderr);
   1.522 +                result|=HAS_ERRORS;
   1.523 +            }
   1.524 +
   1.525 +            ++b;
   1.526 +        } else /* cmp>0 */ {
   1.527 +            ++e;
   1.528 +        }
   1.529 +    }
   1.530 +}
   1.531 +
   1.532 +U_CAPI UBool U_EXPORT2
   1.533 +ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
   1.534 +    UCMapping *m, *mLimit;
   1.535 +    int32_t count;
   1.536 +    UBool isOK;
   1.537 +
   1.538 +    m=table->mappings;
   1.539 +    mLimit=m+table->mappingsLength;
   1.540 +    isOK=TRUE;
   1.541 +
   1.542 +    while(m<mLimit) {
   1.543 +        count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
   1.544 +        if(count<1) {
   1.545 +            ucm_printMapping(table, m, stderr);
   1.546 +            isOK=FALSE;
   1.547 +        }
   1.548 +        ++m;
   1.549 +    }
   1.550 +
   1.551 +    return isOK;
   1.552 +}
   1.553 +
   1.554 +U_CAPI UBool U_EXPORT2
   1.555 +ucm_checkBaseExt(UCMStates *baseStates,
   1.556 +                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
   1.557 +                 UBool intersectBase) {
   1.558 +    uint8_t result;
   1.559 +
   1.560 +    /* if we have an extension table, we must always use precision flags */
   1.561 +    if(base->flagsType&UCM_FLAGS_IMPLICIT) {
   1.562 +        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
   1.563 +        return FALSE;
   1.564 +    }
   1.565 +    if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
   1.566 +        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
   1.567 +        return FALSE;
   1.568 +    }
   1.569 +
   1.570 +    /* checking requires both tables to be sorted */
   1.571 +    ucm_sortTable(base);
   1.572 +    ucm_sortTable(ext);
   1.573 +
   1.574 +    /* check */
   1.575 +    result=
   1.576 +        checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
   1.577 +        checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
   1.578 +
   1.579 +    if(result&HAS_ERRORS) {
   1.580 +        return FALSE;
   1.581 +    }
   1.582 +
   1.583 +    if(result&NEEDS_MOVE) {
   1.584 +        ucm_moveMappings(ext, NULL);
   1.585 +        ucm_moveMappings(base, moveTarget);
   1.586 +        ucm_sortTable(base);
   1.587 +        ucm_sortTable(ext);
   1.588 +        if(moveTarget!=NULL) {
   1.589 +            ucm_sortTable(moveTarget);
   1.590 +        }
   1.591 +    }
   1.592 +
   1.593 +    return TRUE;
   1.594 +}
   1.595 +
   1.596 +/* merge tables for rptp2ucm ------------------------------------------------ */
   1.597 +
   1.598 +U_CAPI void U_EXPORT2
   1.599 +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
   1.600 +                const uint8_t *subchar, int32_t subcharLength,
   1.601 +                uint8_t subchar1) {
   1.602 +    UCMapping *fromUMapping, *toUMapping;
   1.603 +    int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
   1.604 +
   1.605 +    ucm_sortTable(fromUTable);
   1.606 +    ucm_sortTable(toUTable);
   1.607 +
   1.608 +    fromUMapping=fromUTable->mappings;
   1.609 +    toUMapping=toUTable->mappings;
   1.610 +
   1.611 +    fromUTop=fromUTable->mappingsLength;
   1.612 +    toUTop=toUTable->mappingsLength;
   1.613 +
   1.614 +    fromUIndex=toUIndex=0;
   1.615 +
   1.616 +    while(fromUIndex<fromUTop && toUIndex<toUTop) {
   1.617 +        cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
   1.618 +        if(cmp==0) {
   1.619 +            /* equal: roundtrip, nothing to do (flags are initially 0) */
   1.620 +            ++fromUMapping;
   1.621 +            ++toUMapping;
   1.622 +
   1.623 +            ++fromUIndex;
   1.624 +            ++toUIndex;
   1.625 +        } else if(cmp<0) {
   1.626 +            /*
   1.627 +             * the fromU mapping does not have a toU counterpart:
   1.628 +             * fallback Unicode->codepage
   1.629 +             */
   1.630 +            if( (fromUMapping->bLen==subcharLength &&
   1.631 +                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
   1.632 +                (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
   1.633 +            ) {
   1.634 +                fromUMapping->f=2; /* SUB mapping */
   1.635 +            } else {
   1.636 +                fromUMapping->f=1; /* normal fallback */
   1.637 +            }
   1.638 +
   1.639 +            ++fromUMapping;
   1.640 +            ++fromUIndex;
   1.641 +        } else {
   1.642 +            /*
   1.643 +             * the toU mapping does not have a fromU counterpart:
   1.644 +             * (reverse) fallback codepage->Unicode, copy it to the fromU table
   1.645 +             */
   1.646 +
   1.647 +            /* ignore reverse fallbacks to Unicode SUB */
   1.648 +            if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
   1.649 +                toUMapping->f=3; /* reverse fallback */
   1.650 +                ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
   1.651 +
   1.652 +                /* the table may have been reallocated */
   1.653 +                fromUMapping=fromUTable->mappings+fromUIndex;
   1.654 +            }
   1.655 +
   1.656 +            ++toUMapping;
   1.657 +            ++toUIndex;
   1.658 +        }
   1.659 +    }
   1.660 +
   1.661 +    /* either one or both tables are exhausted */
   1.662 +    while(fromUIndex<fromUTop) {
   1.663 +        /* leftover fromU mappings are fallbacks */
   1.664 +        if( (fromUMapping->bLen==subcharLength &&
   1.665 +             0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
   1.666 +            (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
   1.667 +        ) {
   1.668 +            fromUMapping->f=2; /* SUB mapping */
   1.669 +        } else {
   1.670 +            fromUMapping->f=1; /* normal fallback */
   1.671 +        }
   1.672 +
   1.673 +        ++fromUMapping;
   1.674 +        ++fromUIndex;
   1.675 +    }
   1.676 +
   1.677 +    while(toUIndex<toUTop) {
   1.678 +        /* leftover toU mappings are reverse fallbacks */
   1.679 +
   1.680 +        /* ignore reverse fallbacks to Unicode SUB */
   1.681 +        if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
   1.682 +            toUMapping->f=3; /* reverse fallback */
   1.683 +            ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
   1.684 +        }
   1.685 +
   1.686 +        ++toUMapping;
   1.687 +        ++toUIndex;
   1.688 +    }
   1.689 +
   1.690 +    fromUTable->isSorted=FALSE;
   1.691 +}
   1.692 +
   1.693 +/* separate extension mappings out of base table for rptp2ucm --------------- */
   1.694 +
   1.695 +U_CAPI UBool U_EXPORT2
   1.696 +ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
   1.697 +    UCMTable *table;
   1.698 +    UCMapping *m, *mLimit;
   1.699 +    int32_t type;
   1.700 +    UBool needsMove, isOK;
   1.701 +
   1.702 +    table=ucm->base;
   1.703 +    m=table->mappings;
   1.704 +    mLimit=m+table->mappingsLength;
   1.705 +
   1.706 +    needsMove=FALSE;
   1.707 +    isOK=TRUE;
   1.708 +
   1.709 +    for(; m<mLimit; ++m) {
   1.710 +        if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
   1.711 +            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
   1.712 +            ucm_printMapping(table, m, stderr);
   1.713 +            m->moveFlag|=UCM_REMOVE_MAPPING;
   1.714 +            needsMove=TRUE;
   1.715 +            continue;
   1.716 +        }
   1.717 +
   1.718 +        type=ucm_mappingType(
   1.719 +                &ucm->states, m,
   1.720 +                UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
   1.721 +        if(type<0) {
   1.722 +            /* illegal byte sequence */
   1.723 +            printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
   1.724 +            isOK=FALSE;
   1.725 +        } else if(type>0) {
   1.726 +            m->moveFlag|=UCM_MOVE_TO_EXT;
   1.727 +            needsMove=TRUE;
   1.728 +        }
   1.729 +    }
   1.730 +
   1.731 +    if(!isOK) {
   1.732 +        return FALSE;
   1.733 +    }
   1.734 +    if(needsMove) {
   1.735 +        ucm_moveMappings(ucm->base, ucm->ext);
   1.736 +        return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
   1.737 +    } else {
   1.738 +        ucm_sortTable(ucm->base);
   1.739 +        return TRUE;
   1.740 +    }
   1.741 +}
   1.742 +
   1.743 +/* ucm parser --------------------------------------------------------------- */
   1.744 +
   1.745 +U_CAPI int8_t U_EXPORT2
   1.746 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
   1.747 +    const char *s=*ps;
   1.748 +    char *end;
   1.749 +    uint8_t byte;
   1.750 +    int8_t bLen;
   1.751 +
   1.752 +    bLen=0;
   1.753 +    for(;;) {
   1.754 +        /* skip an optional plus sign */
   1.755 +        if(bLen>0 && *s=='+') {
   1.756 +            ++s;
   1.757 +        }
   1.758 +        if(*s!='\\') {
   1.759 +            break;
   1.760 +        }
   1.761 +
   1.762 +        if( s[1]!='x' ||
   1.763 +            (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
   1.764 +        ) {
   1.765 +            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
   1.766 +            return -1;
   1.767 +        }
   1.768 +
   1.769 +        if(bLen==UCNV_EXT_MAX_BYTES) {
   1.770 +            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
   1.771 +            return -1;
   1.772 +        }
   1.773 +        bytes[bLen++]=byte;
   1.774 +        s=end;
   1.775 +    }
   1.776 +
   1.777 +    *ps=s;
   1.778 +    return bLen;
   1.779 +}
   1.780 +
   1.781 +/* parse a mapping line; must not be empty */
   1.782 +U_CAPI UBool U_EXPORT2
   1.783 +ucm_parseMappingLine(UCMapping *m,
   1.784 +                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1.785 +                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
   1.786 +                     const char *line) {
   1.787 +    const char *s;
   1.788 +    char *end;
   1.789 +    UChar32 cp;
   1.790 +    int32_t u16Length;
   1.791 +    int8_t uLen, bLen, f;
   1.792 +
   1.793 +    s=line;
   1.794 +    uLen=bLen=0;
   1.795 +
   1.796 +    /* parse code points */
   1.797 +    for(;;) {
   1.798 +        /* skip an optional plus sign */
   1.799 +        if(uLen>0 && *s=='+') {
   1.800 +            ++s;
   1.801 +        }
   1.802 +        if(*s!='<') {
   1.803 +            break;
   1.804 +        }
   1.805 +
   1.806 +        if( s[1]!='U' ||
   1.807 +            (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
   1.808 +            *end!='>'
   1.809 +        ) {
   1.810 +            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
   1.811 +            return FALSE;
   1.812 +        }
   1.813 +        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
   1.814 +            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
   1.815 +            return FALSE;
   1.816 +        }
   1.817 +
   1.818 +        if(uLen==UCNV_EXT_MAX_UCHARS) {
   1.819 +            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
   1.820 +            return FALSE;
   1.821 +        }
   1.822 +        codePoints[uLen++]=cp;
   1.823 +        s=end+1;
   1.824 +    }
   1.825 +
   1.826 +    if(uLen==0) {
   1.827 +        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
   1.828 +        return FALSE;
   1.829 +    } else if(uLen==1) {
   1.830 +        m->u=codePoints[0];
   1.831 +    } else {
   1.832 +        UErrorCode errorCode=U_ZERO_ERROR;
   1.833 +        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
   1.834 +        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
   1.835 +            u16Length>UCNV_EXT_MAX_UCHARS
   1.836 +        ) {
   1.837 +            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
   1.838 +            return FALSE;
   1.839 +        }
   1.840 +    }
   1.841 +
   1.842 +    s=u_skipWhitespace(s);
   1.843 +
   1.844 +    /* parse bytes */
   1.845 +    bLen=ucm_parseBytes(bytes, line, &s);
   1.846 +
   1.847 +    if(bLen<0) {
   1.848 +        return FALSE;
   1.849 +    } else if(bLen==0) {
   1.850 +        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
   1.851 +        return FALSE;
   1.852 +    } else if(bLen<=4) {
   1.853 +        uprv_memcpy(m->b.bytes, bytes, bLen);
   1.854 +    }
   1.855 +
   1.856 +    /* skip everything until the fallback indicator, even the start of a comment */
   1.857 +    for(;;) {
   1.858 +        if(*s==0) {
   1.859 +            f=-1; /* no fallback indicator */
   1.860 +            break;
   1.861 +        } else if(*s=='|') {
   1.862 +            f=(int8_t)(s[1]-'0');
   1.863 +            if((uint8_t)f>4) {
   1.864 +                fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
   1.865 +                return FALSE;
   1.866 +            }
   1.867 +            break;
   1.868 +        }
   1.869 +        ++s;
   1.870 +    }
   1.871 +
   1.872 +    m->uLen=uLen;
   1.873 +    m->bLen=bLen;
   1.874 +    m->f=f;
   1.875 +    return TRUE;
   1.876 +}
   1.877 +
   1.878 +/* general APIs ------------------------------------------------------------- */
   1.879 +
   1.880 +U_CAPI UCMTable * U_EXPORT2
   1.881 +ucm_openTable() {
   1.882 +    UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
   1.883 +    if(table==NULL) {
   1.884 +        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
   1.885 +        exit(U_MEMORY_ALLOCATION_ERROR);
   1.886 +    }
   1.887 +
   1.888 +    memset(table, 0, sizeof(UCMTable));
   1.889 +    return table;
   1.890 +}
   1.891 +
   1.892 +U_CAPI void U_EXPORT2
   1.893 +ucm_closeTable(UCMTable *table) {
   1.894 +    if(table!=NULL) {
   1.895 +        uprv_free(table->mappings);
   1.896 +        uprv_free(table->codePoints);
   1.897 +        uprv_free(table->bytes);
   1.898 +        uprv_free(table->reverseMap);
   1.899 +        uprv_free(table);
   1.900 +    }
   1.901 +}
   1.902 +
   1.903 +U_CAPI void U_EXPORT2
   1.904 +ucm_resetTable(UCMTable *table) {
   1.905 +    if(table!=NULL) {
   1.906 +        table->mappingsLength=0;
   1.907 +        table->flagsType=0;
   1.908 +        table->unicodeMask=0;
   1.909 +        table->bytesLength=table->codePointsLength=0;
   1.910 +        table->isSorted=FALSE;
   1.911 +    }
   1.912 +}
   1.913 +
   1.914 +U_CAPI void U_EXPORT2
   1.915 +ucm_addMapping(UCMTable *table,
   1.916 +               UCMapping *m,
   1.917 +               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1.918 +               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1.919 +    UCMapping *tm;
   1.920 +    UChar32 c;
   1.921 +    int32_t idx;
   1.922 +
   1.923 +    if(table->mappingsLength>=table->mappingsCapacity) {
   1.924 +        /* make the mappings array larger */
   1.925 +        if(table->mappingsCapacity==0) {
   1.926 +            table->mappingsCapacity=1000;
   1.927 +        } else {
   1.928 +            table->mappingsCapacity*=10;
   1.929 +        }
   1.930 +        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
   1.931 +                                             table->mappingsCapacity*sizeof(UCMapping));
   1.932 +        if(table->mappings==NULL) {
   1.933 +            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
   1.934 +                            (int)table->mappingsCapacity);
   1.935 +            exit(U_MEMORY_ALLOCATION_ERROR);
   1.936 +        }
   1.937 +
   1.938 +        if(table->reverseMap!=NULL) {
   1.939 +            /* the reverseMap must be reallocated in a new sort */
   1.940 +            uprv_free(table->reverseMap);
   1.941 +            table->reverseMap=NULL;
   1.942 +        }
   1.943 +    }
   1.944 +
   1.945 +    if(m->uLen>1 && table->codePointsCapacity==0) {
   1.946 +        table->codePointsCapacity=10000;
   1.947 +        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
   1.948 +        if(table->codePoints==NULL) {
   1.949 +            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
   1.950 +                            (int)table->codePointsCapacity);
   1.951 +            exit(U_MEMORY_ALLOCATION_ERROR);
   1.952 +        }
   1.953 +    }
   1.954 +
   1.955 +    if(m->bLen>4 && table->bytesCapacity==0) {
   1.956 +        table->bytesCapacity=10000;
   1.957 +        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
   1.958 +        if(table->bytes==NULL) {
   1.959 +            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
   1.960 +                            (int)table->bytesCapacity);
   1.961 +            exit(U_MEMORY_ALLOCATION_ERROR);
   1.962 +        }
   1.963 +    }
   1.964 +
   1.965 +    if(m->uLen>1) {
   1.966 +        idx=table->codePointsLength;
   1.967 +        table->codePointsLength+=m->uLen;
   1.968 +        if(table->codePointsLength>table->codePointsCapacity) {
   1.969 +            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
   1.970 +            exit(U_MEMORY_ALLOCATION_ERROR);
   1.971 +        }
   1.972 +
   1.973 +        uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4);
   1.974 +        m->u=idx;
   1.975 +    }
   1.976 +
   1.977 +    if(m->bLen>4) {
   1.978 +        idx=table->bytesLength;
   1.979 +        table->bytesLength+=m->bLen;
   1.980 +        if(table->bytesLength>table->bytesCapacity) {
   1.981 +            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
   1.982 +            exit(U_MEMORY_ALLOCATION_ERROR);
   1.983 +        }
   1.984 +
   1.985 +        uprv_memcpy(table->bytes+idx, bytes, m->bLen);
   1.986 +        m->b.idx=idx;
   1.987 +    }
   1.988 +
   1.989 +    /* set unicodeMask */
   1.990 +    for(idx=0; idx<m->uLen; ++idx) {
   1.991 +        c=codePoints[idx];
   1.992 +        if(c>=0x10000) {
   1.993 +            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
   1.994 +        } else if(U_IS_SURROGATE(c)) {
   1.995 +            table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
   1.996 +        }
   1.997 +    }
   1.998 +
   1.999 +    /* set flagsType */
  1.1000 +    if(m->f<0) {
  1.1001 +        table->flagsType|=UCM_FLAGS_IMPLICIT;
  1.1002 +    } else {
  1.1003 +        table->flagsType|=UCM_FLAGS_EXPLICIT;
  1.1004 +    }
  1.1005 +
  1.1006 +    tm=table->mappings+table->mappingsLength++;
  1.1007 +    uprv_memcpy(tm, m, sizeof(UCMapping));
  1.1008 +
  1.1009 +    table->isSorted=FALSE;
  1.1010 +}
  1.1011 +
  1.1012 +U_CAPI UCMFile * U_EXPORT2
  1.1013 +ucm_open() {
  1.1014 +    UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
  1.1015 +    if(ucm==NULL) {
  1.1016 +        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
  1.1017 +        exit(U_MEMORY_ALLOCATION_ERROR);
  1.1018 +    }
  1.1019 +
  1.1020 +    memset(ucm, 0, sizeof(UCMFile));
  1.1021 +
  1.1022 +    ucm->base=ucm_openTable();
  1.1023 +    ucm->ext=ucm_openTable();
  1.1024 +
  1.1025 +    ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
  1.1026 +    ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
  1.1027 +    ucm->states.outputType=-1;
  1.1028 +    ucm->states.minCharLength=ucm->states.maxCharLength=1;
  1.1029 +
  1.1030 +    return ucm;
  1.1031 +}
  1.1032 +
  1.1033 +U_CAPI void U_EXPORT2
  1.1034 +ucm_close(UCMFile *ucm) {
  1.1035 +    if(ucm!=NULL) {
  1.1036 +        ucm_closeTable(ucm->base);
  1.1037 +        ucm_closeTable(ucm->ext);
  1.1038 +        uprv_free(ucm);
  1.1039 +    }
  1.1040 +}
  1.1041 +
  1.1042 +U_CAPI int32_t U_EXPORT2
  1.1043 +ucm_mappingType(UCMStates *baseStates,
  1.1044 +                UCMapping *m,
  1.1045 +                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
  1.1046 +                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
  1.1047 +    /* check validity of the bytes and count the characters in them */
  1.1048 +    int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
  1.1049 +    if(count<1) {
  1.1050 +        /* illegal byte sequence */
  1.1051 +        return -1;
  1.1052 +    }
  1.1053 +
  1.1054 +    /*
  1.1055 +     * Suitable for an ICU conversion base table means:
  1.1056 +     * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
  1.1057 +     * - precision flag 0..3
  1.1058 +     * - SBCS: any 1:1 mapping
  1.1059 +     *         (the table stores additional bits to distinguish mapping types)
  1.1060 +     * - MBCS: not a |2 SUB mapping for <subchar1>
  1.1061 +     * - MBCS: not a |1 fallback to 0x00
  1.1062 +     * - MBCS: not a multi-byte mapping with leading 0x00 bytes
  1.1063 +     *
  1.1064 +     * Further restrictions for fromUnicode tables
  1.1065 +     * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
  1.1066 +     *
  1.1067 +     * All of the MBCS fromUnicode specific tests could be removed from here,
  1.1068 +     * but the ones above are for unusual mappings, and removing the tests
  1.1069 +     * from here would change canonucm output which seems gratuitous.
  1.1070 +     * (Markus Scherer 2006-nov-28)
  1.1071 +     *
  1.1072 +     * Exception: All implicit mappings (f<0) that need to be moved
  1.1073 +     * because of fromUnicode restrictions _must_ be moved here because
  1.1074 +     * makeconv uses a hack for moving mappings only for the fromUnicode table
  1.1075 +     * that only works with non-negative values of f.
  1.1076 +     */
  1.1077 +    if( m->uLen==1 && count==1 && m->f<=3 &&
  1.1078 +        (baseStates->maxCharLength==1 ||
  1.1079 +            !((m->f==2 && m->bLen==1) ||
  1.1080 +              (m->f==1 && bytes[0]==0) ||
  1.1081 +              (m->f<=1 && m->bLen>1 && bytes[0]==0)))
  1.1082 +    ) {
  1.1083 +        return 0; /* suitable for a base table */
  1.1084 +    } else {
  1.1085 +        return 1; /* needs to go into an extension table */
  1.1086 +    }
  1.1087 +}
  1.1088 +
  1.1089 +U_CAPI UBool U_EXPORT2
  1.1090 +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
  1.1091 +                   UCMapping *m,
  1.1092 +                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
  1.1093 +                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
  1.1094 +    int32_t type;
  1.1095 +
  1.1096 +    if(m->f==2 && m->uLen>1) {
  1.1097 +        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
  1.1098 +        printMapping(m, codePoints, bytes, stderr);
  1.1099 +        return FALSE;
  1.1100 +    }
  1.1101 +
  1.1102 +    if(baseStates!=NULL) {
  1.1103 +        /* check validity of the bytes and count the characters in them */
  1.1104 +        type=ucm_mappingType(baseStates, m, codePoints, bytes);
  1.1105 +        if(type<0) {
  1.1106 +            /* illegal byte sequence */
  1.1107 +            printMapping(m, codePoints, bytes, stderr);
  1.1108 +            return FALSE;
  1.1109 +        }
  1.1110 +    } else {
  1.1111 +        /* not used - adding a mapping for an extension-only table before its base table is read */
  1.1112 +        type=1;
  1.1113 +    }
  1.1114 +
  1.1115 +    /*
  1.1116 +     * Add the mapping to the base table if this is requested and suitable.
  1.1117 +     * Otherwise, add it to the extension table.
  1.1118 +     */
  1.1119 +    if(forBase && type==0) {
  1.1120 +        ucm_addMapping(ucm->base, m, codePoints, bytes);
  1.1121 +    } else {
  1.1122 +        ucm_addMapping(ucm->ext, m, codePoints, bytes);
  1.1123 +    }
  1.1124 +
  1.1125 +    return TRUE;
  1.1126 +}
  1.1127 +
  1.1128 +U_CAPI UBool U_EXPORT2
  1.1129 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
  1.1130 +    UCMapping m={ 0 };
  1.1131 +    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
  1.1132 +    uint8_t bytes[UCNV_EXT_MAX_BYTES];
  1.1133 +
  1.1134 +    const char *s;
  1.1135 +
  1.1136 +    /* ignore empty and comment lines */
  1.1137 +    if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
  1.1138 +        return TRUE;
  1.1139 +    }
  1.1140 +
  1.1141 +    return
  1.1142 +        ucm_parseMappingLine(&m, codePoints, bytes, line) &&
  1.1143 +        ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
  1.1144 +}
  1.1145 +
  1.1146 +U_CAPI void U_EXPORT2
  1.1147 +ucm_readTable(UCMFile *ucm, FileStream* convFile,
  1.1148 +              UBool forBase, UCMStates *baseStates,
  1.1149 +              UErrorCode *pErrorCode) {
  1.1150 +    char line[500];
  1.1151 +    char *end;
  1.1152 +    UBool isOK;
  1.1153 +
  1.1154 +    if(U_FAILURE(*pErrorCode)) {
  1.1155 +        return;
  1.1156 +    }
  1.1157 +
  1.1158 +    isOK=TRUE;
  1.1159 +
  1.1160 +    for(;;) {
  1.1161 +        /* read the next line */
  1.1162 +        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
  1.1163 +            fprintf(stderr, "incomplete charmap section\n");
  1.1164 +            isOK=FALSE;
  1.1165 +            break;
  1.1166 +        }
  1.1167 +
  1.1168 +        /* remove CR LF */
  1.1169 +        end=uprv_strchr(line, 0);
  1.1170 +        while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
  1.1171 +            --end;
  1.1172 +        }
  1.1173 +        *end=0;
  1.1174 +
  1.1175 +        /* ignore empty and comment lines */
  1.1176 +        if(line[0]==0 || line[0]=='#') {
  1.1177 +            continue;
  1.1178 +        }
  1.1179 +
  1.1180 +        /* stop at the end of the mapping table */
  1.1181 +        if(0==uprv_strcmp(line, "END CHARMAP")) {
  1.1182 +            break;
  1.1183 +        }
  1.1184 +
  1.1185 +        isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
  1.1186 +    }
  1.1187 +
  1.1188 +    if(!isOK) {
  1.1189 +        *pErrorCode=U_INVALID_TABLE_FORMAT;
  1.1190 +    }
  1.1191 +}
  1.1192 +#endif

mercurial