intl/icu/source/common/ucnv_ext.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnv_ext.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1137 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2003-2013, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*   file name:  ucnv_ext.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2003jun13
    1.17 +*   created by: Markus W. Scherer
    1.18 +*
    1.19 +*   Conversion extensions
    1.20 +*/
    1.21 +
    1.22 +#include "unicode/utypes.h"
    1.23 +
    1.24 +#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
    1.25 +
    1.26 +#include "unicode/uset.h"
    1.27 +#include "ucnv_bld.h"
    1.28 +#include "ucnv_cnv.h"
    1.29 +#include "ucnv_ext.h"
    1.30 +#include "cmemory.h"
    1.31 +#include "uassert.h"
    1.32 +
    1.33 +/* to Unicode --------------------------------------------------------------- */
    1.34 +
    1.35 +/* Find the entry for one input byte in a toUnicode section of "length" words.
    1.36 + * @return lookup value for the byte, if found; else 0
    1.37 + */
    1.38 +static inline uint32_t
    1.39 +ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
    1.40 +    uint32_t word0, word;
    1.41 +    int32_t i, start, limit;
    1.42 +
    1.43 +    /* check the input byte against the byte fields of the first and last section words */
    1.44 +    start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
    1.45 +    limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
    1.46 +    if(byte<start || limit<byte) {
    1.47 +        return 0; /* the byte is out of range */
    1.48 +    }
    1.49 +
    1.50 +    if(length==((limit-start)+1)) {
    1.51 +        /* one word per byte value from start to limit: index directly instead of searching */
    1.52 +        return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
    1.53 +    }
    1.54 +
    1.55 +    /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
    1.56 +    word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);
    1.57 +
    1.58 +    /*
    1.59 +     * Shift byte once instead of each section word and add 0xffffff.
    1.60 +     * We will compare the shifted/added byte (bbffffff) against
    1.61 +     * section words which have byte values in the same bit position.
    1.62 +     * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
    1.63 +     * for all v=0..f
    1.64 +     * so we need not mask off the lower 24 bits of each section word.
    1.65 +     */
    1.66 +    word=word0|UCNV_EXT_TO_U_VALUE_MASK;
    1.67 +
    1.68 +    /* binary search; from here on start and limit are indexes into toUSection[] */
    1.69 +    start=0;
    1.70 +    limit=length;
    1.71 +    for(;;) {
    1.72 +        i=limit-start;
    1.73 +        if(i<=1) {
    1.74 +            break; /* done */
    1.75 +        }
    1.76 +        /* start<limit-1 */
    1.77 +
    1.78 +        if(i<=4) {
    1.79 +            /* linear search for the last part (at most 4 remaining words) */
    1.80 +            if(word0<=toUSection[start]) {
    1.81 +                break;
    1.82 +            }
    1.83 +            if(++start<limit && word0<=toUSection[start]) {
    1.84 +                break;
    1.85 +            }
    1.86 +            if(++start<limit && word0<=toUSection[start]) {
    1.87 +                break;
    1.88 +            }
    1.89 +            /* always break at start==limit-1 */
    1.90 +            ++start;
    1.91 +            break;
    1.92 +        }
    1.93 +
    1.94 +        i=(start+limit)/2;
    1.95 +        if(word<toUSection[i]) {
    1.96 +            limit=i;
    1.97 +        } else {
    1.98 +            start=i;
    1.99 +        }
   1.100 +    }
   1.101 +
   1.102 +    /* did we really find it? (word is reused to hold the candidate section word) */
   1.103 +    if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
   1.104 +        return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
   1.105 +    } else {
   1.106 +        return 0; /* not found */
   1.107 +    }
   1.108 +}
   1.109 +
   1.110 +/*
   1.111 + * TRUE if not an SI/SO stateful converter (sisoState<0),
   1.112 + * or if the match length fits with the current converter state (length 1 exactly when sisoState==0)
   1.113 + */
   1.114 +#define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
   1.115 +    ((sisoState)<0 || ((sisoState)==0) == ((match)==1))
   1.116 +
   1.117 +/*
   1.118 + * this works like ucnv_extMatchFromU() except
   1.119 + * - the first character is in pre
   1.120 + * - no trie is used
   1.121 + * - the returned matchLength is not offset by 2
   1.122 + * @return >0: number of input bytes matched, *pMatchValue is set; 0: no match; <0: negated byte count of a partial match */
   1.123 +static int32_t
   1.124 +ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
   1.125 +                 const char *pre, int32_t preLength,
   1.126 +                 const char *src, int32_t srcLength,
   1.127 +                 uint32_t *pMatchValue,
   1.128 +                 UBool /*useFallback*/, UBool flush) {
   1.129 +    const uint32_t *toUTable, *toUSection;
   1.130 +
   1.131 +    uint32_t value, matchValue;
   1.132 +    int32_t i, j, idx, length, matchLength;
   1.133 +    uint8_t b;
   1.134 +
   1.135 +    if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
   1.136 +        return 0; /* no extension data, no match */
   1.137 +    }
   1.138 +
   1.139 +    /* initialize: i counts bytes taken from pre[], j counts bytes taken from src[] */
   1.140 +    toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
   1.141 +    idx=0;
   1.142 +
   1.143 +    matchValue=0;
   1.144 +    i=j=matchLength=0;
   1.145 +
   1.146 +    if(sisoState==0) {
   1.147 +        /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
   1.148 +        if(preLength>1) {
   1.149 +            return 0; /* no match of a DBCS sequence in SBCS mode */
   1.150 +        } else if(preLength==1) {
   1.151 +            srcLength=0;
   1.152 +        } else /* preLength==0 */ {
   1.153 +            if(srcLength>1) {
   1.154 +                srcLength=1;
   1.155 +            }
   1.156 +        }
   1.157 +        flush=TRUE; /* with flush set, the loop below never returns a partial match */
   1.158 +    }
   1.159 +
   1.160 +    /* we must not remember fallback matches when not using fallbacks */
   1.161 +
   1.162 +    /* match input units until there is a full match or the input is consumed */
   1.163 +    for(;;) {
   1.164 +        /* go to the next section */
   1.165 +        toUSection=toUTable+idx;
   1.166 +
   1.167 +        /* read first word of the section: its byte field is the section length, its value a possible match ending here */
   1.168 +        value=*toUSection++;
   1.169 +        length=UCNV_EXT_TO_U_GET_BYTE(value);
   1.170 +        value=UCNV_EXT_TO_U_GET_VALUE(value);
   1.171 +        if( value!=0 &&
   1.172 +            (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
   1.173 +             TO_U_USE_FALLBACK(useFallback)) &&
   1.174 +            UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
   1.175 +        ) {
   1.176 +            /* remember longest match so far */
   1.177 +            matchValue=value;
   1.178 +            matchLength=i+j;
   1.179 +        }
   1.180 +
   1.181 +        /* match pre[] then src[] */
   1.182 +        if(i<preLength) {
   1.183 +            b=(uint8_t)pre[i++];
   1.184 +        } else if(j<srcLength) {
   1.185 +            b=(uint8_t)src[j++];
   1.186 +        } else {
   1.187 +            /* all input consumed, partial match */
   1.188 +            if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
   1.189 +                /*
   1.190 +                 * end of the entire input stream, stop with the longest match so far
   1.191 +                 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
   1.192 +                 * because it must fit into state buffers
   1.193 +                 */
   1.194 +                break;
   1.195 +            } else {
   1.196 +                /* continue with more input next time */
   1.197 +                return -length;
   1.198 +            }
   1.199 +        }
   1.200 +
   1.201 +        /* search for the current byte */
   1.202 +        value=ucnv_extFindToU(toUSection, length, b);
   1.203 +        if(value==0) {
   1.204 +            /* no match here, stop with the longest match so far */
   1.205 +            break;
   1.206 +        } else {
   1.207 +            if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
   1.208 +                /* partial match, continue */
   1.209 +                idx=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
   1.210 +            } else {
   1.211 +                if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
   1.212 +                     TO_U_USE_FALLBACK(useFallback)) &&
   1.213 +                    UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
   1.214 +                ) {
   1.215 +                    /* full match, stop with result */
   1.216 +                    matchValue=value;
   1.217 +                    matchLength=i+j;
   1.218 +                } else {
   1.219 +                    /* full match on fallback not taken, stop with the longest match so far */
   1.220 +                }
   1.221 +                break;
   1.222 +            }
   1.223 +        }
   1.224 +    }
   1.225 +
   1.226 +    if(matchLength==0) {
   1.227 +        /* no match at all */
   1.228 +        return 0;
   1.229 +    }
   1.230 +
   1.231 +    /* return result (*pMatchValue is written only on this path) */
   1.232 +    *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
   1.233 +    return matchLength;
   1.234 +}
   1.235 +
   1.236 +static inline void
   1.237 +ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
   1.238 +                 uint32_t value,
   1.239 +                 UChar **target, const UChar *targetLimit,
   1.240 +                 int32_t **offsets, int32_t srcIndex,
   1.241 +                 UErrorCode *pErrorCode) {
   1.242 +    /* output the result for a toUnicode match value: either one code point or a string from the extension UChars array */
   1.243 +    if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
   1.244 +        /* output a single code point */
   1.245 +        ucnv_toUWriteCodePoint(
   1.246 +            cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
   1.247 +            target, targetLimit,
   1.248 +            offsets, srcIndex,
   1.249 +            pErrorCode);
   1.250 +    } else {
   1.251 +        /* output a string (value encodes its index and length) - with correct data we have resultLength>0 */
   1.252 +        ucnv_toUWriteUChars(
   1.253 +            cnv,
   1.254 +            UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+
   1.255 +                UCNV_EXT_TO_U_GET_INDEX(value),
   1.256 +            UCNV_EXT_TO_U_GET_LENGTH(value),
   1.257 +            target, targetLimit,
   1.258 +            offsets, srcIndex,
   1.259 +            pErrorCode);
   1.260 +    }
   1.261 +}
   1.262 +
   1.263 +/*
   1.264 + * get the SI/SO toU state from cnv->mode (state 0 is for SBCS, 1 for DBCS),
   1.265 + * or 1 for DBCS-only,
   1.266 + * or -1 if the converter is not SI/SO stateful
   1.267 + *
   1.268 + * Note: For SI/SO stateful converters getting here,
   1.269 + * cnv->mode==0 is equivalent to firstLength==1.
   1.270 + */
   1.271 +#define UCNV_SISO_STATE(cnv) \
   1.272 +    ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \
   1.273 +     (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1)
   1.274 +
   1.275 +/* Try to match cnv->toUBytes[0..firstLength[ followed by input from *src against the extension data.
   1.276 + * target<targetLimit; set error code for overflow
   1.277 + * @return TRUE if a match was written or a partial match was saved in cnv->preToU[]; FALSE if no match */
   1.278 +U_CFUNC UBool
   1.279 +ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
   1.280 +                        int32_t firstLength,
   1.281 +                        const char **src, const char *srcLimit,
   1.282 +                        UChar **target, const UChar *targetLimit,
   1.283 +                        int32_t **offsets, int32_t srcIndex,
   1.284 +                        UBool flush,
   1.285 +                        UErrorCode *pErrorCode) {
   1.286 +    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   1.287 +    int32_t match;
   1.288 +
   1.289 +    /* try to match */
   1.290 +    match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv),
   1.291 +                           (const char *)cnv->toUBytes, firstLength,
   1.292 +                           *src, (int32_t)(srcLimit-*src),
   1.293 +                           &value,
   1.294 +                           cnv->useFallback, flush);
   1.295 +    if(match>0) {
   1.296 +        /* advance src pointer for the consumed input (match also counts the firstLength bytes from toUBytes) */
   1.297 +        *src+=match-firstLength;
   1.298 +
   1.299 +        /* write result to target */
   1.300 +        ucnv_extWriteToU(cnv, cx,
   1.301 +                         value,
   1.302 +                         target, targetLimit,
   1.303 +                         offsets, srcIndex,
   1.304 +                         pErrorCode);
   1.305 +        return TRUE;
   1.306 +    } else if(match<0) {
   1.307 +        /* save state for partial match */
   1.308 +        const char *s;
   1.309 +        int32_t j;
   1.310 +
   1.311 +        /* copy the bytes of the first character from cnv->toUBytes */
   1.312 +        s=(const char *)cnv->toUBytes;
   1.313 +        cnv->preToUFirstLength=(int8_t)firstLength;
   1.314 +        for(j=0; j<firstLength; ++j) {
   1.315 +            cnv->preToU[j]=*s++;
   1.316 +        }
   1.317 +
   1.318 +        /* now copy the newly consumed input */
   1.319 +        s=*src;
   1.320 +        match=-match;
   1.321 +        for(; j<match; ++j) {
   1.322 +            cnv->preToU[j]=*s++;
   1.323 +        }
   1.324 +        *src=s; /* same as *src=srcLimit; because we reached the end of input */
   1.325 +        cnv->preToULength=(int8_t)match;
   1.326 +        return TRUE;
   1.327 +    } else /* match==0 no match */ {
   1.328 +        return FALSE;
   1.329 +    }
   1.330 +}
   1.331 +
   1.332 +U_CFUNC UChar32
   1.333 +ucnv_extSimpleMatchToU(const int32_t *cx,
   1.334 +                       const char *source, int32_t length,
   1.335 +                       UBool useFallback) {
   1.336 +    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   1.337 +    int32_t match;
   1.338 +
   1.339 +    if(length<=0) {
   1.340 +        return 0xffff; /* empty input; distinct from the 0xfffe "no match" result below */
   1.341 +    }
   1.342 +
   1.343 +    /* try to match: all input is passed as pre[], none as src[]; not SI/SO stateful (-1); flush==TRUE */
   1.344 +    match=ucnv_extMatchToU(cx, -1,
   1.345 +                           source, length,
   1.346 +                           NULL, 0,
   1.347 +                           &value,
   1.348 +                           useFallback, TRUE);
   1.349 +    if(match==length) {
   1.350 +        /* write result for simple, single-character conversion */
   1.351 +        if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
   1.352 +            return UCNV_EXT_TO_U_GET_CODE_POINT(value);
   1.353 +        }
   1.354 +    }
   1.355 +
   1.356 +    /*
   1.357 +     * return no match because
   1.358 +     * - match>0 && value points to string: simple conversion cannot handle multiple code points
   1.359 +     * - match>0 && match!=length: not all input consumed, forbidden for this function
   1.360 +     * - match==0: no match found in the first place
   1.361 +     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
   1.362 +     */
   1.363 +    return 0xfffe;
   1.364 +}
   1.365 +
   1.366 +/*
   1.367 + * continue a partial match saved in cnv->preToU[] with new input from pArgs->source
   1.368 + * never called for simple, single-character conversion
   1.369 + */
   1.370 +U_CFUNC void
   1.371 +ucnv_extContinueMatchToU(UConverter *cnv,
   1.372 +                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
   1.373 +                         UErrorCode *pErrorCode) {
   1.374 +    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   1.375 +    int32_t match, length;
   1.376 +
   1.377 +    match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
   1.378 +                           cnv->preToU, cnv->preToULength,
   1.379 +                           pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
   1.380 +                           &value,
   1.381 +                           cnv->useFallback, pArgs->flush);
   1.382 +    if(match>0) {
   1.383 +        if(match>=cnv->preToULength) {
   1.384 +            /* advance src pointer for the consumed input */
   1.385 +            pArgs->source+=match-cnv->preToULength;
   1.386 +            cnv->preToULength=0;
   1.387 +        } else {
   1.388 +            /* the match did not use all of preToU[] - keep the rest for replay */
   1.389 +            length=cnv->preToULength-match;
   1.390 +            uprv_memmove(cnv->preToU, cnv->preToU+match, length);
   1.391 +            cnv->preToULength=(int8_t)-length; /* negative length marks the rest for replay, as below */
   1.392 +        }
   1.393 +
   1.394 +        /* write result */
   1.395 +        ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
   1.396 +                         value,
   1.397 +                         &pArgs->target, pArgs->targetLimit,
   1.398 +                         &pArgs->offsets, srcIndex,
   1.399 +                         pErrorCode);
   1.400 +    } else if(match<0) {
   1.401 +        /* save state for partial match */
   1.402 +        const char *s;
   1.403 +        int32_t j;
   1.404 +
   1.405 +        /* just _append_ the newly consumed input to preToU[] */
   1.406 +        s=pArgs->source;
   1.407 +        match=-match;
   1.408 +        for(j=cnv->preToULength; j<match; ++j) {
   1.409 +            cnv->preToU[j]=*s++;
   1.410 +        }
   1.411 +        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
   1.412 +        cnv->preToULength=(int8_t)match;
   1.413 +    } else /* match==0 */ {
   1.414 +        /*
   1.415 +         * no match
   1.416 +         *
   1.417 +         * We need to split the previous input into two parts:
   1.418 +         *
   1.419 +         * 1. The first codepage character is unmappable - that's how we got into
   1.420 +         *    trying the extension data in the first place.
   1.421 +         *    We need to move it from the preToU buffer
   1.422 +         *    to the error buffer, set an error code,
   1.423 +         *    and prepare the rest of the previous input for 2.
   1.424 +         *
   1.425 +         * 2. The rest of the previous input must be converted once we
   1.426 +         *    come back from the callback for the first character.
   1.427 +         *    At that time, we have to try again from scratch to convert
   1.428 +         *    these input characters.
   1.429 +         *    The replay will be handled by the ucnv.c conversion code.
   1.430 +         */
   1.431 +
   1.432 +        /* move the first codepage character to the error field (cnv->toUBytes) */
   1.433 +        uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
   1.434 +        cnv->toULength=cnv->preToUFirstLength;
   1.435 +
   1.436 +        /* move the rest up inside the buffer */
   1.437 +        length=cnv->preToULength-cnv->preToUFirstLength;
   1.438 +        if(length>0) {
   1.439 +            uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
   1.440 +        }
   1.441 +
   1.442 +        /* mark preToU for replay */
   1.443 +        cnv->preToULength=(int8_t)-length;
   1.444 +
   1.445 +        /* set the error code for unassigned */
   1.446 +        *pErrorCode=U_INVALID_CHAR_FOUND;
   1.447 +    }
   1.448 +}
   1.449 +
   1.450 +/* from Unicode ------------------------------------------------------------- */
   1.451 +
   1.452 +// Use roundtrips, "good one-way" mappings, and some normal fallbacks; never use a value with reserved bits set.
   1.453 +static inline UBool
   1.454 +extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) {
   1.455 +    return
   1.456 +        ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 ||
   1.457 +            FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
   1.458 +        (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0;
   1.459 +}
   1.460 +
   1.461 +/* Search fromUSection[0..length[ for the UChar u (binary search, then a short linear scan).
   1.462 + * @return index of the UChar, if found; else <0
   1.463 + */
   1.464 +static inline int32_t
   1.465 +ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
   1.466 +    int32_t i, start, limit;
   1.467 +
   1.468 +    /* binary search over the index range [start, limit) */
   1.469 +    start=0;
   1.470 +    limit=length;
   1.471 +    for(;;) {
   1.472 +        i=limit-start;
   1.473 +        if(i<=1) {
   1.474 +            break; /* done */
   1.475 +        }
   1.476 +        /* start<limit-1 */
   1.477 +
   1.478 +        if(i<=4) {
   1.479 +            /* linear search for the last part (at most 4 remaining units) */
   1.480 +            if(u<=fromUSection[start]) {
   1.481 +                break;
   1.482 +            }
   1.483 +            if(++start<limit && u<=fromUSection[start]) {
   1.484 +                break;
   1.485 +            }
   1.486 +            if(++start<limit && u<=fromUSection[start]) {
   1.487 +                break;
   1.488 +            }
   1.489 +            /* always break at start==limit-1 */
   1.490 +            ++start;
   1.491 +            break;
   1.492 +        }
   1.493 +
   1.494 +        i=(start+limit)/2;
   1.495 +        if(u<fromUSection[i]) {
   1.496 +            limit=i;
   1.497 +        } else {
   1.498 +            start=i;
   1.499 +        }
   1.500 +    }
   1.501 +
   1.502 +    /* did we really find it? */
   1.503 +    if(start<limit && u==fromUSection[start]) {
   1.504 +        return start;
   1.505 +    } else {
   1.506 +        return -1; /* not found */
   1.507 +    }
   1.508 +}
   1.509 +
   1.510 +/*
   1.511 + * @param cx pointer to extension data; if NULL, returns 0
   1.512 + * @param firstCP the first code point before all the other UChars
   1.513 + * @param pre UChars that must match; !initialMatch: partial match with them
   1.514 + * @param preLength length of pre, >=0
   1.515 + * @param src UChars that can be used to complete a match
   1.516 + * @param srcLength length of src, >=0
   1.517 + * @param pMatchValue [out] output result value for the match from the data structure
   1.518 + * @param useFallback "use fallback" flag, usually from cnv->useFallback
   1.519 + * @param flush TRUE if the end of the input stream is reached
   1.520 + * @return >1: matched, return value=total match length (number of input units matched)
   1.521 + *          1: matched, no mapping but request for <subchar1>
   1.522 + *             (only for the first code point)
   1.523 + *          0: no match
   1.524 + *         <0: partial match, return value=negative total match length
   1.525 + *             (partial matches are never returned for flush==TRUE)
   1.526 + *             (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
   1.527 + *         the matchLength is 2 if only firstCP matched, and >2 if firstCP and
   1.528 + *         further code units matched
   1.529 + */
   1.530 +static int32_t
   1.531 +ucnv_extMatchFromU(const int32_t *cx,
   1.532 +                   UChar32 firstCP,
   1.533 +                   const UChar *pre, int32_t preLength,
   1.534 +                   const UChar *src, int32_t srcLength,
   1.535 +                   uint32_t *pMatchValue,
   1.536 +                   UBool useFallback, UBool flush) {
   1.537 +    const uint16_t *stage12, *stage3;
   1.538 +    const uint32_t *stage3b;
   1.539 +
   1.540 +    const UChar *fromUTableUChars, *fromUSectionUChars;
   1.541 +    const uint32_t *fromUTableValues, *fromUSectionValues;
   1.542 +
   1.543 +    uint32_t value, matchValue;
   1.544 +    int32_t i, j, idx, length, matchLength;
   1.545 +    UChar c;
   1.546 +
   1.547 +    if(cx==NULL) {
   1.548 +        return 0; /* no extension data, no match */
   1.549 +    }
   1.550 +
   1.551 +    /* trie lookup of firstCP */
   1.552 +    idx=firstCP>>10; /* stage 1 index */
   1.553 +    if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
   1.554 +        return 0; /* the first code point is outside the trie */
   1.555 +    }
   1.556 +
   1.557 +    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
   1.558 +    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
   1.559 +    idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP);
   1.560 +
   1.561 +    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
   1.562 +    value=stage3b[idx];
   1.563 +    if(value==0) {
   1.564 +        return 0;
   1.565 +    }
   1.566 +
   1.567 +    /*
   1.568 +     * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0:
   1.569 +     * Do not interpret values with reserved bits used, for forward compatibility,
   1.570 +     * and do not even remember intermediate results with reserved bits used.
   1.571 +     */
   1.572 +
   1.573 +    if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
   1.574 +        /* partial match (fromUnicode value, same test as inside the loop), enter the loop below */
   1.575 +        idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
   1.576 +
   1.577 +        /* initialize: i counts units taken from pre[], j counts units taken from src[] */
   1.578 +        fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
   1.579 +        fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
   1.580 +
   1.581 +        matchValue=0;
   1.582 +        i=j=matchLength=0;
   1.583 +
   1.584 +        /* we must not remember fallback matches when not using fallbacks */
   1.585 +
   1.586 +        /* match input units until there is a full match or the input is consumed */
   1.587 +        for(;;) {
   1.588 +            /* go to the next section */
   1.589 +            fromUSectionUChars=fromUTableUChars+idx;
   1.590 +            fromUSectionValues=fromUTableValues+idx;
   1.591 +
   1.592 +            /* read first pair of the section: the UChars slot is the section length, the values slot a possible match ending here */
   1.593 +            length=*fromUSectionUChars++;
   1.594 +            value=*fromUSectionValues++;
   1.595 +            if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) {
   1.596 +                /* remember longest match so far */
   1.597 +                matchValue=value;
   1.598 +                matchLength=2+i+j;
   1.599 +            }
   1.600 +
   1.601 +            /* match pre[] then src[] */
   1.602 +            if(i<preLength) {
   1.603 +                c=pre[i++];
   1.604 +            } else if(j<srcLength) {
   1.605 +                c=src[j++];
   1.606 +            } else {
   1.607 +                /* all input consumed, partial match */
   1.608 +                if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
   1.609 +                    /*
   1.610 +                     * end of the entire input stream, stop with the longest match so far
   1.611 +                     * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
   1.612 +                     * because it must fit into state buffers
   1.613 +                     */
   1.614 +                    break;
   1.615 +                } else {
   1.616 +                    /* continue with more input next time */
   1.617 +                    return -(2+length);
   1.618 +                }
   1.619 +            }
   1.620 +
   1.621 +            /* search for the current UChar */
   1.622 +            idx=ucnv_extFindFromU(fromUSectionUChars, length, c);
   1.623 +            if(idx<0) {
   1.624 +                /* no match here, stop with the longest match so far */
   1.625 +                break;
   1.626 +            } else {
   1.627 +                value=fromUSectionValues[idx];
   1.628 +                if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
   1.629 +                    /* partial match, continue */
   1.630 +                    idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
   1.631 +                } else {
   1.632 +                    if(extFromUUseMapping(useFallback, value, firstCP)) {
   1.633 +                        /* full match, stop with result */
   1.634 +                        matchValue=value;
   1.635 +                        matchLength=2+i+j;
   1.636 +                    } else {
   1.637 +                        /* full match on fallback not taken, stop with the longest match so far */
   1.638 +                    }
   1.639 +                    break;
   1.640 +                }
   1.641 +            }
   1.642 +        }
   1.643 +
   1.644 +        if(matchLength==0) {
   1.645 +            /* no match at all */
   1.646 +            return 0;
   1.647 +        }
   1.648 +    } else /* result from firstCP trie lookup */ {
   1.649 +        if(extFromUUseMapping(useFallback, value, firstCP)) {
   1.650 +            /* full match, stop with result */
   1.651 +            matchValue=value;
   1.652 +            matchLength=2;
   1.653 +        } else {
   1.654 +            /* fallback not taken */
   1.655 +            return 0;
   1.656 +        }
   1.657 +    }
   1.658 +
   1.659 +    /* return result (*pMatchValue is not written for the <subchar1> case) */
   1.660 +    if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
   1.661 +        return 1; /* assert matchLength==2 */
   1.662 +    }
   1.663 +
   1.664 +    *pMatchValue=matchValue;
   1.665 +    return matchLength;
   1.666 +}
   1.667 +
   1.668 +/* Write the result bytes of a fromUnicode match; prepend an SI/SO shift byte when cnv->fromUnicodeStatus requires a mode change.
   1.669 + * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits
   1.670 + */
   1.671 +static inline void
   1.672 +ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
   1.673 +                   uint32_t value,
   1.674 +                   char **target, const char *targetLimit,
   1.675 +                   int32_t **offsets, int32_t srcIndex,
   1.676 +                   UErrorCode *pErrorCode) {
   1.677 +    uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
   1.678 +    const uint8_t *result;
   1.679 +    int32_t length, prevLength;
   1.680 +
   1.681 +    length=UCNV_EXT_FROM_U_GET_LENGTH(value);
   1.682 +    value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
   1.683 +
   1.684 +    /* output the result: short results are stored in value itself, longer ones in the extension bytes array */
   1.685 +    if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
   1.686 +        /*
   1.687 +         * Generate a byte array and then write it below.
   1.688 +         * This is not the fastest possible way, but it should be ok for
   1.689 +         * extension mappings, and it is much simpler.
   1.690 +         * Offset and overflow handling are only done once this way.
   1.691 +         */
   1.692 +        uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
   1.693 +        switch(length) {
   1.694 +        case 3:
   1.695 +            *p++=(uint8_t)(value>>16);
   1.696 +        case 2: /*fall through*/
   1.697 +            *p++=(uint8_t)(value>>8);
   1.698 +        case 1: /*fall through*/
   1.699 +            *p++=(uint8_t)value;
   1.700 +        default:
   1.701 +            break; /* will never occur */
   1.702 +        }
   1.703 +        result=buffer+1;
   1.704 +    } else {
   1.705 +        result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
   1.706 +    }
   1.707 +
   1.708 +    /* with correct data we have length>0 */
   1.709 +
   1.710 +    if((prevLength=cnv->fromUnicodeStatus)!=0) {
   1.711 +        /* handle SI/SO stateful output */
   1.712 +        uint8_t shiftByte;
   1.713 +
   1.714 +        if(prevLength>1 && length==1) {
   1.715 +            /* change from double-byte mode to single-byte */
   1.716 +            shiftByte=(uint8_t)UCNV_SI;
   1.717 +            cnv->fromUnicodeStatus=1;
   1.718 +        } else if(prevLength==1 && length>1) {
   1.719 +            /* change from single-byte mode to double-byte */
   1.720 +            shiftByte=(uint8_t)UCNV_SO;
   1.721 +            cnv->fromUnicodeStatus=2;
   1.722 +        } else {
   1.723 +            shiftByte=0;
   1.724 +        }
   1.725 +
   1.726 +        if(shiftByte!=0) {
   1.727 +            /* prepend the shift byte to the result bytes; copy them into buffer[] first if they are not already there */
   1.728 +            buffer[0]=shiftByte;
   1.729 +            if(result!=buffer+1) {
   1.730 +                uprv_memcpy(buffer+1, result, length);
   1.731 +            }
   1.732 +            result=buffer;
   1.733 +            ++length;
   1.734 +        }
   1.735 +    }
   1.736 +
   1.737 +    ucnv_fromUWriteBytes(cnv, (const char *)result, length,
   1.738 +                         target, targetLimit,
   1.739 +                         offsets, srcIndex,
   1.740 +                         pErrorCode);
   1.741 +}
   1.742 +
   1.743 +/*
   1.744 + * Try to convert cp (plus following input at *src) via the extension data; requires target<targetLimit; set error code for overflow. Returns TRUE if input was consumed (full match written, or partial match saved in cnv), else FALSE.
   1.745 + */
   1.746 +U_CFUNC UBool
   1.747 +ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, /* cx is handed unchanged to the match and write helpers */
   1.748 +                          UChar32 cp, /* initial code point; it accounts for 2 in the match length */
   1.749 +                          const UChar **src, const UChar *srcLimit, /* *src is advanced past consumed code units */
   1.750 +                          char **target, const char *targetLimit,
   1.751 +                          int32_t **offsets, int32_t srcIndex,
   1.752 +                          UBool flush,
   1.753 +                          UErrorCode *pErrorCode) {
   1.754 +    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   1.755 +    int32_t match; /* >=2: full match length; <0: partial match; 1: <subchar1> request; 0: no match */
   1.756 +
   1.757 +    /* try to match cp followed by the remaining source; no previously buffered input (NULL, 0) */
   1.758 +    match=ucnv_extMatchFromU(cx, cp,
   1.759 +                             NULL, 0,
   1.760 +                             *src, (int32_t)(srcLimit-*src),
   1.761 +                             &value,
   1.762 +                             cnv->useFallback, flush);
   1.763 +
   1.764 +    /* reject a match if the result is a single byte for DBCS-only; a rejected match falls through to the final else and returns FALSE */
   1.765 +    if( match>=2 &&
   1.766 +        !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
   1.767 +          cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
   1.768 +    ) {
   1.769 +        /* advance src pointer for the consumed input */
   1.770 +        *src+=match-2; /* remove 2 for the initial code point */
   1.771 +
   1.772 +        /* write result to target; errors are reported through pErrorCode, the return value stays TRUE */
   1.773 +        ucnv_extWriteFromU(cnv, cx,
   1.774 +                           value,
   1.775 +                           target, targetLimit,
   1.776 +                           offsets, srcIndex,
   1.777 +                           pErrorCode);
   1.778 +        return TRUE;
   1.779 +    } else if(match<0) {
   1.780 +        /* save state for partial match so that a later call can continue it */
   1.781 +        const UChar *s;
   1.782 +        int32_t j;
   1.783 +
   1.784 +        /* copy the first code point */
   1.785 +        cnv->preFromUFirstCP=cp;
   1.786 +
   1.787 +        /* now copy the newly consumed input */
   1.788 +        s=*src;
   1.789 +        match=-match-2; /* remove 2 for the initial code point; match is now the number of code units to buffer */
   1.790 +        for(j=0; j<match; ++j) {
   1.791 +            cnv->preFromU[j]=*s++;
   1.792 +        }
   1.793 +        *src=s; /* same as *src=srcLimit; because we reached the end of input */
   1.794 +        cnv->preFromULength=(int8_t)match;
   1.795 +        return TRUE;
   1.796 +    } else if(match==1) {
   1.797 +        /* matched, no mapping but request for <subchar1> */
   1.798 +        cnv->useSubChar1=TRUE;
   1.799 +        return FALSE;
   1.800 +    } else /* match==0 no match, or a match>=2 that was rejected above */ {
   1.801 +        return FALSE;
   1.802 +    }
   1.803 +}
   1.804 +
   1.805 +/*
   1.806 + * Used by ISO 2022 implementation. Matches the single code point cp only (no following input) and stores the result data bits in *pValue; *pValue is not written when 0 is returned.
   1.807 + * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping
   1.808 + */
   1.809 +U_CFUNC int32_t
   1.810 +ucnv_extSimpleMatchFromU(const int32_t *cx,
   1.811 +                         UChar32 cp, uint32_t *pValue,
   1.812 +                         UBool useFallback) {
   1.813 +    uint32_t value; /* NOTE(review): not initialized here, unlike the sibling functions that set it to 0 to silence gcc; only read when match>=2 */
   1.814 +    int32_t match;
   1.815 +
   1.816 +    /* try to match: no buffered input and no following source (both NULL, 0), with the last argument (flush) TRUE */
   1.817 +    match=ucnv_extMatchFromU(cx,
   1.818 +                             cp,
   1.819 +                             NULL, 0,
   1.820 +                             NULL, 0,
   1.821 +                             &value,
   1.822 +                             useFallback, TRUE);
   1.823 +    if(match>=2) {
   1.824 +        /* write result for simple, single-character conversion */
   1.825 +        int32_t length;
   1.826 +        int isRoundtrip;
   1.827 +
   1.828 +        /* unpack the flag and length first: value is overwritten with its data bits on the third line */
   1.829 +        isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value);
   1.829 +        length=UCNV_EXT_FROM_U_GET_LENGTH(value);
   1.830 +        value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);
   1.831 +
   1.832 +        if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { /* presumably the lengths whose bytes are stored directly in value - confirm in ucnv_ext.h */
   1.833 +            *pValue=value;
   1.834 +            return isRoundtrip ? length : -length; /* the sign encodes roundtrip (positive) vs. fallback (negative) */
   1.835 +#if 0 /* not currently used */
   1.836 +        } else if(length==4) {
   1.837 +            /* de-serialize a 4-byte result */
   1.838 +            const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
   1.839 +            *pValue=
   1.840 +                ((uint32_t)result[0]<<24)|
   1.841 +                ((uint32_t)result[1]<<16)|
   1.842 +                ((uint32_t)result[2]<<8)|
   1.843 +                result[3];
   1.844 +            return isRoundtrip ? 4 : -4;
   1.845 +#endif
   1.846 +        }
   1.847 +    }
   1.848 +
   1.849 +    /*
   1.850 +     * return no match because
   1.851 +     * - match>1 && resultLength>4: result too long for simple conversion
   1.852 +     * - match==1: no match found, <subchar1> preferred
   1.853 +     * - match==0: no match found in the first place
   1.854 +     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
   1.855 +     */
   1.856 +    return 0;
   1.857 +}
   1.858 +
   1.859 +/*
   1.860 + * continue partial match with new input, requires cnv->preFromUFirstCP>=0
   1.861 + * never called for simple, single-character conversion; outcome is reported via cnv state, pArgs->source/target and *pErrorCode (no return value)
   1.862 + */
   1.863 +U_CFUNC void
   1.864 +ucnv_extContinueMatchFromU(UConverter *cnv,
   1.865 +                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
   1.866 +                           UErrorCode *pErrorCode) {
   1.867 +    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
   1.868 +    int32_t match; /* >=2: full match length; <0: still partial; 0 or 1: no match */
   1.869 +
   1.870 +    /* match the saved first code point, then the buffered preFromU[] units, then the new source input */
   1.870 +    match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
   1.871 +                             cnv->preFromUFirstCP,
   1.872 +                             cnv->preFromU, cnv->preFromULength,
   1.873 +                             pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
   1.874 +                             &value,
   1.875 +                             cnv->useFallback, pArgs->flush);
   1.876 +    if(match>=2) {
   1.877 +        match-=2; /* remove 2 for the initial code point */
   1.878 +
   1.879 +        if(match>=cnv->preFromULength) {
   1.880 +            /* advance src pointer for the consumed input: only the part beyond the buffered units comes from pArgs->source */
   1.881 +            pArgs->source+=match-cnv->preFromULength;
   1.882 +            cnv->preFromULength=0;
   1.883 +        } else {
   1.884 +            /* the match did not use all of preFromU[] - keep the rest for replay */
   1.885 +            int32_t length=cnv->preFromULength-match;
   1.886 +            uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR);
   1.887 +            cnv->preFromULength=(int8_t)-length; /* a negative length marks preFromU[] for replay */
   1.888 +        }
   1.889 +
   1.890 +        /* finish the partial match */
   1.891 +        cnv->preFromUFirstCP=U_SENTINEL;
   1.892 +
   1.893 +        /* write result; errors are reported through pErrorCode */
   1.894 +        ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
   1.895 +                           value,
   1.896 +                           &pArgs->target, pArgs->targetLimit,
   1.897 +                           &pArgs->offsets, srcIndex,
   1.898 +                           pErrorCode);
   1.899 +    } else if(match<0) {
   1.900 +        /* save state for partial match */
   1.901 +        const UChar *s;
   1.902 +        int32_t j;
   1.903 +
   1.904 +        /* just _append_ the newly consumed input to preFromU[] */
   1.905 +        s=pArgs->source;
   1.906 +        match=-match-2; /* remove 2 for the initial code point; match is now the total buffered length */
   1.907 +        for(j=cnv->preFromULength; j<match; ++j) {
   1.908 +            U_ASSERT(j>=0); /* preFromULength can be negative when marked for replay; it must not be here */
   1.909 +            cnv->preFromU[j]=*s++;
   1.910 +        }
   1.911 +        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
   1.912 +        cnv->preFromULength=(int8_t)match;
   1.913 +    } else /* match==0 or 1 */ {
   1.914 +        /*
   1.915 +         * no match
   1.916 +         *
   1.917 +         * We need to split the previous input into two parts:
   1.918 +         *
   1.919 +         * 1. The first code point is unmappable - that's how we got into
   1.920 +         *    trying the extension data in the first place.
   1.921 +         *    We need to move it from the preFromU buffer
   1.922 +         *    to the error buffer, set an error code,
   1.923 +         *    and prepare the rest of the previous input for 2.
   1.924 +         *
   1.925 +         * 2. The rest of the previous input must be converted once we
   1.926 +         *    come back from the callback for the first code point.
   1.927 +         *    At that time, we have to try again from scratch to convert
   1.928 +         *    these input characters.
   1.929 +         *    The replay will be handled by the ucnv.c conversion code.
   1.930 +         */
   1.931 +
   1.932 +        if(match==1) {
   1.933 +            /* matched, no mapping but request for <subchar1> */
   1.934 +            cnv->useSubChar1=TRUE;
   1.935 +        }
   1.936 +
   1.937 +        /* move the first code point to the error field */
   1.938 +        cnv->fromUChar32=cnv->preFromUFirstCP;
   1.939 +        cnv->preFromUFirstCP=U_SENTINEL;
   1.940 +
   1.941 +        /* mark preFromU for replay by negating its length; the buffered units themselves are left in place */
   1.942 +        cnv->preFromULength=-cnv->preFromULength;
   1.943 +
   1.944 +        /* set the error code for unassigned */
   1.945 +        *pErrorCode=U_INVALID_CHAR_FOUND;
   1.946 +    }
   1.947 +}
   1.948 +
   1.949 +static UBool /* TRUE if the from-Unicode result word "value" qualifies for the requested set */
   1.950 +extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) { // minLength: smallest output length (in bytes) that is accepted
   1.951 +    if(which==UCNV_ROUNDTRIP_SET) {
   1.952 +        // Add only code points for which the roundtrip flag is set.
   1.953 +        // Do not add any fallbacks, even if ucnv_fromUnicode() would use them
   1.954 +        // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet().
   1.955 +        //
   1.956 +        // By analogy, also do not add "good one-way" mappings.
   1.957 +        //
   1.958 +        // Do not add entries with reserved bits set.
   1.959 +        // The single masked comparison below checks both: roundtrip flag set AND all reserved bits clear.
   1.959 +        if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!=
   1.960 +                UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) {
   1.961 +            return FALSE;
   1.962 +        }
   1.963 +    } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
   1.964 +        // Do not add entries with reserved bits set. The roundtrip flag is not examined here.
   1.965 +        if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) {
   1.966 +            return FALSE;
   1.967 +        }
   1.968 +    }
   1.969 +    // Do not add <subchar1> entries or other (future?) pseudo-entries
   1.970 +    // with an output length of 0. Callers in this file pass minLength>=1, so length 0 never qualifies.
   1.971 +    return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength;
   1.972 +}
   1.973 +
   1.974 +static void /* recursively adds the strings (or the single code point) reachable from one from-Unicode section to the set behind sa */
   1.975 +ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, /* only forwarded to the recursive call */
   1.976 +                            const int32_t *cx,
   1.977 +                            const USetAdder *sa, /* provides the add/addString callbacks and the target set */
   1.978 +                            UConverterUnicodeSet which,
   1.979 +                            int32_t minLength,
   1.980 +                            UChar32 firstCP, /* code point that started this string */
   1.981 +                            UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, /* s[0..length-1] is the string so far; s[length] is overwritten below */
   1.982 +                            int32_t sectionIndex, /* offset of this section in both the UChars and the values arrays */
   1.983 +                            UErrorCode *pErrorCode) { /* only forwarded to the recursive call */
   1.984 +    const UChar *fromUSectionUChars;
   1.985 +    const uint32_t *fromUSectionValues;
   1.986 +
   1.987 +    uint32_t value;
   1.988 +    int32_t i, count;
   1.989 +
   1.990 +    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
   1.991 +    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
   1.992 +
   1.993 +    /* read first pair of the section: the entry count and the value for the string so far */
   1.994 +    count=*fromUSectionUChars++;
   1.995 +    value=*fromUSectionValues++;
   1.996 +
   1.997 +    if(extSetUseMapping(which, minLength, value)) {
   1.998 +        if(length==U16_LENGTH(firstCP)) { /* s holds nothing but firstCP, so add it as a code point rather than a string */
   1.999 +            /* add the initial code point */
  1.1000 +            sa->add(sa->set, firstCP);
  1.1001 +        } else {
  1.1002 +            /* add the string so far */
  1.1003 +            sa->addString(sa->set, s, length);
  1.1004 +        }
  1.1005 +    }
  1.1006 +
  1.1007 +    for(i=0; i<count; ++i) {
  1.1008 +        /* append this code unit and recurse or add the string */
  1.1009 +        s[length]=fromUSectionUChars[i]; /* each iteration overwrites the same slot, so siblings share the prefix */
  1.1010 +        value=fromUSectionValues[i];
  1.1011 +
  1.1012 +        if(value==0) {
  1.1013 +            /* no mapping, do nothing */
  1.1014 +        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
  1.1015 +            ucnv_extGetUnicodeSetString( /* longer strings continue in another section: descend with the string extended by one unit */
  1.1016 +                sharedData, cx, sa, which, minLength,
  1.1017 +                firstCP, s, length+1,
  1.1018 +                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
  1.1019 +                pErrorCode);
  1.1020 +        } else if(extSetUseMapping(which, minLength, value)) {
  1.1021 +            sa->addString(sa->set, s, length+1);
  1.1022 +        }
  1.1023 +    }
  1.1024 +}
  1.1025 +
  1.1026 +U_CFUNC void /* adds every code point and string with a qualifying extension from-Unicode mapping to the set behind sa */
  1.1027 +ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
  1.1028 +                      const USetAdder *sa, /* provides the add/addString callbacks and the target set */
  1.1029 +                      UConverterUnicodeSet which, /* roundtrip-only or roundtrip-and-fallback; evaluated in extSetUseMapping() */
  1.1030 +                      UConverterSetFilter filter, /* optional restriction on result length and byte ranges, see the switch below */
  1.1031 +                      UErrorCode *pErrorCode) { /* only forwarded to ucnv_extGetUnicodeSetString() */
  1.1032 +    const int32_t *cx;
  1.1033 +    const uint16_t *stage12, *stage3, *ps2, *ps3;
  1.1034 +    const uint32_t *stage3b;
  1.1035 +
  1.1036 +    uint32_t value;
  1.1037 +    int32_t st1, stage1Length, st2, st3, minLength;
  1.1038 +
  1.1039 +    UChar s[UCNV_EXT_MAX_UCHARS]; /* scratch buffer for building multi-code-unit strings */
  1.1040 +    UChar32 c;
  1.1041 +    int32_t length;
  1.1042 +
  1.1043 +    cx=sharedData->mbcs.extIndexes;
  1.1044 +    if(cx==NULL) {
  1.1045 +        return; /* no extension data: nothing to add */
  1.1046 +    }
  1.1047 +
  1.1048 +    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
  1.1049 +    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
  1.1050 +    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
  1.1051 +
  1.1052 +    stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
  1.1053 +
  1.1054 +    /* enumerate the from-Unicode trie table */
  1.1055 +    c=0; /* keep track of the current code point while enumerating */
  1.1056 +
  1.1057 +    /* choose the minimum result length that extSetUseMapping() will accept */
  1.1057 +    if(filter==UCNV_SET_FILTER_2022_CN) {
  1.1058 +        minLength=3;
  1.1059 +    } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
  1.1060 +               filter!=UCNV_SET_FILTER_NONE
  1.1061 +    ) {
  1.1062 +        /* DBCS-only, ignore single-byte results; every remaining filter also requires at least 2 bytes */
  1.1063 +        minLength=2;
  1.1064 +    } else {
  1.1065 +        minLength=1;
  1.1066 +    }
  1.1067 +
  1.1068 +    /*
  1.1069 +     * the trie enumeration is almost the same as
  1.1070 +     * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
  1.1071 +     */
  1.1072 +    for(st1=0; st1<stage1Length; ++st1) {
  1.1073 +        st2=stage12[st1];
  1.1074 +        if(st2>stage1Length) { /* otherwise the stage 2 block is treated as empty, see the else branch */
  1.1075 +            ps2=stage12+st2;
  1.1076 +            for(st2=0; st2<64; ++st2) { /* st2 is reused as the index within the 64-entry stage 2 block */
  1.1077 +                if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
  1.1078 +                    /* read the stage 3 block */
  1.1079 +                    ps3=stage3+st3;
  1.1080 +
  1.1081 +                    do { /* one iteration per code point; the loop ends when c reaches the next multiple of 16 */
  1.1082 +                        value=stage3b[*ps3++];
  1.1083 +                        if(value==0) {
  1.1084 +                            /* no mapping, do nothing */
  1.1085 +                        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
  1.1086 +                            // Recurse for partial results.
  1.1087 +                            length=0;
  1.1088 +                            U16_APPEND_UNSAFE(s, length, c);
  1.1089 +                            ucnv_extGetUnicodeSetString(
  1.1090 +                                sharedData, cx, sa, which, minLength,
  1.1091 +                                c, s, length,
  1.1092 +                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
  1.1093 +                                pErrorCode);
  1.1094 +                        } else if(extSetUseMapping(which, minLength, value)) {
  1.1095 +                            switch(filter) { /* "continue" below skips sa->add() and jumps to the do-while condition, which still increments c */
  1.1096 +                            case UCNV_SET_FILTER_2022_CN:
  1.1097 +                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
  1.1098 +                                    continue;
  1.1099 +                                }
  1.1100 +                                break;
  1.1101 +                            case UCNV_SET_FILTER_SJIS: /* 2-byte results in 0x8140..0xeffc; value is overwritten with its data bits */
  1.1102 +                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
  1.1103 +                                    continue;
  1.1104 +                                }
  1.1105 +                                break;
  1.1106 +                            case UCNV_SET_FILTER_GR94DBCS: /* unsigned-subtraction range checks: 16-bit value in 0xa1a1..0xfefe and low byte in 0xa1..0xfe */
  1.1107 +                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
  1.1108 +                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
  1.1109 +                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
  1.1110 +                                    continue;
  1.1111 +                                }
  1.1112 +                                break;
  1.1113 +                            case UCNV_SET_FILTER_HZ: /* same checks as GR94DBCS, but the 16-bit upper bound is 0xfdfe */
  1.1114 +                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
  1.1115 +                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
  1.1116 +                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
  1.1117 +                                    continue;
  1.1118 +                                }
  1.1119 +                                break;
  1.1120 +                            default:
  1.1121 +                                /*
  1.1122 +                                 * UCNV_SET_FILTER_NONE,
  1.1123 +                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
  1.1124 +                                 */
  1.1125 +                                break;
  1.1126 +                            }
  1.1127 +                            sa->add(sa->set, c);
  1.1128 +                        }
  1.1129 +                    } while((++c&0xf)!=0);
  1.1130 +                } else {
  1.1131 +                    c+=16; /* empty stage 3 block */
  1.1132 +                }
  1.1133 +            }
  1.1134 +        } else {
  1.1135 +            c+=1024; /* empty stage 2 block: 64 stage 3 blocks of 16 code points each */
  1.1136 +        }
  1.1137 +    }
  1.1138 +}
  1.1139 +
  1.1140 +#endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION */

mercurial